--- /dev/null
+# Generated with cmake-format 0.5.4
+# --------------------------
+# General Formatting Options
+# --------------------------
+# How wide to allow formatted cmake files
+line_width = 80
+
+# How many spaces to tab for indent
+tab_size = 2
+
+# If arglists are longer than this, break them always
+max_subargs_per_line = 10
+
+# If true, separate flow control names from their parentheses with a space
+separate_ctrl_name_with_space = False
+
+# If true, separate function names from parentheses with a space
+separate_fn_name_with_space = False
+
+# If a statement is wrapped to more than one line, then dangle the closing
+# parenthesis on its own line
+dangle_parens = False
+
+# If the statement spelling length (including space and parenthesis) is larger
+# than the tab width by more than this amount, then force reject un-nested
+# layouts.
+max_prefix_chars = 2
+
+# If a candidate layout is wrapped horizontally but it exceeds this many lines,
+# then reject the layout.
+max_lines_hwrap = 2
+
+# What style line endings to use in the output.
+line_ending = 'unix'
+
+# Format command names consistently as 'lower' or 'upper' case
+command_case = 'lower'
+
+# Format keywords consistently as 'lower' or 'upper' case
+keyword_case = 'unchanged'
+
+# Specify structure for custom cmake functions
+additional_commands = {
+ "foo": {
+ "flags": [
+ "BAR",
+ "BAZ"
+ ],
+ "kwargs": {
+ "HEADERS": "*",
+ "SOURCES": "*",
+ "DEPENDS": "*"
+ }
+ }
+}
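+
+# For reference, the "foo" entry above describes a hypothetical custom command
+# that would be invoked in a listfile as, e.g.:
+#
+#   foo(BAR
+#       HEADERS a.h b.h
+#       SOURCES a.cc b.cc
+#       DEPENDS some_target)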
+
+# A list of command names which should always be wrapped
+always_wrap = []
+
+# Specify the order of wrapping algorithms during successive reflow attempts
+algorithm_order = [0, 1, 2, 3, 4]
+
+# If true, the argument lists which are known to be sortable will be sorted
+# lexicographically
+enable_sort = False
+
+# If true, the parsers may infer whether or not an argument list is sortable
+# (without annotation).
+autosort = False
+
+# If a comment line starts with at least this many consecutive hash characters,
+# then don't lstrip() them off. This allows for lazy hash rulers where the first
+# hash char is not separated by space
+hashruler_min_length = 10
+
+# A dictionary containing any per-command configuration overrides. Currently
+# only `command_case` is supported.
+per_command = {}
+
+
+# --------------------------
+# Comment Formatting Options
+# --------------------------
+# What character to use for bulleted lists
+bullet_char = '*'
+
+# What character to use as punctuation after numerals in an enumerated list
+enum_char = '.'
+
+# enable comment markup parsing and reflow
+enable_markup = True
+
+# If comment markup is enabled, don't reflow the first comment block in each
+# listfile. Use this to preserve formatting of your copyright/license
+# statements.
+first_comment_is_literal = True
+
+# If comment markup is enabled, don't reflow any comment block which matches
+# this (regex) pattern. Default is `None` (disabled).
+literal_comment_pattern = None
+
+# Regular expression to match preformat fences in comments
+# default=r'^\s*([`~]{3}[`~]*)(.*)$'
+fence_pattern = '^\\s*([`~]{3}[`~]*)(.*)$'
+
+# Regular expression to match rulers in comments
+# default=r'^\s*[^\w\s]{3}.*[^\w\s]{3}$'
+ruler_pattern = '^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$'
+
+# If true, then insert a space between the first hash char and remaining hash
+# chars in a hash ruler, and normalize its length to fill the column
+canonicalize_hashrulers = True
+
+
+# ---------------------------------
+# Miscellaneous Options
+# ---------------------------------
+# If true, emit the unicode byte-order mark (BOM) at the start of the file
+emit_byteorder_mark = False
+
+# Specify the encoding of the input file. Defaults to utf-8.
+input_encoding = 'utf-8'
+
+# Specify the encoding of the output file. Defaults to utf-8. Note that cmake
+# only claims to support utf-8 so be careful when using anything else
+output_encoding = 'utf-8'
--- /dev/null
+* whitespace=tab-in-indent,space-before-tab,trailing-space
--- /dev/null
+/build
+/third_party
--- /dev/null
+# This is the list of libgav1 authors for copyright purposes.
+#
+# This does not necessarily list everyone who has contributed code, since in
+# some cases, their employer may be the copyright holder. To see the full list
+# of contributors, see the revision history in source control.
+Google LLC
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# libgav1 requires modern CMake.
+cmake_minimum_required(VERSION 3.7.1 FATAL_ERROR)
+
+# libgav1 requires C++11.
+set(CMAKE_CXX_STANDARD 11)
+set(ABSL_CXX_STANDARD 11)
+# libgav1 requires C99.
+set(CMAKE_C_STANDARD 99)
+
+project(libgav1 CXX C)
+
+set(libgav1_root "${CMAKE_CURRENT_SOURCE_DIR}")
+set(libgav1_build "${CMAKE_BINARY_DIR}")
+
+if("${libgav1_root}" STREQUAL "${libgav1_build}")
+ message(
+ FATAL_ERROR
+ "Building from within the libgav1 source tree is not supported.\n"
+ "Hint: Run these commands\n" "$ rm -rf CMakeCache.txt CMakeFiles\n"
+ "$ mkdir -p ../libgav1_build\n" "$ cd ../libgav1_build\n"
+ "And re-run CMake from the libgav1_build directory.")
+endif()
+
+set(libgav1_examples "${libgav1_root}/examples")
+set(libgav1_source "${libgav1_root}/src")
+
+include("${libgav1_root}/cmake/libgav1_options.cmake")
+
+libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING
+ "Enables optimized code." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_AVX2 HELPSTRING "Enables avx2 optimizations."
+ VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations."
+ VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING
+ "Enables sse4.1 optimizations." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_EXAMPLES HELPSTRING "Enables examples." VALUE
+ ON)
+libgav1_option(NAME LIBGAV1_ENABLE_TESTS HELPSTRING "Enables tests." VALUE ON)
+libgav1_option(
+ NAME LIBGAV1_VERBOSE HELPSTRING
+ "Enables verbose build system output. Higher numbers are more verbose." VALUE
+ OFF)
+
+if(NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Enable generators like Xcode and Visual Studio to place projects in folders.
+get_property(use_folders_is_set GLOBAL PROPERTY USE_FOLDERS SET)
+if(NOT use_folders_is_set)
+ set_property(GLOBAL PROPERTY USE_FOLDERS TRUE)
+endif()
+
+include(FindThreads)
+
+include("${libgav1_examples}/libgav1_examples.cmake")
+include("${libgav1_root}/cmake/libgav1_build_definitions.cmake")
+include("${libgav1_root}/cmake/libgav1_cpu_detection.cmake")
+include("${libgav1_root}/cmake/libgav1_flags.cmake")
+include("${libgav1_root}/cmake/libgav1_helpers.cmake")
+include("${libgav1_root}/cmake/libgav1_install.cmake")
+include("${libgav1_root}/cmake/libgav1_intrinsics.cmake")
+include("${libgav1_root}/cmake/libgav1_sanitizer.cmake")
+include("${libgav1_root}/cmake/libgav1_targets.cmake")
+include("${libgav1_root}/cmake/libgav1_variables.cmake")
+include("${libgav1_root}/tests/libgav1_tests.cmake")
+include("${libgav1_source}/dsp/libgav1_dsp.cmake")
+include("${libgav1_source}/libgav1_decoder.cmake")
+include("${libgav1_source}/utils/libgav1_utils.cmake")
+
+libgav1_optimization_detect()
+libgav1_set_build_definitions()
+libgav1_set_cxx_flags()
+libgav1_configure_sanitizer()
+
+# Supported bit depth.
+libgav1_track_configuration_variable(LIBGAV1_MAX_BITDEPTH)
+
+# C++ and linker flags.
+libgav1_track_configuration_variable(LIBGAV1_CXX_FLAGS)
+libgav1_track_configuration_variable(LIBGAV1_EXE_LINKER_FLAGS)
+
+# Sanitizer integration.
+libgav1_track_configuration_variable(LIBGAV1_SANITIZE)
+
+# Generated source file directory.
+libgav1_track_configuration_variable(LIBGAV1_GENERATED_SOURCES_DIRECTORY)
+
+# Controls use of std::mutex and absl::Mutex in ThreadPool.
+libgav1_track_configuration_variable(LIBGAV1_THREADPOOL_USE_STD_MUTEX)
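+# Use Abseil threading when LIBGAV1_THREADPOOL_USE_STD_MUTEX is explicitly set
+# to 0, or when it is left undefined on a platform other than Android and iOS
+# (which default to std::mutex).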
+if((DEFINED
+ LIBGAV1_THREADPOOL_USE_STD_MUTEX
+ AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+ OR NOT (DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX OR ANDROID OR IOS))
+ set(use_absl_threading TRUE)
+endif()
+
+if(LIBGAV1_VERBOSE)
+ libgav1_dump_cmake_flag_variables()
+ libgav1_dump_tracked_configuration_variables()
+ libgav1_dump_options()
+endif()
+
+set(libgav1_abseil_build "${libgav1_build}/abseil")
+set(libgav1_gtest_build "${libgav1_build}/gtest")
+
+# Compiler/linker flags must be lists, but come in from the environment as
+# strings. Break them up:
+if(NOT "${LIBGAV1_CXX_FLAGS}" STREQUAL "")
+ separate_arguments(LIBGAV1_CXX_FLAGS)
+endif()
+if(NOT "${LIBGAV1_EXE_LINKER_FLAGS}" STREQUAL "")
+ separate_arguments(LIBGAV1_EXE_LINKER_FLAGS)
+endif()
+
+# Set test-only flags based on LIBGAV1_CXX_FLAGS.
+libgav1_set_test_flags()
+
+set(libgav1_abseil "${libgav1_root}/third_party/abseil-cpp")
+if(EXISTS "${libgav1_abseil}")
+ set(ABSL_PROPAGATE_CXX_STD ON)
+ add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}"
+ EXCLUDE_FROM_ALL)
+else()
+ if(use_absl_threading OR LIBGAV1_ENABLE_EXAMPLES OR LIBGAV1_ENABLE_TESTS)
+ message(
+ FATAL_ERROR
+ "Abseil not found. This dependency is required by the"
+ " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is"
+ " not defined. To continue, download the Abseil repository to"
+ " third_party/abseil-cpp:\n git \\\n -C ${libgav1_root} \\\n"
+ " clone -b 20220623.0 --depth 1 \\\n"
+ " https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp")
+ endif()
+endif()
+
+libgav1_reset_target_lists()
+libgav1_add_dsp_targets()
+libgav1_add_decoder_targets()
+libgav1_add_examples_targets()
+libgav1_add_tests_targets()
+libgav1_add_utils_targets()
+libgav1_setup_install_target()
+
+if(LIBGAV1_ENABLE_TESTS)
+ # include(CTest) or -DBUILD_TESTING=1 aren't used to avoid enabling abseil
+ # tests.
+ enable_testing()
+endif()
+
+if(LIBGAV1_VERBOSE)
+ libgav1_dump_cmake_flag_variables()
+ libgav1_dump_tracked_configuration_variables()
+ libgav1_dump_options()
+endif()
--- /dev/null
+# How to Contribute
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use a [Gerrit](https://www.gerritcodereview.com) instance hosted at
+https://chromium-review.googlesource.com for this purpose.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
--- /dev/null
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
--- /dev/null
+# libgav1 -- an AV1 decoder
+
+libgav1 is a Main profile (0), High profile (1) & Professional profile (2)
+compliant AV1 decoder. More information on the AV1 video format can be found at
+[aomedia.org](https://aomedia.org).
+
+[TOC]
+
+## Building
+
+### Prerequisites
+
+1. A C++11 compiler. gcc 6+, clang 7+ or Microsoft Visual Studio 2017+ are
+ recommended.
+
+2. [CMake >= 3.7.1](https://cmake.org/download/)
+
+3. [Abseil](https://abseil.io)
+
+ From within the libgav1 directory:
+
+ ```shell
+ $ git clone -b 20220623.0 --depth 1 \
+ https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp
+ ```
+
+ Note: Abseil is required by the examples and tests. libgav1 will depend on
+ it if `LIBGAV1_THREADPOOL_USE_STD_MUTEX` is set to `0` (see below).
+
+4. (Optional) [GoogleTest](https://github.com/google/googletest)
+
+ From within the libgav1 directory:
+
+ ```shell
+ $ git clone -b release-1.12.1 --depth 1 \
+ https://github.com/google/googletest.git third_party/googletest
+ ```
+
+### Compile
+
+```shell
+ $ mkdir build && cd build
+ $ cmake -G "Unix Makefiles" ..
+ $ make
+```
+
+Configuration options:
+
+* `LIBGAV1_MAX_BITDEPTH`: defines the maximum supported bitdepth (8, 10, 12;
+ default: 12).
+* `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS`: define to a non-zero value to disable
+ [symbol reduction](#symbol-reduction) in an optimized build to keep all
+ versions of dsp functions available. Automatically defined in
+ `src/dsp/dsp.h` if unset.
+* `LIBGAV1_ENABLE_AVX2`: define to a non-zero value to enable avx2
+ optimizations. Automatically defined in `src/utils/cpu.h` if unset.
+* `LIBGAV1_ENABLE_NEON`: define to a non-zero value to enable NEON
+ optimizations. Automatically defined in `src/utils/cpu.h` if unset.
+* `LIBGAV1_ENABLE_SSE4_1`: define to a non-zero value to enable sse4.1
+ optimizations. Automatically defined in `src/utils/cpu.h` if unset. Note
+ setting this to 0 will also disable AVX2.
+* `LIBGAV1_ENABLE_LOGGING`: define to 0/1 to control debug logging.
+ Automatically defined in `src/utils/logging.h` if unset.
+* `LIBGAV1_EXAMPLES_ENABLE_LOGGING`: define to 0/1 to control error logging in
+ the examples. Automatically defined in `examples/logging.h` if unset.
+* `LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK`: define to 1 to enable transform
+ coefficient range checks.
+* `LIBGAV1_LOG_LEVEL`: controls the maximum allowed log level, see `enum
+ LogSeverity` in `src/utils/logging.h`. Automatically defined in
+ `src/utils/logging.cc` if unset.
+* `LIBGAV1_THREADPOOL_USE_STD_MUTEX`: controls use of std::mutex and
+ absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil
+ dependency from the core library. Automatically defined in
+ `src/utils/threadpool.h` if unset. Defaults to 1 on Android & iOS, 0
+ otherwise.
+* `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is
+ allowed to create. Has to be an integer > 0. Otherwise this is ignored. The
+ default value is 128.
+* `LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER`: the threshold multiplier that
+ is used to determine when to use frame parallel decoding. Frame parallel
+ decoding will be used if |threads| > |tile_count| * this multiplier. Has to
+ be an integer > 0. The default value is 4. This is an advanced setting
+ intended for testing purposes.
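+
+Options that the CMake build consumes directly (e.g. `LIBGAV1_MAX_BITDEPTH`,
+`LIBGAV1_THREADPOOL_USE_STD_MUTEX`) can be passed at configure time. A sketch,
+with illustrative values:
+
+```shell
+  $ cmake .. -DLIBGAV1_MAX_BITDEPTH=10 -DLIBGAV1_THREADPOOL_USE_STD_MUTEX=1
+```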
+
+For additional options see:
+
+```shell
+ $ cmake .. -LH
+```
+
+## Testing
+
+* `gav1_decode` can be used to decode IVF files, see `gav1_decode --help` for
+ options. Note: tools like [FFmpeg](https://ffmpeg.org) can be used to
+ convert other container formats to IVF.
+
+* Unit tests are built when `LIBGAV1_ENABLE_TESTS` is set to `1`. The binaries
+ can be invoked directly or with
+ [`ctest`](https://cmake.org/cmake/help/latest/manual/ctest.1.html).
+
+ * The test input location can be given by setting the
+ `LIBGAV1_TEST_DATA_PATH` environment variable; it defaults to
+ `<libgav1_src>/tests/data`, where `<libgav1_src>` is `/data/local/tmp`
+ on Android platforms or the source directory configured with cmake
+        otherwise (see the example after this list).
+
+ * Output is written to the value of the `TMPDIR` or `TEMP` environment
+ variables in that order if set, otherwise `/data/local/tmp` on Android
+ platforms, the value of `LIBGAV1_FLAGS_TMPDIR` if defined during
+ compilation or the current directory if not.
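+
+    For example, a test run from the build directory pointing at a custom data
+    directory might look like this (the path is illustrative):
+
+    ```shell
+    $ LIBGAV1_TEST_DATA_PATH=/path/to/test/data ctest
+    ```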
+
+## Development
+
+### Contributing
+
+See [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to submit patches.
+
+### Style
+
+libgav1 follows the
+[Google C++ style guide](https://google.github.io/styleguide/cppguide.html) with
+formatting enforced by `clang-format`.
+
+### Comments
+
+Comments of the form '`// X.Y(.Z).`', '`Section X.Y(.Z).`' or '`... in the
+spec`' reference the relevant section(s) in the
+[AV1 specification](http://aomediacodec.github.io/av1-spec/av1-spec.pdf).
+
+### DSP structure
+
+* `src/dsp/dsp.cc` defines the main entry point: `libgav1::dsp::DspInit()`.
+    This handles cpu-detection and initializes each logical unit, which
+    populates the `libgav1::dsp::Dsp` function tables.
+* `src/dsp/dsp.h` contains function and type definitions for all logical units
+ (e.g., intra-predictors)
+* `src/utils/cpu.h` contains definitions for cpu-detection
+* base implementations are located in `src/dsp/*.{h,cc}` with platform
+ specific optimizations in sub-folders
+* unit tests define `DISABLED_Speed` test(s) to allow timing of individual
+ functions
+
+#### Symbol reduction
+
+Based on the build configuration, unneeded lesser optimizations are removed
+using a hierarchical include and define system. Each logical unit in `src/dsp`
+should include all platform-specific headers in descending order to allow
+higher level optimizations to disable lower level ones. See
+`src/dsp/loop_filter.h` for an example.
+
+Each function receives a new define which can be checked in platform specific
+headers. The format is: `LIBGAV1_<Dsp-table>_FunctionName` or
+`LIBGAV1_<Dsp-table>_[sub-table-index1][...-indexN]`, e.g.,
+`LIBGAV1_Dsp8bpp_AverageBlend`,
+`LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc`. The Dsp-table name is of
+the form `Dsp<bitdepth>bpp`, e.g. `Dsp10bpp` for bitdepth == 10 (bpp stands
+for bits per pixel). The indices correspond to enum values used as lookups
+with the leading 'k' removed. Platform-specific headers should then first
+check whether the symbol is defined and, if not, set the value to the
+corresponding `LIBGAV1_CPU_<arch>` value from `src/utils/cpu.h`.
+
+```
+ #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+ #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+ #endif
+```
+
+Within each module the code should check if the symbol is defined to its
+specific architecture or forced via `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS` before
+defining the function. The `DSP_ENABLED_(8|10)BPP_*` macros are available to
+simplify this check for optimized code.
+
+```
+ #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+ ...
+
+ // In unoptimized code use the following structure; there's no equivalent
+ // define for LIBGAV1_CPU_C as it would require duplicating the function
+ // defines used in optimized code for only a small benefit to this
+ // boilerplate.
+ #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ ...
+ #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
+ ...
+```
+
+## Bugs
+
+Please report all bugs to the issue tracker:
+https://issuetracker.google.com/issues/new?component=750480&template=1355007
+
+## Discussion
+
+Email: gav1-devel@googlegroups.com
+
+Web: https://groups.google.com/forum/#!forum/gav1-devel
--- /dev/null
+set(LIBGAV1_INCLUDE_DIRS "@LIBGAV1_INCLUDE_DIRS@")
+set(LIBGAV1_LIBRARIES "gav1")
--- /dev/null
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: @PROJECT_NAME@
+Description: AV1 decoder library (@LIBGAV1_MAX_BITDEPTH@-bit).
+Version: @LIBGAV1_VERSION@
+Cflags: -I${includedir}
+Libs: -L${libdir} -lgav1
+Libs.private: @CMAKE_THREAD_LIBS_INIT@
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_ 1)
+
+macro(libgav1_set_build_definitions)
+ string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase)
+
+ libgav1_load_version_info()
+
+ # Library version info. See the libtool docs for updating the values:
+ # https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info
+ #
+ # c=<current>, r=<revision>, a=<age>
+ #
+ # libtool generates a .so file as .so.[c-a].a.r, while -version-info c:r:a is
+ # passed to libtool.
+ #
+ # We set LIBGAV1_SOVERSION = [c-a].a.r
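+  # For the values below (c=1, r=0, a=0) this yields LIBGAV1_SOVERSION 1.0.0.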
+ set(LT_CURRENT 1)
+ set(LT_REVISION 0)
+ set(LT_AGE 0)
+ math(EXPR LIBGAV1_SOVERSION_MAJOR "${LT_CURRENT} - ${LT_AGE}")
+ set(LIBGAV1_SOVERSION "${LIBGAV1_SOVERSION_MAJOR}.${LT_AGE}.${LT_REVISION}")
+ unset(LT_CURRENT)
+ unset(LT_REVISION)
+ unset(LT_AGE)
+
+ list(APPEND libgav1_include_paths "${libgav1_root}" "${libgav1_root}/src"
+ "${libgav1_build}" "${libgav1_root}/third_party/abseil-cpp")
+ list(APPEND libgav1_gtest_include_paths
+ "third_party/googletest/googlemock/include"
+ "third_party/googletest/googletest/include"
+ "third_party/googletest/googletest")
+ list(APPEND libgav1_test_include_paths ${libgav1_include_paths}
+ ${libgav1_gtest_include_paths})
+ list(APPEND libgav1_defines "LIBGAV1_CMAKE=1"
+ "LIBGAV1_FLAGS_SRCDIR=\"${libgav1_root}\""
+ "LIBGAV1_FLAGS_TMPDIR=\"/tmp\"")
+
+ if(MSVC OR WIN32)
+ list(APPEND libgav1_defines "_CRT_SECURE_NO_WARNINGS" "NOMINMAX"
+ "_SCL_SECURE_NO_WARNINGS")
+ endif()
+
+ if(ANDROID)
+ if(CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a")
+ set(CMAKE_ANDROID_ARM_MODE ON)
+ endif()
+
+ if(build_type_lowercase MATCHES "rel")
+ list(APPEND libgav1_base_cxx_flags "-fno-stack-protector")
+ endif()
+ endif()
+
+ list(APPEND libgav1_base_cxx_flags "-Wall" "-Wextra" "-Wmissing-declarations"
+ "-Wno-sign-compare" "-fvisibility=hidden"
+ "-fvisibility-inlines-hidden")
+
+ if(BUILD_SHARED_LIBS)
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+ set(libgav1_dependency libgav1_shared)
+ else()
+ set(libgav1_dependency libgav1_static)
+ endif()
+
+ list(APPEND libgav1_clang_cxx_flags "-Wextra-semi" "-Wmissing-prototypes"
+ "-Wshorten-64-to-32")
+
+ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6")
+ # Quiet warnings in copy-list-initialization where {} elision has always
+ # been allowed.
+ list(APPEND libgav1_clang_cxx_flags "-Wno-missing-braces")
+ endif()
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8)
+ list(APPEND libgav1_clang_cxx_flags "-Wextra-semi-stmt")
+ endif()
+ endif()
+
+ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "7")
+ # Quiet warnings due to potential snprintf() truncation in threadpool.cc.
+ list(APPEND libgav1_base_cxx_flags "-Wno-format-truncation")
+
+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7")
+ # Quiet gcc 6 vs 7 abi warnings:
+ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
+ list(APPEND libgav1_base_cxx_flags "-Wno-psabi")
+ list(APPEND ABSL_GCC_FLAGS "-Wno-psabi")
+ endif()
+ endif()
+ endif()
+
+ if(build_type_lowercase MATCHES "rel")
+ list(APPEND libgav1_base_cxx_flags "-Wframe-larger-than=196608")
+ endif()
+
+ list(APPEND libgav1_msvc_cxx_flags
+ # Warning level 3.
+ "/W3"
+ # Disable warning C4018:
+ # '<comparison operator>' signed/unsigned mismatch
+ "/wd4018"
+ # Disable warning C4244:
+ # 'argument': conversion from '<double/int>' to
+ # '<float/smaller int type>', possible loss of data
+ "/wd4244"
+              # Disable warning C4267:
+              # 'var': conversion from 'size_t' to
+              # '<smaller type>', possible loss of data
+ "/wd4267"
+ # Disable warning C4309:
+ # 'argument': truncation of constant value
+ "/wd4309"
+ # Disable warning C4551:
+ # function call missing argument list
+ "/wd4551")
+
+ if(BUILD_SHARED_LIBS)
+ list(APPEND libgav1_msvc_cxx_flags
+ # Disable warning C4251:
+ # 'libgav1::DecoderImpl class member' needs to have
+ # dll-interface to be used by clients of class
+ # 'libgav1::Decoder'.
+ "/wd4251")
+ endif()
+
+ if(NOT LIBGAV1_MAX_BITDEPTH)
+ set(LIBGAV1_MAX_BITDEPTH 12)
+ elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8
+ AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10
+ AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 12)
+ libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12.")
+ endif()
+
+ list(APPEND libgav1_defines "LIBGAV1_MAX_BITDEPTH=${LIBGAV1_MAX_BITDEPTH}")
+
+ if(DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+ if(NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 0
+ AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 1)
+ libgav1_die("LIBGAV1_THREADPOOL_USE_STD_MUTEX must be 0 or 1.")
+ endif()
+
+ list(APPEND libgav1_defines
+ "LIBGAV1_THREADPOOL_USE_STD_MUTEX=${LIBGAV1_THREADPOOL_USE_STD_MUTEX}")
+ endif()
+
+ # Source file names ending in these suffixes will have the appropriate
+ # compiler flags added to their compile commands to enable intrinsics.
+ set(libgav1_avx2_source_file_suffix "avx2(_test)?.cc")
+ set(libgav1_neon_source_file_suffix "neon(_test)?.cc")
+ set(libgav1_sse4_source_file_suffix "sse4(_test)?.cc")
+endmacro()
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_ 1)
+
+# Detect optimizations available for the current target CPU.
+macro(libgav1_optimization_detect)
+ if(LIBGAV1_ENABLE_OPTIMIZATIONS)
+ string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase)
+ if(cpu_lowercase MATCHES "^arm|^aarch64")
+ set(libgav1_have_neon ON)
+ elseif(cpu_lowercase MATCHES "^x86|amd64")
+ set(libgav1_have_avx2 ON)
+ set(libgav1_have_sse4 ON)
+ endif()
+ endif()
+
+ if(libgav1_have_avx2 AND LIBGAV1_ENABLE_AVX2)
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=1")
+ else()
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=0")
+ set(libgav1_have_avx2 OFF)
+ endif()
+
+ if(libgav1_have_neon AND LIBGAV1_ENABLE_NEON)
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=1")
+ else()
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=0")
+    set(libgav1_have_neon OFF)
+ endif()
+
+ if(libgav1_have_sse4 AND LIBGAV1_ENABLE_SSE4_1)
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=1")
+ else()
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=0")
+ set(libgav1_have_sse4 OFF)
+ endif()
+endmacro()
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_ 1)
+
+include(CheckCXXCompilerFlag)
+include(CheckCXXSourceCompiles)
+
+# Adds compiler flags specified by FLAGS to the sources specified by SOURCES:
+#
+# libgav1_set_compiler_flags_for_sources(SOURCES <sources> FLAGS <flags>)
+macro(libgav1_set_compiler_flags_for_sources)
+ unset(compiler_SOURCES)
+ unset(compiler_FLAGS)
+ unset(optional_args)
+ unset(single_value_args)
+ set(multi_value_args SOURCES FLAGS)
+ cmake_parse_arguments(compiler "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT (compiler_SOURCES AND compiler_FLAGS))
+ libgav1_die("libgav1_set_compiler_flags_for_sources: SOURCES and "
+ "FLAGS required.")
+ endif()
+
+ set_source_files_properties(${compiler_SOURCES} PROPERTIES COMPILE_FLAGS
+ ${compiler_FLAGS})
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ foreach(source ${compiler_SOURCES})
+ foreach(flag ${compiler_FLAGS})
+ message("libgav1_set_compiler_flags_for_sources: source:${source} "
+ "flag:${flag}")
+ endforeach()
+ endforeach()
+ endif()
+endmacro()
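+
+# A hypothetical usage sketch (file and flag names are illustrative):
+#
+#   libgav1_set_compiler_flags_for_sources(SOURCES foo_sse4.cc bar_sse4.cc
+#                                          FLAGS -msse4.1)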
+
+# Tests compiler flags stored in list(s) specified by FLAG_LIST_VAR_NAMES, adds
+# flags to $LIBGAV1_CXX_FLAGS when tests pass. Terminates configuration if
+# FLAG_REQUIRED is specified and any flag check fails.
+#
+# ~~~
+# libgav1_test_cxx_flag(<FLAG_LIST_VAR_NAMES <flag list variable(s)>>
+# [FLAG_REQUIRED])
+# ~~~
+macro(libgav1_test_cxx_flag)
+ unset(cxx_test_FLAG_LIST_VAR_NAMES)
+ unset(cxx_test_FLAG_REQUIRED)
+ unset(single_value_args)
+ set(optional_args FLAG_REQUIRED)
+ set(multi_value_args FLAG_LIST_VAR_NAMES)
+ cmake_parse_arguments(cxx_test "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT cxx_test_FLAG_LIST_VAR_NAMES)
+ libgav1_die("libgav1_test_cxx_flag: FLAG_LIST_VAR_NAMES required")
+ endif()
+
+ unset(cxx_flags)
+ foreach(list_var ${cxx_test_FLAG_LIST_VAR_NAMES})
+ if(LIBGAV1_VERBOSE)
+ message("libgav1_test_cxx_flag: adding ${list_var} to cxx_flags")
+ endif()
+ list(APPEND cxx_flags ${${list_var}})
+ endforeach()
+
+ if(LIBGAV1_VERBOSE)
+ message("CXX test: all flags: ${cxx_flags}")
+ endif()
+
+ unset(all_cxx_flags)
+ list(APPEND all_cxx_flags ${LIBGAV1_CXX_FLAGS} ${cxx_flags})
+
+ # Turn off output from check_cxx_source_compiles. Print status directly
+ # instead since the logging messages from check_cxx_source_compiles can be
+ # quite confusing.
+ set(CMAKE_REQUIRED_QUIET TRUE)
+
+ # Run the actual compile test.
+ unset(libgav1_all_cxx_flags_pass CACHE)
+ message("--- Running combined CXX flags test, flags: ${all_cxx_flags}")
+ check_cxx_compiler_flag("${all_cxx_flags}" libgav1_all_cxx_flags_pass)
+
+ if(cxx_test_FLAG_REQUIRED AND NOT libgav1_all_cxx_flags_pass)
+ libgav1_die("Flag test failed for required flag(s): "
+ "${all_cxx_flags} and FLAG_REQUIRED specified.")
+ endif()
+
+ if(libgav1_all_cxx_flags_pass)
+ # Test passed: update the global flag list used by the libgav1 target
+ # creation wrappers.
+ set(LIBGAV1_CXX_FLAGS ${cxx_flags})
+ list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS)
+
+ if(LIBGAV1_VERBOSE)
+ message("LIBGAV1_CXX_FLAGS=${LIBGAV1_CXX_FLAGS}")
+ endif()
+
+ message("--- Passed combined CXX flags test")
+ else()
+ message("--- Failed combined CXX flags test, testing flags individually.")
+
+ if(cxx_flags)
+ message("--- Testing flags from $cxx_flags: " "${cxx_flags}")
+ foreach(cxx_flag ${cxx_flags})
+ # Between 3.17.0 and 3.18.2 check_cxx_compiler_flag() sets a normal
+ # variable at parent scope while check_cxx_source_compiles() continues
+ # to set an internal cache variable, so we unset both to avoid the
+ # failure / success state persisting between checks. See
+ # https://gitlab.kitware.com/cmake/cmake/-/issues/21207.
+ unset(cxx_flag_test_passed)
+ unset(cxx_flag_test_passed CACHE)
+ message("--- Testing flag: ${cxx_flag}")
+ check_cxx_compiler_flag("${cxx_flag}" cxx_flag_test_passed)
+
+ if(cxx_flag_test_passed)
+ message("--- Passed test for ${cxx_flag}")
+ else()
+ list(REMOVE_ITEM cxx_flags ${cxx_flag})
+ message("--- Failed test for ${cxx_flag}, flag removed.")
+ endif()
+ endforeach()
+
+ set(LIBGAV1_CXX_FLAGS ${cxx_flags})
+ endif()
+ endif()
+
+ if(LIBGAV1_CXX_FLAGS)
+ list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS)
+ endif()
+endmacro()
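+
+# A hypothetical usage sketch: test the flags collected in two list variables
+# and terminate configuration if the combined check fails:
+#
+#   libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES list_one list_two FLAG_REQUIRED)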
+
+# Tests executable linker flags stored in list specified by FLAG_LIST_VAR_NAME,
+# adds flags to $LIBGAV1_EXE_LINKER_FLAGS when test passes. Terminates
+# configuration when flag check fails. libgav1_set_cxx_flags() must be called
+# before calling this macro because it assumes $LIBGAV1_CXX_FLAGS contains only
+# valid CXX flags.
+#
+# libgav1_test_exe_linker_flag(<FLAG_LIST_VAR_NAME <flag list variable)>)
+macro(libgav1_test_exe_linker_flag)
+ unset(link_FLAG_LIST_VAR_NAME)
+ unset(optional_args)
+ unset(multi_value_args)
+ set(single_value_args FLAG_LIST_VAR_NAME)
+ cmake_parse_arguments(link "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT link_FLAG_LIST_VAR_NAME)
+    libgav1_die("libgav1_test_exe_linker_flag: FLAG_LIST_VAR_NAME required")
+ endif()
+
+ libgav1_set_and_stringify(DEST linker_flags SOURCE_VARS
+ ${link_FLAG_LIST_VAR_NAME})
+
+ if(LIBGAV1_VERBOSE)
+ message("EXE LINKER test: all flags: ${linker_flags}")
+ endif()
+
+ # Tests of $LIBGAV1_CXX_FLAGS have already passed. Include them with the
+ # linker test.
+ libgav1_set_and_stringify(DEST CMAKE_REQUIRED_FLAGS SOURCE_VARS
+ LIBGAV1_CXX_FLAGS)
+
+ # Cache the global exe linker flags.
+ if(CMAKE_EXE_LINKER_FLAGS)
+ set(cached_CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS})
+ libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE
+ ${linker_flags})
+ endif()
+
+ libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE ${linker_flags}
+ ${CMAKE_EXE_LINKER_FLAGS})
+
+ # Turn off output from check_cxx_source_compiles. Print status directly
+ # instead since the logging messages from check_cxx_source_compiles can be
+ # quite confusing.
+ set(CMAKE_REQUIRED_QUIET TRUE)
+
+ message("--- Running EXE LINKER test for flags: ${linker_flags}")
+
+ unset(linker_flag_test_passed CACHE)
+ set(libgav1_cxx_main "\nint main() { return 0; }")
+ check_cxx_source_compiles("${libgav1_cxx_main}" linker_flag_test_passed)
+
+ if(NOT linker_flag_test_passed)
+ libgav1_die("EXE LINKER test failed.")
+ endif()
+
+ message("--- Passed EXE LINKER flag test.")
+
+ # Restore cached global exe linker flags.
+ if(cached_CMAKE_EXE_LINKER_FLAGS)
+ set(CMAKE_EXE_LINKER_FLAGS ${cached_CMAKE_EXE_LINKER_FLAGS})
+ else()
+ unset(CMAKE_EXE_LINKER_FLAGS)
+ endif()
+endmacro()
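+
+# A hypothetical usage sketch (the flag is illustrative):
+#
+#   list(APPEND my_linker_flags "-Wl,--as-needed")
+#   libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME my_linker_flags)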
+
+# Runs the libgav1 compiler tests. This macro builds up the list of list var(s)
+# that is passed to libgav1_test_cxx_flag().
+#
+# Note: libgav1_set_build_definitions() must be called before this macro.
+macro(libgav1_set_cxx_flags)
+ unset(cxx_flag_lists)
+
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+ list(APPEND cxx_flag_lists libgav1_base_cxx_flags)
+ endif()
+
+ # Append clang flags after the base set to allow -Wno* overrides to take
+ # effect. Some of the base flags may enable a large set of warnings, e.g.,
+ # -Wall.
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ list(APPEND cxx_flag_lists libgav1_clang_cxx_flags)
+ endif()
+
+ if(MSVC)
+ list(APPEND cxx_flag_lists libgav1_msvc_cxx_flags)
+ endif()
+
+ if(LIBGAV1_VERBOSE)
+ if(cxx_flag_lists)
+ libgav1_set_and_stringify(DEST cxx_flags SOURCE_VARS ${cxx_flag_lists})
+ message("libgav1_set_cxx_flags: internal CXX flags: ${cxx_flags}")
+ endif()
+ endif()
+
+ if(LIBGAV1_CXX_FLAGS)
+ list(APPEND cxx_flag_lists LIBGAV1_CXX_FLAGS)
+ if(LIBGAV1_VERBOSE)
+ message("libgav1_set_cxx_flags: user CXX flags: ${LIBGAV1_CXX_FLAGS}")
+ endif()
+ endif()
+
+ libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES ${cxx_flag_lists})
+endmacro()
+
+# Sets LIBGAV1_TEST_C_FLAGS and LIBGAV1_TEST_CXX_FLAGS.
+#
+# Note: libgav1_set_cxx_flags() must be called before this macro. Furthermore,
+# the call to this macro should be made after all additions to LIBGAV1_CXX_FLAGS
+# are complete.
+macro(libgav1_set_test_flags)
+ if(LIBGAV1_ENABLE_TESTS)
+ set(LIBGAV1_TEST_CXX_FLAGS ${LIBGAV1_CXX_FLAGS})
+ list(FILTER LIBGAV1_TEST_CXX_FLAGS EXCLUDE REGEX "-Wframe-larger-than")
+
+ if(NOT CMAKE_CXX_COMPILER_ID STREQUAL CMAKE_C_COMPILER_ID)
+ message(
+ FATAL_ERROR
+ "C/CXX compiler mismatch (${CMAKE_C_COMPILER_ID} vs"
+ " ${CMAKE_CXX_COMPILER_ID})! Compiler flags are only tested using"
+ " CMAKE_CXX_COMPILER, rerun cmake with CMAKE_C_COMPILER set to the"
+ " C compiler from the same package as CMAKE_CXX_COMPILER to ensure"
+ " the build completes successfully.")
+ endif()
+ set(LIBGAV1_TEST_C_FLAGS ${LIBGAV1_TEST_CXX_FLAGS})
+ list(FILTER LIBGAV1_TEST_C_FLAGS EXCLUDE REGEX
+ "-fvisibility-inlines-hidden")
+ endif()
+endmacro()
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_ 1)
+
+# Kills build generation using message(FATAL_ERROR) and outputs all data passed
+# to the console via use of $ARGN.
+macro(libgav1_die)
+ # macro parameters are not variables so a temporary is needed to work with
+ # list().
+ set(msg ${ARGN})
+ # message(${ARGN}) will merge all list elements with no separator while
+ # "${ARGN}" will output the list as a ';' delimited string.
+ list(JOIN msg " " msg)
+ message(FATAL_ERROR "${msg}")
+endmacro()
+
+# Converts semi-colon delimited list variable(s) to string. Output is written to
+# variable supplied via the DEST parameter. Input is from an expanded variable
+# referenced by SOURCE and/or variable(s) referenced by SOURCE_VARS.
+macro(libgav1_set_and_stringify)
+ set(optional_args)
+ set(single_value_args DEST SOURCE_VAR)
+ set(multi_value_args SOURCE SOURCE_VARS)
+ cmake_parse_arguments(sas "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT sas_DEST OR NOT (sas_SOURCE OR sas_SOURCE_VARS))
+ libgav1_die("libgav1_set_and_stringify: DEST and at least one of SOURCE "
+ "SOURCE_VARS required.")
+ endif()
+
+ unset(${sas_DEST})
+
+ if(sas_SOURCE)
+ # $sas_SOURCE is one or more expanded variables, just copy the values to
+ # $sas_DEST.
+ set(${sas_DEST} "${sas_SOURCE}")
+ endif()
+
+ if(sas_SOURCE_VARS)
+ # $sas_SOURCE_VARS is one or more variable names. Each iteration expands a
+ # variable and appends it to $sas_DEST.
+ foreach(source_var ${sas_SOURCE_VARS})
+ set(${sas_DEST} "${${sas_DEST}} ${${source_var}}")
+ endforeach()
+
+    # Because $sas_DEST can be empty when entering this scope, leading
+    # whitespace can be introduced to $sas_DEST on the first iteration of the
+    # above loop. Remove it:
+ string(STRIP "${${sas_DEST}}" ${sas_DEST})
+ endif()
+
+ # Lists in CMake are simply semicolon delimited strings, so stringification is
+ # just a find and replace of the semicolon.
+ string(REPLACE ";" " " ${sas_DEST} "${${sas_DEST}}")
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("libgav1_set_and_stringify: ${sas_DEST}=${${sas_DEST}}")
+ endif()
+endmacro()
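+
+# A minimal usage sketch (variable names are illustrative):
+#
+#   set(my_flags "-Wall;-Wextra")
+#   libgav1_set_and_stringify(DEST my_string SOURCE_VARS my_flags)
+#   # my_string is now "-Wall -Wextra".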
+
+# Creates a dummy source file in $LIBGAV1_GENERATED_SOURCES_DIRECTORY and adds
+# it to the specified target. Optionally adds its path to a list variable.
+#
+# libgav1_create_dummy_source_file(<TARGET <target> BASENAME <basename of file>>
+# [LISTVAR <list variable>])
+macro(libgav1_create_dummy_source_file)
+ set(optional_args)
+ set(single_value_args TARGET BASENAME LISTVAR)
+ set(multi_value_args)
+ cmake_parse_arguments(cdsf "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT cdsf_TARGET OR NOT cdsf_BASENAME)
+ libgav1_die(
+ "libgav1_create_dummy_source_file: TARGET and BASENAME required.")
+ endif()
+
+ if(NOT LIBGAV1_GENERATED_SOURCES_DIRECTORY)
+ set(LIBGAV1_GENERATED_SOURCES_DIRECTORY "${libgav1_build}/gen_src")
+ endif()
+
+ set(dummy_source_dir "${LIBGAV1_GENERATED_SOURCES_DIRECTORY}")
+ set(dummy_source_file
+ "${dummy_source_dir}/libgav1_${cdsf_TARGET}_${cdsf_BASENAME}.cc")
+ set(dummy_source_code
+ "// Generated file. DO NOT EDIT!\n"
+ "// C++ source file created for target ${cdsf_TARGET}.\n"
+ "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void)\;\n"
+ "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void) {}\n")
+ file(WRITE "${dummy_source_file}" ${dummy_source_code})
+
+ target_sources(${cdsf_TARGET} PRIVATE ${dummy_source_file})
+
+ if(cdsf_LISTVAR)
+ list(APPEND ${cdsf_LISTVAR} "${dummy_source_file}")
+ endif()
+endmacro()
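+
+# A hypothetical usage sketch (names are illustrative):
+#
+#   libgav1_create_dummy_source_file(TARGET libgav1_utils BASENAME utils
+#                                    LISTVAR utils_sources)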
+
+# Loads the version components from $libgav1_source/gav1/version.h and sets the
+# corresponding CMake variables:
+# - LIBGAV1_MAJOR_VERSION
+# - LIBGAV1_MINOR_VERSION
+# - LIBGAV1_PATCH_VERSION
+# - LIBGAV1_VERSION, which is:
+# - $LIBGAV1_MAJOR_VERSION.$LIBGAV1_MINOR_VERSION.$LIBGAV1_PATCH_VERSION
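+#
+# version.h is expected to contain lines of the form (values illustrative):
+#   #define LIBGAV1_MAJOR_VERSION 0
+#   #define LIBGAV1_MINOR_VERSION 16
+#   #define LIBGAV1_PATCH_VERSION 3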
+macro(libgav1_load_version_info)
+ file(STRINGS "${libgav1_source}/gav1/version.h" version_file_strings)
+ foreach(str ${version_file_strings})
+ if(str MATCHES "#define LIBGAV1_")
+ if(str MATCHES "#define LIBGAV1_MAJOR_VERSION ")
+ string(REPLACE "#define LIBGAV1_MAJOR_VERSION " "" LIBGAV1_MAJOR_VERSION
+ "${str}")
+ elseif(str MATCHES "#define LIBGAV1_MINOR_VERSION ")
+ string(REPLACE "#define LIBGAV1_MINOR_VERSION " "" LIBGAV1_MINOR_VERSION
+ "${str}")
+ elseif(str MATCHES "#define LIBGAV1_PATCH_VERSION ")
+ string(REPLACE "#define LIBGAV1_PATCH_VERSION " "" LIBGAV1_PATCH_VERSION
+ "${str}")
+ endif()
+ endif()
+ endforeach()
+ set(LIBGAV1_VERSION "${LIBGAV1_MAJOR_VERSION}.${LIBGAV1_MINOR_VERSION}")
+ set(LIBGAV1_VERSION "${LIBGAV1_VERSION}.${LIBGAV1_PATCH_VERSION}")
+endmacro()
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_ 1)
+
+# Sets up the Libgav1 install targets. Must be called after the static library
+# target is created.
+macro(libgav1_setup_install_target)
+ if(NOT (MSVC OR XCODE))
+ include(GNUInstallDirs)
+
+ # pkg-config: libgav1.pc
+ set(prefix "${CMAKE_INSTALL_PREFIX}")
+ set(exec_prefix "\${prefix}")
+ set(libdir "\${prefix}/${CMAKE_INSTALL_LIBDIR}")
+ set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+ set(libgav1_lib_name "libgav1")
+
+ configure_file("${libgav1_root}/cmake/libgav1.pc.template"
+ "${libgav1_build}/libgav1.pc" @ONLY NEWLINE_STYLE UNIX)
+ install(FILES "${libgav1_build}/libgav1.pc"
+ DESTINATION "${prefix}/${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+
+ # CMake config: libgav1-config.cmake
+ set(LIBGAV1_INCLUDE_DIRS "${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+ configure_file("${libgav1_root}/cmake/libgav1-config.cmake.template"
+ "${libgav1_build}/libgav1-config.cmake" @ONLY
+ NEWLINE_STYLE UNIX)
+ install(
+ FILES "${libgav1_build}/libgav1-config.cmake"
+ DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_DATAROOTDIR}/cmake")
+
+ install(
+ FILES ${libgav1_api_includes}
+ DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gav1")
+
+ if(LIBGAV1_ENABLE_EXAMPLES)
+ install(TARGETS gav1_decode DESTINATION
+ "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
+ endif()
+ install(TARGETS libgav1_static DESTINATION
+ "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+ if(BUILD_SHARED_LIBS)
+ install(TARGETS libgav1_shared DESTINATION
+ "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+ endif()
+ endif()
+endmacro()
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_ 1)
+
+# Returns the compiler flag for the SIMD intrinsics suffix specified by the
+# SUFFIX argument via the variable specified by the VARIABLE argument:
+# libgav1_get_intrinsics_flag_for_suffix(SUFFIX <suffix> VARIABLE <var name>)
+macro(libgav1_get_intrinsics_flag_for_suffix)
+ unset(intrinsics_SUFFIX)
+ unset(intrinsics_VARIABLE)
+ unset(optional_args)
+ unset(multi_value_args)
+ set(single_value_args SUFFIX VARIABLE)
+ cmake_parse_arguments(intrinsics "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT (intrinsics_SUFFIX AND intrinsics_VARIABLE))
+ message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: SUFFIX and "
+ "VARIABLE required.")
+ endif()
+
+ if(intrinsics_SUFFIX MATCHES "neon")
+ if(NOT MSVC)
+ set(${intrinsics_VARIABLE} "${LIBGAV1_NEON_INTRINSICS_FLAG}")
+ endif()
+ elseif(intrinsics_SUFFIX MATCHES "avx2")
+ if(MSVC)
+ set(${intrinsics_VARIABLE} "/arch:AVX2")
+ else()
+ set(${intrinsics_VARIABLE} "-mavx2")
+ endif()
+ elseif(intrinsics_SUFFIX MATCHES "sse4")
+ if(NOT MSVC)
+ set(${intrinsics_VARIABLE} "-msse4.1")
+ endif()
+ else()
+ message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: Unknown "
+                        "intrinsics suffix: ${intrinsics_SUFFIX}")
+ endif()
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("libgav1_get_intrinsics_flag_for_suffix: "
+ "suffix:${intrinsics_SUFFIX} flag:${${intrinsics_VARIABLE}}")
+ endif()
+endmacro()
+
+# Processes source files specified by SOURCES and adds intrinsics flags as
+# necessary:
+# libgav1_process_intrinsics_sources(TARGET <target> SOURCES <sources>)
+#
+# Detects the requirement for intrinsics flags using the source file name
+# suffix. Currently supports AVX2, SSE4.1 and NEON.
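+#
+# A hypothetical invocation (the target and source list are illustrative):
+#   libgav1_process_intrinsics_sources(TARGET libgav1_dsp
+#                                      SOURCES ${libgav1_dsp_sources})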
+macro(libgav1_process_intrinsics_sources)
+ unset(arg_TARGET)
+ unset(arg_SOURCES)
+ unset(optional_args)
+ set(single_value_args TARGET)
+ set(multi_value_args SOURCES)
+ cmake_parse_arguments(arg "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+ if(NOT (arg_TARGET AND arg_SOURCES))
+ message(FATAL_ERROR "libgav1_process_intrinsics_sources: TARGET and "
+ "SOURCES required.")
+ endif()
+
+ if(LIBGAV1_ENABLE_AVX2 AND libgav1_have_avx2)
+ unset(avx2_sources)
+ list(APPEND avx2_sources ${arg_SOURCES})
+
+ list(FILTER avx2_sources INCLUDE REGEX
+ "${libgav1_avx2_source_file_suffix}$")
+
+ if(avx2_sources)
+ unset(avx2_flags)
+ libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+ ${libgav1_avx2_source_file_suffix}
+ VARIABLE avx2_flags)
+ if(avx2_flags)
+ libgav1_set_compiler_flags_for_sources(SOURCES ${avx2_sources} FLAGS
+ ${avx2_flags})
+ endif()
+ endif()
+ endif()
+
+ if(LIBGAV1_ENABLE_SSE4_1 AND libgav1_have_sse4)
+ unset(sse4_sources)
+ list(APPEND sse4_sources ${arg_SOURCES})
+
+ list(FILTER sse4_sources INCLUDE REGEX
+ "${libgav1_sse4_source_file_suffix}$")
+
+ if(sse4_sources)
+ unset(sse4_flags)
+ libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+ ${libgav1_sse4_source_file_suffix}
+ VARIABLE sse4_flags)
+ if(sse4_flags)
+ libgav1_set_compiler_flags_for_sources(SOURCES ${sse4_sources} FLAGS
+ ${sse4_flags})
+ endif()
+ endif()
+ endif()
+
+ if(LIBGAV1_ENABLE_NEON AND libgav1_have_neon)
+ unset(neon_sources)
+ list(APPEND neon_sources ${arg_SOURCES})
+ list(FILTER neon_sources INCLUDE REGEX
+ "${libgav1_neon_source_file_suffix}$")
+
+ if(neon_sources AND LIBGAV1_NEON_INTRINSICS_FLAG)
+ unset(neon_flags)
+ libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+ ${libgav1_neon_source_file_suffix}
+ VARIABLE neon_flags)
+ if(neon_flags)
+ libgav1_set_compiler_flags_for_sources(SOURCES ${neon_sources} FLAGS
+ ${neon_flags})
+ endif()
+ endif()
+ endif()
+endmacro()
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_ 1)
+
+# Simple wrapper for CMake's builtin option command that tracks libgav1's build
+# options in the list variable $libgav1_options.
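+#
+# Example (the option shown is illustrative):
+#   libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables NEON."
+#                  VALUE ON)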
+macro(libgav1_option)
+ unset(option_NAME)
+ unset(option_HELPSTRING)
+ unset(option_VALUE)
+ unset(optional_args)
+ unset(multi_value_args)
+ set(single_value_args NAME HELPSTRING VALUE)
+ cmake_parse_arguments(option "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT (option_NAME AND option_HELPSTRING AND DEFINED option_VALUE))
+ message(FATAL_ERROR "libgav1_option: NAME, HELPSTRING and VALUE required.")
+ endif()
+
+ option(${option_NAME} ${option_HELPSTRING} ${option_VALUE})
+
+ if(LIBGAV1_VERBOSE GREATER 2)
+ message("--------- libgav1_option ---------\n"
+ "option_NAME=${option_NAME}\n"
+ "option_HELPSTRING=${option_HELPSTRING}\n"
+ "option_VALUE=${option_VALUE}\n"
+ "------------------------------------------\n")
+ endif()
+
+ list(APPEND libgav1_options ${option_NAME})
+ list(REMOVE_DUPLICATES libgav1_options)
+endmacro()
+
+# Dumps the $libgav1_options list via CMake message command.
+macro(libgav1_dump_options)
+ foreach(option_name ${libgav1_options})
+ message("${option_name}: ${${option_name}}")
+ endforeach()
+endmacro()
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_ 1)
+
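+# Configures sanitizer compiler and linker flags based on $LIBGAV1_SANITIZE,
+# which is assumed to hold an -fsanitize= spelling (e.g. "address"). Does
+# nothing for MSVC builds.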
+macro(libgav1_configure_sanitizer)
+ if(LIBGAV1_SANITIZE AND NOT MSVC)
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ if(LIBGAV1_SANITIZE MATCHES "cfi")
+ list(APPEND LIBGAV1_CXX_FLAGS "-flto" "-fno-sanitize-trap=cfi")
+ list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-flto" "-fno-sanitize-trap=cfi"
+ "-fuse-ld=gold")
+ endif()
+
+ if(${CMAKE_SIZEOF_VOID_P} EQUAL 4
+ AND LIBGAV1_SANITIZE MATCHES "integer|undefined")
+ list(APPEND LIBGAV1_EXE_LINKER_FLAGS "--rtlib=compiler-rt" "-lgcc_s")
+ endif()
+ endif()
+
+ list(APPEND LIBGAV1_CXX_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}")
+ list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}")
+
+ # Make sanitizer callstacks accurate.
+ list(APPEND LIBGAV1_CXX_FLAGS "-fno-omit-frame-pointer"
+ "-fno-optimize-sibling-calls")
+
+ # Check the linker flags first as they may be required in the compile check
+ # to avoid undefined symbols related to the sanitizer.
+ libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME LIBGAV1_EXE_LINKER_FLAGS)
+ libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED)
+ endif()
+endmacro()
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_
+set(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_ 1)
+
+if(LIBGAV1_IDE_FOLDER)
+ set(LIBGAV1_EXAMPLES_IDE_FOLDER "${LIBGAV1_IDE_FOLDER}/examples")
+ set(LIBGAV1_TESTS_IDE_FOLDER "${LIBGAV1_IDE_FOLDER}/tests")
+else()
+ set(LIBGAV1_EXAMPLES_IDE_FOLDER "libgav1_examples")
+ set(LIBGAV1_TESTS_IDE_FOLDER "libgav1_tests")
+endif()
+
+# Resets list variables used to track libgav1 targets.
+macro(libgav1_reset_target_lists)
+ unset(libgav1_targets)
+ unset(libgav1_exe_targets)
+ unset(libgav1_lib_targets)
+ unset(libgav1_objlib_targets)
+ unset(libgav1_sources)
+ unset(libgav1_test_targets)
+endmacro()
+
+# Creates an executable target. The target name is passed as a parameter to the
+# NAME argument, and the sources passed as a parameter to the SOURCES argument:
+# libgav1_add_executable(NAME <name> SOURCES <sources> [optional args])
+#
+# Optional args:
+# cmake-format: off
+# - OUTPUT_NAME: Override output file basename. Target basename defaults to
+# NAME.
+# - TEST: Flag. Presence means treat executable as a test.
+# - DEFINES: List of preprocessor macro definitions.
+# - INCLUDES: List of include directories for the target.
+# - COMPILE_FLAGS: List of compiler flags for the target.
+# - LINK_FLAGS: List of linker flags for the target.
+# - OBJLIB_DEPS: List of CMake object library target dependencies.
+# - LIB_DEPS: List of CMake library dependencies.
+# cmake-format: on
+#
+# Sources passed to this macro are added to $libgav1_test_sources when TEST is
+# specified. Otherwise sources are added to $libgav1_sources.
+#
+# Targets passed to this macro are always added to $libgav1_targets. When TEST
+# is specified targets are also added to the list $libgav1_test_targets.
+# Otherwise targets are added to $libgav1_exe_targets.
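+#
+# A minimal illustrative call (the source path is hypothetical):
+#   libgav1_add_executable(NAME gav1_decode
+#                          SOURCES examples/gav1_decode.cc
+#                          LIB_DEPS libgav1_static)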
+macro(libgav1_add_executable)
+ unset(exe_TEST)
+ unset(exe_TEST_DEFINES_MAIN)
+ unset(exe_NAME)
+ unset(exe_OUTPUT_NAME)
+ unset(exe_SOURCES)
+ unset(exe_DEFINES)
+ unset(exe_INCLUDES)
+ unset(exe_COMPILE_FLAGS)
+ unset(exe_LINK_FLAGS)
+ unset(exe_OBJLIB_DEPS)
+ unset(exe_LIB_DEPS)
+ set(optional_args TEST)
+ set(single_value_args NAME OUTPUT_NAME)
+ set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+ OBJLIB_DEPS LIB_DEPS)
+
+ cmake_parse_arguments(exe "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("--------- libgav1_add_executable ---------\n"
+ "exe_TEST=${exe_TEST}\n"
+ "exe_TEST_DEFINES_MAIN=${exe_TEST_DEFINES_MAIN}\n"
+ "exe_NAME=${exe_NAME}\n"
+ "exe_OUTPUT_NAME=${exe_OUTPUT_NAME}\n"
+ "exe_SOURCES=${exe_SOURCES}\n"
+ "exe_DEFINES=${exe_DEFINES}\n"
+ "exe_INCLUDES=${exe_INCLUDES}\n"
+ "exe_COMPILE_FLAGS=${exe_COMPILE_FLAGS}\n"
+ "exe_LINK_FLAGS=${exe_LINK_FLAGS}\n"
+ "exe_OBJLIB_DEPS=${exe_OBJLIB_DEPS}\n"
+ "exe_LIB_DEPS=${exe_LIB_DEPS}\n"
+ "------------------------------------------\n")
+ endif()
+
+ if(NOT (exe_NAME AND exe_SOURCES))
+ message(FATAL_ERROR "libgav1_add_executable: NAME and SOURCES required.")
+ endif()
+
+ list(APPEND libgav1_targets ${exe_NAME})
+ if(exe_TEST)
+ list(APPEND libgav1_test_targets ${exe_NAME})
+ list(APPEND libgav1_test_sources ${exe_SOURCES})
+ else()
+ list(APPEND libgav1_exe_targets ${exe_NAME})
+ list(APPEND libgav1_sources ${exe_SOURCES})
+ endif()
+
+ add_executable(${exe_NAME} ${exe_SOURCES})
+ if(exe_TEST)
+ add_test(NAME ${exe_NAME} COMMAND ${exe_NAME})
+ set_property(TARGET ${exe_NAME} PROPERTY FOLDER ${LIBGAV1_TESTS_IDE_FOLDER})
+ else()
+ set_property(TARGET ${exe_NAME}
+ PROPERTY FOLDER ${LIBGAV1_EXAMPLES_IDE_FOLDER})
+ endif()
+
+ if(exe_OUTPUT_NAME)
+ set_target_properties(${exe_NAME} PROPERTIES OUTPUT_NAME ${exe_OUTPUT_NAME})
+ endif()
+
+ libgav1_process_intrinsics_sources(TARGET ${exe_NAME} SOURCES ${exe_SOURCES})
+
+ if(exe_DEFINES)
+ target_compile_definitions(${exe_NAME} PRIVATE ${exe_DEFINES})
+ endif()
+
+ if(exe_INCLUDES)
+ target_include_directories(${exe_NAME} PRIVATE ${exe_INCLUDES})
+ endif()
+
+ unset(exe_LIBGAV1_COMPILE_FLAGS)
+ if(exe_TEST)
+ list(FILTER exe_SOURCES INCLUDE REGEX "\\.c$")
+ list(LENGTH exe_SOURCES exe_SOURCES_length)
+ if(exe_SOURCES_length EQUAL 0)
+ set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_CXX_FLAGS})
+ else()
+ set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_C_FLAGS})
+ endif()
+ else()
+ set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_CXX_FLAGS})
+ endif()
+
+ if(exe_COMPILE_FLAGS OR exe_LIBGAV1_COMPILE_FLAGS)
+ target_compile_options(${exe_NAME}
+ PRIVATE ${exe_COMPILE_FLAGS}
+ ${exe_LIBGAV1_COMPILE_FLAGS})
+ endif()
+
+ if(exe_LINK_FLAGS OR LIBGAV1_EXE_LINKER_FLAGS)
+ list(APPEND exe_LINK_FLAGS "${LIBGAV1_EXE_LINKER_FLAGS}")
+ if(${CMAKE_VERSION} VERSION_LESS "3.13")
+ # LINK_FLAGS is managed as a string.
+ libgav1_set_and_stringify(SOURCE "${exe_LINK_FLAGS}" DEST exe_LINK_FLAGS)
+ set_target_properties(${exe_NAME}
+ PROPERTIES LINK_FLAGS "${exe_LINK_FLAGS}")
+ else()
+ target_link_options(${exe_NAME} PRIVATE ${exe_LINK_FLAGS})
+ endif()
+ endif()
+
+ if(exe_OBJLIB_DEPS)
+ foreach(objlib_dep ${exe_OBJLIB_DEPS})
+ target_sources(${exe_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+ endforeach()
+ endif()
+
+ if(CMAKE_THREAD_LIBS_INIT)
+ list(APPEND exe_LIB_DEPS ${CMAKE_THREAD_LIBS_INIT})
+ endif()
+
+ if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
+ target_compile_definitions(${exe_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+ endif()
+
+ if(exe_LIB_DEPS)
+ unset(exe_static)
+ if("${CMAKE_EXE_LINKER_FLAGS} ${LIBGAV1_EXE_LINKER_FLAGS}" MATCHES "static")
+ set(exe_static ON)
+ endif()
+
+ if(exe_static AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+ # Third party dependencies can introduce dependencies on system and test
+ # libraries. Since the target created here is an executable, and CMake
+ # does not provide a method of controlling order of link dependencies,
+ # wrap all of the dependencies of this target in start/end group flags to
+ # ensure that dependencies of third party targets can be resolved when
+ # those dependencies happen to be resolved by dependencies of the current
+ # target.
+ list(INSERT exe_LIB_DEPS 0 -Wl,--start-group)
+ list(APPEND exe_LIB_DEPS -Wl,--end-group)
+ endif()
+ target_link_libraries(${exe_NAME} PRIVATE ${exe_LIB_DEPS})
+ endif()
+endmacro()
+
+# Creates a library target of the specified type. The target name is passed as a
+# parameter to the NAME argument, the type as a parameter to the TYPE argument,
+# and the sources passed as a parameter to the SOURCES argument:
+# libgav1_add_library(NAME <name> TYPE <type> SOURCES <sources> [optional args])
+#
+# Optional args:
+# cmake-format: off
+# - OUTPUT_NAME: Override output file basename. Target basename defaults to
+# NAME. OUTPUT_NAME is ignored when BUILD_SHARED_LIBS is enabled and CMake
+# is generating a build for which MSVC or WIN32 are true. This is to avoid
+# output basename collisions with DLL import libraries.
+# - TEST: Flag. Presence means treat library as a test.
+# - DEFINES: List of preprocessor macro definitions.
+# - INCLUDES: List of include directories for the target.
+# - COMPILE_FLAGS: List of compiler flags for the target.
+# - LINK_FLAGS: List of linker flags for the target.
+# - OBJLIB_DEPS: List of CMake object library target dependencies.
+# - LIB_DEPS: List of CMake library dependencies.
+# - PUBLIC_INCLUDES: List of include paths to export to dependents.
+# cmake-format: on
+#
+# Sources passed to the macro are added to the lists tracking libgav1 sources:
+# cmake-format: off
+# - When TEST is specified sources are added to $libgav1_test_sources.
+# - Otherwise sources are added to $libgav1_sources.
+# cmake-format: on
+#
+# Targets passed to this macro are added to the lists tracking libgav1 targets:
+# cmake-format: off
+# - Targets are always added to $libgav1_targets.
+# - When the TEST flag is specified, targets are added to
+# $libgav1_test_targets.
+# - When TEST is not specified:
+# - Libraries of type SHARED are added to $libgav1_dylib_targets.
+# - Libraries of type OBJECT are added to $libgav1_objlib_targets.
+# - Libraries of type STATIC are added to $libgav1_lib_targets.
+# cmake-format: on
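+#
+# A minimal illustrative call (the list variables are hypothetical):
+#   libgav1_add_library(NAME libgav1_decoder TYPE OBJECT
+#                       SOURCES ${libgav1_decoder_sources}
+#                       DEFINES ${libgav1_defines}
+#                       INCLUDES ${libgav1_include_paths})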
+macro(libgav1_add_library)
+ unset(lib_TEST)
+ unset(lib_NAME)
+ unset(lib_OUTPUT_NAME)
+ unset(lib_TYPE)
+ unset(lib_SOURCES)
+ unset(lib_DEFINES)
+ unset(lib_INCLUDES)
+ unset(lib_COMPILE_FLAGS)
+ unset(lib_LINK_FLAGS)
+ unset(lib_OBJLIB_DEPS)
+ unset(lib_LIB_DEPS)
+ unset(lib_PUBLIC_INCLUDES)
+ set(optional_args TEST)
+ set(single_value_args NAME OUTPUT_NAME TYPE)
+ set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+ OBJLIB_DEPS LIB_DEPS PUBLIC_INCLUDES)
+
+ cmake_parse_arguments(lib "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("--------- libgav1_add_library ---------\n"
+ "lib_TEST=${lib_TEST}\n"
+ "lib_NAME=${lib_NAME}\n"
+ "lib_OUTPUT_NAME=${lib_OUTPUT_NAME}\n"
+ "lib_TYPE=${lib_TYPE}\n"
+ "lib_SOURCES=${lib_SOURCES}\n"
+ "lib_DEFINES=${lib_DEFINES}\n"
+ "lib_INCLUDES=${lib_INCLUDES}\n"
+ "lib_COMPILE_FLAGS=${lib_COMPILE_FLAGS}\n"
+ "lib_LINK_FLAGS=${lib_LINK_FLAGS}\n"
+ "lib_OBJLIB_DEPS=${lib_OBJLIB_DEPS}\n"
+ "lib_LIB_DEPS=${lib_LIB_DEPS}\n"
+ "lib_PUBLIC_INCLUDES=${lib_PUBLIC_INCLUDES}\n"
+ "---------------------------------------\n")
+ endif()
+
+ if(NOT (lib_NAME AND lib_TYPE AND lib_SOURCES))
+ message(FATAL_ERROR "libgav1_add_library: NAME, TYPE and SOURCES required.")
+ endif()
+
+ list(APPEND libgav1_targets ${lib_NAME})
+ if(lib_TEST)
+ list(APPEND libgav1_test_targets ${lib_NAME})
+ list(APPEND libgav1_test_sources ${lib_SOURCES})
+ else()
+ list(APPEND libgav1_sources ${lib_SOURCES})
+ if(lib_TYPE STREQUAL OBJECT)
+ list(APPEND libgav1_objlib_targets ${lib_NAME})
+ elseif(lib_TYPE STREQUAL SHARED)
+ list(APPEND libgav1_dylib_targets ${lib_NAME})
+ elseif(lib_TYPE STREQUAL STATIC)
+ list(APPEND libgav1_lib_targets ${lib_NAME})
+ else()
+ message(WARNING "libgav1_add_library: Unhandled type: ${lib_TYPE}")
+ endif()
+ endif()
+
+ add_library(${lib_NAME} ${lib_TYPE} ${lib_SOURCES})
+ libgav1_process_intrinsics_sources(TARGET ${lib_NAME} SOURCES ${lib_SOURCES})
+
+ if(lib_OUTPUT_NAME)
+ if(NOT (BUILD_SHARED_LIBS AND (MSVC OR WIN32)))
+ set_target_properties(${lib_NAME}
+ PROPERTIES OUTPUT_NAME ${lib_OUTPUT_NAME})
+ endif()
+ endif()
+
+ if(lib_DEFINES)
+ target_compile_definitions(${lib_NAME} PRIVATE ${lib_DEFINES})
+ endif()
+
+ if(lib_INCLUDES)
+ target_include_directories(${lib_NAME} PRIVATE ${lib_INCLUDES})
+ endif()
+
+ if(lib_PUBLIC_INCLUDES)
+ target_include_directories(${lib_NAME} PUBLIC ${lib_PUBLIC_INCLUDES})
+ endif()
+
+ if(lib_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+ target_compile_options(${lib_NAME}
+ PRIVATE ${lib_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+ endif()
+
+ if(lib_LINK_FLAGS)
+ set_target_properties(${lib_NAME} PROPERTIES LINK_FLAGS ${lib_LINK_FLAGS})
+ endif()
+
+ if(lib_OBJLIB_DEPS)
+ foreach(objlib_dep ${lib_OBJLIB_DEPS})
+ target_sources(${lib_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+ endforeach()
+ endif()
+
+ if(lib_LIB_DEPS)
+ if(lib_TYPE STREQUAL STATIC)
+ set(link_type PUBLIC)
+ else()
+ set(link_type PRIVATE)
+ if(lib_TYPE STREQUAL SHARED AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+ # The libgav1 shared object uses the static libgav1 as input to turn it
+ # into a shared object. Include everything from the static library in
+ # the shared object.
+ if(APPLE)
+ list(INSERT lib_LIB_DEPS 0 -Wl,-force_load)
+ else()
+ list(INSERT lib_LIB_DEPS 0 -Wl,--whole-archive)
+ list(APPEND lib_LIB_DEPS -Wl,--no-whole-archive)
+ endif()
+ endif()
+ endif()
+ target_link_libraries(${lib_NAME} ${link_type} ${lib_LIB_DEPS})
+ endif()
+
+ if(NOT MSVC AND lib_NAME MATCHES "^lib")
+ # Non-MSVC generators prepend lib to static lib target file names. Libgav1
+ # already includes lib in its name. Avoid naming output files liblib*.
+ set_target_properties(${lib_NAME} PROPERTIES PREFIX "")
+ endif()
+
+ if(lib_TYPE STREQUAL SHARED AND NOT MSVC)
+ set_target_properties(${lib_NAME}
+ PROPERTIES VERSION ${LIBGAV1_SOVERSION} SOVERSION
+ ${LIBGAV1_SOVERSION_MAJOR})
+ endif()
+
+ if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
+ if(lib_TYPE STREQUAL SHARED)
+ target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=1")
+ else()
+ target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+ endif()
+ endif()
+
+ # Determine if $lib_NAME is a header only target.
+ set(sources_list ${lib_SOURCES})
+ list(FILTER sources_list INCLUDE REGEX cc$)
+ if(NOT sources_list)
+ if(NOT XCODE)
+ # This is a header only target. Tell CMake the link language.
+ set_target_properties(${lib_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+ else()
+ # The Xcode generator ignores LINKER_LANGUAGE. Add a dummy cc file.
+ libgav1_create_dummy_source_file(TARGET ${lib_NAME} BASENAME ${lib_NAME})
+ endif()
+ endif()
+
+ if(lib_TEST)
+ set_property(TARGET ${lib_NAME} PROPERTY FOLDER ${LIBGAV1_TESTS_IDE_FOLDER})
+ else()
+ set(sources_list ${lib_SOURCES})
+ list(FILTER sources_list INCLUDE REGEX examples)
+ if(sources_list)
+ set_property(TARGET ${lib_NAME}
+ PROPERTY FOLDER ${LIBGAV1_EXAMPLES_IDE_FOLDER})
+ else()
+ set_property(TARGET ${lib_NAME} PROPERTY FOLDER ${LIBGAV1_IDE_FOLDER})
+ endif()
+ endif()
+endmacro()
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_ 1)
+
+# Halts generation when $variable_name does not refer to a directory that
+# exists.
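+#
+# Example (illustrative):
+#   libgav1_variable_must_be_directory(libgav1_root)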
+macro(libgav1_variable_must_be_directory variable_name)
+ if("${variable_name}" STREQUAL "")
+ message(
+ FATAL_ERROR
+ "Empty variable_name passed to libgav1_variable_must_be_directory.")
+ endif()
+
+ if("${${variable_name}}" STREQUAL "")
+ message(
+ FATAL_ERROR
+ "Empty variable ${variable_name} is required to build libgav1.")
+ endif()
+
+ if(NOT IS_DIRECTORY "${${variable_name}}")
+ message(
+ FATAL_ERROR
+ "${variable_name}, which is ${${variable_name}}, does not refer to a\n"
+ "directory.")
+ endif()
+endmacro()
+
+# Adds $var_name to the tracked variables list.
+macro(libgav1_track_configuration_variable var_name)
+ if(LIBGAV1_VERBOSE GREATER 2)
+ message("---- libgav1_track_configuration_variable ----\n"
+ "var_name=${var_name}\n"
+ "----------------------------------------------\n")
+ endif()
+
+ list(APPEND libgav1_configuration_variables ${var_name})
+ list(REMOVE_DUPLICATES libgav1_configuration_variables)
+endmacro()
+
+# Logs current C++ and executable linker flags via CMake's message command.
+macro(libgav1_dump_cmake_flag_variables)
+ unset(flag_variables)
+ list(APPEND flag_variables "CMAKE_CXX_FLAGS_INIT" "CMAKE_CXX_FLAGS"
+ "CMAKE_EXE_LINKER_FLAGS_INIT" "CMAKE_EXE_LINKER_FLAGS")
+ if(CMAKE_BUILD_TYPE)
+ list(APPEND flag_variables "CMAKE_BUILD_TYPE"
+ "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}_INIT"
+ "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}"
+ "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}_INIT"
+ "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}")
+ endif()
+ foreach(flag_variable ${flag_variables})
+ message("${flag_variable}:${${flag_variable}}")
+ endforeach()
+endmacro()
+
+# Dumps the variables tracked in $libgav1_configuration_variables via CMake's
+# message command.
+macro(libgav1_dump_tracked_configuration_variables)
+ foreach(config_variable ${libgav1_configuration_variables})
+ message("${config_variable}:${${config_variable}}")
+ endforeach()
+endmacro()
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_
+set(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_ 1)
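+
+# Usage sketch (the path below is assumed from this file's include guard):
+#   cmake path/to/libgav1 \
+#     -DCMAKE_TOOLCHAIN_FILE=cmake/toolchains/aarch64-linux-gnu.cmake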
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+ set(CROSS aarch64-linux-gnu-)
+endif()
+
+# For c_decoder_test.c and c_version_test.c.
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+set(CMAKE_C_FLAGS_INIT "-march=armv8-a")
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+set(CMAKE_CXX_FLAGS_INIT "-march=armv8-a")
+set(CMAKE_SYSTEM_PROCESSOR "aarch64")
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_
+set(LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_ 1)
+
+# Additional ANDROID_* settings are available, see:
+# https://developer.android.com/ndk/guides/cmake#variables
+
+if(NOT ANDROID_PLATFORM)
+ set(ANDROID_PLATFORM android-21)
+endif()
+
+# Choose target architecture with:
+#
+# -DANDROID_ABI={armeabi-v7a,armeabi-v7a with NEON,arm64-v8a,x86,x86_64}
+if(NOT ANDROID_ABI)
+ set(ANDROID_ABI arm64-v8a)
+endif()
+
+# Force arm mode for 32-bit arm targets (instead of the default thumb) to
+# improve performance.
+if(ANDROID_ABI MATCHES "^armeabi" AND NOT ANDROID_ARM_MODE)
+ set(ANDROID_ARM_MODE arm)
+endif()
+
+# Toolchain files don't have access to cached variables:
+# https://gitlab.kitware.com/cmake/cmake/issues/16170. Set an intermediate
+# environment variable the first time this file is loaded.
+if(LIBGAV1_ANDROID_NDK_PATH)
+ set(ENV{LIBGAV1_ANDROID_NDK_PATH} "${LIBGAV1_ANDROID_NDK_PATH}")
+else()
+ set(LIBGAV1_ANDROID_NDK_PATH "$ENV{LIBGAV1_ANDROID_NDK_PATH}")
+endif()
+
+if(NOT LIBGAV1_ANDROID_NDK_PATH)
+ message(FATAL_ERROR "LIBGAV1_ANDROID_NDK_PATH not set.")
+ return()
+endif()
+
+include("${LIBGAV1_ANDROID_NDK_PATH}/build/cmake/android.toolchain.cmake")
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_
+set(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+ set(CROSS arm-linux-gnueabihf-)
+endif()
+
+# For c_decoder_test.c and c_version_test.c.
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+# Note: -march=armv7-a+fp is an alternative to -mfpu with newer versions of
+# gcc:
+# https://gcc.gnu.org/git/?p=gcc.git&a=commit;h=dff2abcbee65dbb4b7ca3ade0f7622ffdc0af391
+set(CMAKE_C_FLAGS_INIT "-march=armv7-a -marm -mfpu=vfpv3")
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm -mfpu=vfpv3")
+set(CMAKE_SYSTEM_PROCESSOR "armv7")
+set(LIBGAV1_NEON_INTRINSICS_FLAG "-mfpu=neon")
--- /dev/null
+# This file is used by git cl to get repository-specific information.
+GERRIT_HOST: True
+CODE_REVIEW_SERVER: chromium-review.googlesource.com
+GERRIT_SQUASH_UPLOADS: False
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <new>
+#include <string>
+#include <vector>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/ivf_parser.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+ _setmode(_fileno(stream), _O_BINARY);
+#endif
+ return stream;
+}
+
+} // namespace
+
+bool FileReader::registered_in_factory_ =
+ FileReaderFactory::RegisterReader(FileReader::Open);
+
+FileReader::~FileReader() {
+ if (owns_file_) fclose(file_);
+}
+
+std::unique_ptr<FileReaderInterface> FileReader::Open(
+ const std::string& file_name, const bool error_tolerant) {
+ if (file_name.empty()) return nullptr;
+
+ FILE* raw_file_ptr;
+
+ bool owns_file = true;
+ if (file_name == "-") {
+ raw_file_ptr = SetBinaryMode(stdin);
+ owns_file = false; // stdin is owned by the Standard C Library.
+ } else {
+ raw_file_ptr = fopen(file_name.c_str(), "rb");
+ }
+
+ if (raw_file_ptr == nullptr) {
+ return nullptr;
+ }
+
+ std::unique_ptr<FileReader> file(
+ new (std::nothrow) FileReader(raw_file_ptr, owns_file, error_tolerant));
+ if (file == nullptr) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+ if (owns_file) fclose(raw_file_ptr);
+ return nullptr;
+ }
+
+ if (!file->ReadIvfFileHeader()) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported file type");
+ return nullptr;
+ }
+
+ // With C++11, to return |file|, an explicit move is required as the return
+ // type differs from the local variable. Overload resolution isn't guaranteed
+ // in this case, though some compilers may adopt the C++14 behavior (C++
+ // Standard Core Language Issue #1579, Return by converting move
+ // constructor):
+ // https://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1579
+ // To keep things simple we opt for the following compatible form.
+ return std::unique_ptr<FileReaderInterface>(file.release());
+}
+
+// IVF Frame Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3 size of frame in bytes (not including the 12-byte header)
+// bytes 4-11 64-bit presentation timestamp
+// bytes 12.. frame data
+bool FileReader::ReadTemporalUnit(std::vector<uint8_t>* const tu_data,
+ int64_t* const timestamp) {
+ if (tu_data == nullptr) return false;
+ tu_data->clear();
+
+ uint8_t header_buffer[kIvfFrameHeaderSize];
+ const size_t num_read = fread(header_buffer, 1, kIvfFrameHeaderSize, file_);
+
+ if (IsEndOfFile()) {
+ if (num_read != 0) {
+ LIBGAV1_EXAMPLES_LOG_ERROR(
+ "Cannot read IVF frame header: Not enough data available");
+ return false;
+ }
+
+ return true;
+ }
+
+ IvfFrameHeader ivf_frame_header;
+ if (!ParseIvfFrameHeader(header_buffer, &ivf_frame_header)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF frame header");
+ if (error_tolerant_) {
+ ivf_frame_header.frame_size =
+ std::min(ivf_frame_header.frame_size, size_t{kMaxTemporalUnitSize});
+ } else {
+ return false;
+ }
+ }
+
+ if (timestamp != nullptr) *timestamp = ivf_frame_header.timestamp;
+
+ tu_data->resize(ivf_frame_header.frame_size);
+ const size_t size_read =
+ fread(tu_data->data(), 1, ivf_frame_header.frame_size, file_);
+ if (size_read != ivf_frame_header.frame_size) {
+ LIBGAV1_EXAMPLES_LOG_ERROR(
+ "Unexpected EOF or I/O error reading frame data");
+ if (error_tolerant_) {
+ tu_data->resize(size_read);
+ } else {
+ return false;
+ }
+ }
+ return true;
+}
+
+// Attempts to read an IVF file header. Returns true on success and false on
+// failure.
+//
+// IVF File Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3 signature: 'DKIF'
+// bytes 4-5 version (should be 0)
+// bytes 6-7 length of header in bytes
+// bytes 8-11 codec FourCC (e.g., 'VP80')
+// bytes 12-13 width in pixels
+// bytes 14-15 height in pixels
+// bytes 16-19 frame rate
+// bytes 20-23 time scale
+// bytes 24-27 number of frames in file
+// bytes 28-31 unused
+//
+// Note: The rate and scale fields correspond to the numerator and denominator
+// of frame rate (fps) or time base (the reciprocal of frame rate) as follows:
+//
+// bytes 16-19 frame rate timebase.den framerate.numerator
+// bytes 20-23 time scale timebase.num framerate.denominator
+bool FileReader::ReadIvfFileHeader() {
+ uint8_t header_buffer[kIvfFileHeaderSize];
+ const size_t num_read = fread(header_buffer, 1, kIvfFileHeaderSize, file_);
+ if (num_read != kIvfFileHeaderSize) {
+ LIBGAV1_EXAMPLES_LOG_ERROR(
+ "Cannot read IVF header: Not enough data available");
+ return false;
+ }
+
+ IvfFileHeader ivf_file_header;
+ if (!ParseIvfFileHeader(header_buffer, &ivf_file_header)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF file header");
+ if (error_tolerant_) {
+ ivf_file_header = {};
+ } else {
+ return false;
+ }
+ }
+
+ width_ = ivf_file_header.width;
+ height_ = ivf_file_header.height;
+ frame_rate_ = ivf_file_header.frame_rate_numerator;
+ time_scale_ = ivf_file_header.frame_rate_denominator;
+ type_ = kFileTypeIvf;
+
+ return true;
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+// Temporal Unit based file reader class. Currently supports only IVF files.
+class FileReader : public FileReaderInterface {
+ public:
+ enum FileType {
+ kFileTypeUnknown,
+ kFileTypeIvf,
+ };
+
+ // Creates and returns a FileReader that reads from |file_name|.
+ // If |error_tolerant| is true, format and read errors are ignored and
+ // ReadTemporalUnit() may return truncated data.
+ // Returns nullptr when the file does not exist, cannot be read, or is not an
+ // IVF file.
+ static std::unique_ptr<FileReaderInterface> Open(const std::string& file_name,
+ bool error_tolerant = false);
+
+ FileReader() = delete;
+ FileReader(const FileReader&) = delete;
+ FileReader& operator=(const FileReader&) = delete;
+
+ // Closes |file_|.
+ ~FileReader() override;
+
+ // Reads a temporal unit from |file_| and writes the data to |tu_data|.
+ // Returns true when:
+ // - A temporal unit is read successfully, or
+ // - At end of file.
+ // When ReadTemporalUnit() is called at the end of the file, it will return
+ // true without writing any data to |tu_data|.
+ //
+ // The |timestamp| pointer is optional: callers not interested in timestamps
+ // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+ // the presentation timestamp from the IVF frame header.
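+ //
+ // A typical read loop (a sketch mirroring the reader tests; |reader| is a
+ // std::unique_ptr<FileReaderInterface> returned by Open()):
+ //   std::vector<uint8_t> tu_data;
+ //   int64_t timestamp;
+ //   while (!reader->IsEndOfFile()) {
+ //     if (!reader->ReadTemporalUnit(&tu_data, &timestamp)) break;
+ //     // An empty |tu_data| after a successful read signals end of file.
+ //   }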
+ /*LIBGAV1_MUST_USE_RESULT*/ bool ReadTemporalUnit(
+ std::vector<uint8_t>* tu_data, int64_t* timestamp) override;
+
+ /*LIBGAV1_MUST_USE_RESULT*/ bool IsEndOfFile() const override {
+ return feof(file_) != 0;
+ }
+
+ // The values returned by these accessors are strictly informative. No
+ // validation is performed when they are read from the IVF file header.
+ size_t width() const override { return width_; }
+ size_t height() const override { return height_; }
+ size_t frame_rate() const override { return frame_rate_; }
+ size_t time_scale() const override { return time_scale_; }
+
+ private:
+ FileReader(FILE* file, bool owns_file, bool error_tolerant)
+ : file_(file), owns_file_(owns_file), error_tolerant_(error_tolerant) {}
+
+ bool ReadIvfFileHeader();
+
+ FILE* file_ = nullptr;
+ size_t width_ = 0;
+ size_t height_ = 0;
+ size_t frame_rate_ = 0;
+ size_t time_scale_ = 0;
+ FileType type_ = kFileTypeUnknown;
+ // True if this object owns file_ and is responsible for closing it when
+ // done.
+ const bool owns_file_;
+ const bool error_tolerant_;
+
+ static bool registered_in_factory_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_constants.h"
+
+namespace libgav1 {
+
+const char kIvfSignature[4] = {'D', 'K', 'I', 'F'};
+const char kAv1FourCcUpper[4] = {'A', 'V', '0', '1'};
+const char kAv1FourCcLower[4] = {'a', 'v', '0', '1'};
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+
+namespace libgav1 {
+
+enum {
+ kIvfHeaderVersion = 0,
+ kIvfFrameHeaderSize = 12,
+ kIvfFileHeaderSize = 32,
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ kMaxTemporalUnitSize = 512 * 1024,
+#else
+ kMaxTemporalUnitSize = 256 * 1024 * 1024,
+#endif
+};
+
+extern const char kIvfSignature[4];
+extern const char kAv1FourCcUpper[4];
+extern const char kAv1FourCcLower[4];
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_factory.h"
+
+#include <new>
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+std::vector<FileReaderFactory::OpenFunction>* GetFileReaderOpenFunctions() {
+ static auto* open_functions =
+ new (std::nothrow) std::vector<FileReaderFactory::OpenFunction>();
+ return open_functions;
+}
+
+} // namespace
+
+bool FileReaderFactory::RegisterReader(OpenFunction open_function) {
+ if (open_function == nullptr) return false;
+ auto* open_functions = GetFileReaderOpenFunctions();
+ const size_t num_readers = open_functions->size();
+ open_functions->push_back(open_function);
+ return open_functions->size() == num_readers + 1;
+}
+
+std::unique_ptr<FileReaderInterface> FileReaderFactory::OpenReader(
+ const std::string& file_name, const bool error_tolerant /*= false*/) {
+ for (auto* open_function : *GetFileReaderOpenFunctions()) {
+ auto reader = open_function(file_name, error_tolerant);
+ if (reader == nullptr) continue;
+ return reader;
+ }
+ LIBGAV1_EXAMPLES_LOG_ERROR("No file reader able to open input");
+ return nullptr;
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+
+#include <memory>
+#include <string>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+class FileReaderFactory {
+ public:
+ using OpenFunction = std::unique_ptr<FileReaderInterface> (*)(
+ const std::string& file_name, bool error_tolerant);
+
+ FileReaderFactory() = delete;
+ FileReaderFactory(const FileReaderFactory&) = delete;
+ FileReaderFactory& operator=(const FileReaderFactory&) = delete;
+ ~FileReaderFactory() = default;
+
+ // Registers the OpenFunction for a FileReaderInterface and returns true when
+ // registration succeeds.
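+ //
+ // Registration is typically performed from a static initializer, as
+ // FileReader does:
+ //   bool FileReader::registered_in_factory_ =
+ //       FileReaderFactory::RegisterReader(FileReader::Open);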
+ static bool RegisterReader(OpenFunction open_function);
+
+ // Passes |file_name| to each OpenFunction until one succeeds. Returns nullptr
+ // when no reader is found for |file_name|. Otherwise a FileReaderInterface is
+ // returned. If |error_tolerant| is true and the reader supports it, some
+ // format and read errors may be ignored and partial data returned.
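+ //
+ // Sketch (the file name is illustrative):
+ //   auto reader = FileReaderFactory::OpenReader("input.ivf");
+ //   if (reader == nullptr) { /* no registered reader opened the file */ }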
+ static std::unique_ptr<FileReaderInterface> OpenReader(
+ const std::string& file_name, bool error_tolerant = false);
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_factory.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+#include <string>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "examples/file_reader_interface.h"
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+class AlwaysFailFileReader : public FileReaderInterface {
+ public:
+ static std::unique_ptr<FileReaderInterface> Open(
+ const std::string& /*file_name*/, bool /*error_tolerant*/) {
+ return nullptr;
+ }
+
+ AlwaysFailFileReader() = delete;
+ AlwaysFailFileReader(const AlwaysFailFileReader&) = delete;
+ AlwaysFailFileReader& operator=(const AlwaysFailFileReader&) = delete;
+ // Note the destructor isn't overridden because the class can never be
+ // instantiated; this avoids an unused function warning.
+ // ~AlwaysFailFileReader() override = default;
+
+ bool ReadTemporalUnit(std::vector<uint8_t>* /*data*/,
+ int64_t* /*pts*/) override {
+ return false;
+ }
+ bool IsEndOfFile() const override { return false; }
+
+ size_t width() const override { return 0; }
+ size_t height() const override { return 0; }
+ size_t frame_rate() const override { return 0; }
+ size_t time_scale() const override { return 0; }
+
+ static bool is_registered_;
+};
+
+class AlwaysOkFileReader : public FileReaderInterface {
+ public:
+ static std::unique_ptr<FileReaderInterface> Open(
+ const std::string& /*file_name*/, bool /*error_tolerant*/) {
+ auto reader = absl::WrapUnique(new (std::nothrow) AlwaysOkFileReader());
+
+ return reader;
+ }
+
+ AlwaysOkFileReader(const AlwaysOkFileReader&) = delete;
+ AlwaysOkFileReader& operator=(const AlwaysOkFileReader&) = delete;
+ ~AlwaysOkFileReader() override = default;
+
+ bool ReadTemporalUnit(std::vector<uint8_t>* /*data*/,
+ int64_t* /*pts*/) override {
+ return true;
+ }
+ bool IsEndOfFile() const override { return true; }
+
+ size_t width() const override { return 1; }
+ size_t height() const override { return 1; }
+ size_t frame_rate() const override { return 1; }
+ size_t time_scale() const override { return 1; }
+
+ static bool is_registered_;
+
+ private:
+ AlwaysOkFileReader() = default;
+};
+
+bool AlwaysFailFileReader::is_registered_ =
+ FileReaderFactory::RegisterReader(AlwaysFailFileReader::Open);
+
+bool AlwaysOkFileReader::is_registered_ =
+ FileReaderFactory::RegisterReader(AlwaysOkFileReader::Open);
+
+TEST(FileReaderFactoryTest, RegistrationFail) {
+ EXPECT_FALSE(FileReaderFactory::RegisterReader(nullptr));
+}
+
+TEST(FileReaderFactoryTest, OpenReader) {
+ ASSERT_TRUE(AlwaysOkFileReader::is_registered_);
+ ASSERT_TRUE(AlwaysFailFileReader::is_registered_);
+
+ auto reader = FileReaderFactory::OpenReader("fake file");
+ EXPECT_NE(reader, nullptr);
+ EXPECT_TRUE(reader->IsEndOfFile());
+ EXPECT_TRUE(reader->ReadTemporalUnit(nullptr, nullptr));
+ EXPECT_EQ(reader->width(), 1);
+ EXPECT_EQ(reader->height(), 1);
+ EXPECT_EQ(reader->frame_rate(), 1);
+ EXPECT_EQ(reader->time_scale(), 1);
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace libgav1 {
+
+class FileReaderInterface {
+ public:
+ FileReaderInterface() = default;
+ FileReaderInterface(const FileReaderInterface&) = delete;
+ FileReaderInterface& operator=(const FileReaderInterface&) = delete;
+
+ FileReaderInterface(FileReaderInterface&&) = default;
+ FileReaderInterface& operator=(FileReaderInterface&&) = default;
+
+ // Closes the file.
+ virtual ~FileReaderInterface() = default;
+
+ // Reads a temporal unit from the file and writes the data to |tu_data|.
+ // Returns true when:
+ // - A temporal unit is read successfully, or
+ // - At end of file.
+ // When ReadTemporalUnit() is called at the end of the file, it will return
+ // true without writing any data to |tu_data|.
+ //
+ // The |timestamp| pointer is optional: callers not interested in timestamps
+ // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+ // the presentation timestamp of the temporal unit.
+ /*LIBGAV1_MUST_USE_RESULT*/ virtual bool ReadTemporalUnit(
+ std::vector<uint8_t>* tu_data, int64_t* timestamp) = 0;
+
+ /*LIBGAV1_MUST_USE_RESULT*/ virtual bool IsEndOfFile() const = 0;
+
+ // The values returned by these accessors are strictly informative. No
+ // validation is performed when they are read from file.
+ virtual size_t width() const = 0;
+ virtual size_t height() const = 0;
+ virtual size_t frame_rate() const = 0;
+ virtual size_t time_scale() const = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "examples/file_reader_interface.h"
+#include "examples/file_reader_test_common.h"
+#include "gtest/gtest.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace {
+
+// For use with tests that expect Open() failure to distinguish failure due to
+// the file contents versus failure due to a missing file.
+bool FileCanBeRead(const std::string& filename) {
+ FILE* const file = fopen(filename.c_str(), "r");
+ if (file != nullptr) {
+ fclose(file);
+ return true;
+ }
+ return false;
+}
+
+TEST(FileReaderTest, FailOpen) {
+ EXPECT_EQ(FileReader::Open(""), nullptr);
+ const std::string filename =
+ test_utils::GetTestInputFilePath("ivf-signature-only");
+ SCOPED_TRACE("Filename: " + filename);
+ EXPECT_TRUE(FileCanBeRead(filename));
+ EXPECT_EQ(FileReader::Open(filename), nullptr);
+}
+
+TEST(FileReaderTest, Open) {
+ const std::string filenames[] = {
+ test_utils::GetTestInputFilePath("five-frames.ivf"),
+ test_utils::GetTestInputFilePath("ivf-header-and-truncated-frame-header"),
+ test_utils::GetTestInputFilePath("ivf-header-only"),
+ test_utils::GetTestInputFilePath("one-frame-truncated.ivf"),
+ test_utils::GetTestInputFilePath("one-frame.ivf"),
+ };
+ for (const auto& filename : filenames) {
+ EXPECT_NE(FileReader::Open(filename), nullptr) << "Filename: " << filename;
+ }
+}
+
+TEST_P(FileReaderFailTest, FailRead) {
+ ASSERT_FALSE(reader_->ReadTemporalUnit(&tu_data_, nullptr));
+}
+
+TEST_P(FileReaderErrorTolerant, ReadThroughEndOfFile) {
+ while (!reader_->IsEndOfFile()) {
+ tu_data_.clear();
+ ASSERT_TRUE(reader_->ReadTemporalUnit(&tu_data_, nullptr));
+ ASSERT_GT(tu_data_.size(), 0);
+ }
+}
+
+TEST_P(FileReaderTestNoTimeStamps, ReadThroughEndOfFile) {
+ while (!reader_->IsEndOfFile()) {
+ tu_data_.clear();
+ ASSERT_TRUE(reader_->ReadTemporalUnit(&tu_data_, nullptr));
+ }
+}
+
+TEST_P(FileReaderTestWithTimeStamps, ReadThroughEndOfFile) {
+ int64_t timestamp = 0;
+ while (!reader_->IsEndOfFile()) {
+ tu_data_.clear();
+ ASSERT_TRUE(reader_->ReadTemporalUnit(&tu_data_, ×tamp));
+ if (!tu_data_.empty()) {
+ last_timestamp_ = timestamp;
+ }
+ }
+ ASSERT_TRUE(tu_data_.empty());
+ ASSERT_EQ(last_timestamp_, expected_last_timestamp_);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ FailRead, FileReaderFailTest,
+ testing::Values(
+ FileReaderTestParameters(FileReader::Open,
+ "ivf-header-and-truncated-frame-header"),
+ FileReaderTestParameters(FileReader::Open, "one-frame-truncated.ivf")));
+
+INSTANTIATE_TEST_SUITE_P(ReadThroughEndOfFile, FileReaderErrorTolerant,
+ testing::Values(FileReaderTestParameters(
+ FileReader::Open, "one-frame-truncated.ivf")));
+
+INSTANTIATE_TEST_SUITE_P(
+ ReadThroughEndOfFile, FileReaderTestNoTimeStamps,
+ testing::Values(FileReaderTestParameters(FileReader::Open, "one-frame.ivf"),
+ FileReaderTestParameters(FileReader::Open,
+ "one-frame-large-timestamp.ivf"),
+ FileReaderTestParameters(FileReader::Open,
+ "five-frames.ivf")));
+
+INSTANTIATE_TEST_SUITE_P(
+ ReadThroughEndOfFile, FileReaderTestWithTimeStamps,
+ testing::Values(
+ FileReaderTestWithTimeStampsParameters(FileReader::Open,
+ "one-frame.ivf", 0),
+ FileReaderTestWithTimeStampsParameters(FileReader::Open,
+ "one-frame-large-timestamp.ivf",
+ 4294967296),
+ FileReaderTestWithTimeStampsParameters(FileReader::Open,
+ "five-frames.ivf", 4)));
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_test_common.h"
+
+#include <ostream>
+
+#include "examples/file_reader.h"
+
+namespace libgav1 {
+
+std::ostream& operator<<(std::ostream& stream,
+ const FileReaderTestParameters& parameters) {
+ stream << "open_function="
+ << ((parameters.open_function == FileReader::Open) ? "FileReader"
+ : "Unknown")
+ << ", file_name=" << parameters.file_name;
+ return stream;
+}
+
+std::ostream& operator<<(
+ std::ostream& stream,
+ const FileReaderTestWithTimeStampsParameters& parameters) {
+ stream << "open_function="
+ << ((parameters.open_function == FileReader::Open) ? "FileReader"
+ : "Unknown")
+ << ", file_name=" << parameters.file_name
+ << ", expected_last_timestamp=" << parameters.expected_last_timestamp;
+ return stream;
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_TEST_COMMON_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_TEST_COMMON_H_
+
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "gtest/gtest.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+
+struct FileReaderTestParameters {
+ FileReaderTestParameters() = default;
+ FileReaderTestParameters(FileReaderFactory::OpenFunction open_function,
+ const char* file_name)
+ : open_function(open_function), file_name(file_name) {}
+ FileReaderTestParameters(const FileReaderTestParameters&) = default;
+ FileReaderTestParameters& operator=(const FileReaderTestParameters&) = delete;
+ FileReaderTestParameters(FileReaderTestParameters&&) = default;
+ FileReaderTestParameters& operator=(FileReaderTestParameters&&) = default;
+ ~FileReaderTestParameters() = default;
+
+ FileReaderFactory::OpenFunction open_function = nullptr;
+ const char* file_name = nullptr;
+};
+
+class FileReaderTestBase {
+ public:
+ FileReaderTestBase() = default;
+ FileReaderTestBase(const FileReaderTestBase&) = delete;
+ FileReaderTestBase& operator=(const FileReaderTestBase&) = delete;
+ FileReaderTestBase(FileReaderTestBase&&) = default;
+ FileReaderTestBase& operator=(FileReaderTestBase&&) = default;
+ ~FileReaderTestBase() = default;
+
+ protected:
+ void OpenReader(const char* file_name,
+ FileReaderFactory::OpenFunction open_function) {
+ file_name_ = test_utils::GetTestInputFilePath(file_name);
+ reader_ = open_function(file_name_, /*error_tolerant=*/false);
+ ASSERT_NE(reader_, nullptr);
+ }
+
+ std::string file_name_;
+ std::unique_ptr<FileReaderInterface> reader_;
+ std::vector<uint8_t> tu_data_;
+};
+
+class FileReaderFailTest
+ : public FileReaderTestBase,
+ public testing::TestWithParam<FileReaderTestParameters> {
+ public:
+ FileReaderFailTest() = default;
+ FileReaderFailTest(const FileReaderFailTest&) = delete;
+ FileReaderFailTest& operator=(const FileReaderFailTest&) = delete;
+ ~FileReaderFailTest() override = default;
+
+ protected:
+ void SetUp() override {
+ OpenReader(GetParam().file_name, GetParam().open_function);
+ }
+};
+
+class FileReaderTestNoTimeStamps
+ : public FileReaderTestBase,
+ public testing::TestWithParam<FileReaderTestParameters> {
+ public:
+ FileReaderTestNoTimeStamps() = default;
+ FileReaderTestNoTimeStamps(const FileReaderTestNoTimeStamps&) = delete;
+ FileReaderTestNoTimeStamps& operator=(const FileReaderTestNoTimeStamps&) =
+ delete;
+ ~FileReaderTestNoTimeStamps() override = default;
+
+ protected:
+ void SetUp() override {
+ OpenReader(GetParam().file_name, GetParam().open_function);
+ }
+};
+
+class FileReaderErrorTolerant
+ : public FileReaderTestBase,
+ public testing::TestWithParam<FileReaderTestParameters> {
+ public:
+ FileReaderErrorTolerant() = default;
+ FileReaderErrorTolerant(const FileReaderErrorTolerant&) = delete;
+ FileReaderErrorTolerant& operator=(const FileReaderErrorTolerant&) = delete;
+ ~FileReaderErrorTolerant() override = default;
+
+ protected:
+ void SetUp() override {
+ file_name_ = test_utils::GetTestInputFilePath(GetParam().file_name);
+ reader_ = GetParam().open_function(file_name_, /*error_tolerant=*/true);
+ ASSERT_NE(reader_, nullptr);
+ }
+};
+
+struct FileReaderTestWithTimeStampsParameters {
+ FileReaderTestWithTimeStampsParameters() = default;
+ FileReaderTestWithTimeStampsParameters(
+ FileReaderFactory::OpenFunction open_function, const char* file_name,
+ int64_t expected_last_timestamp)
+ : open_function(open_function),
+ file_name(file_name),
+ expected_last_timestamp(expected_last_timestamp) {}
+ FileReaderTestWithTimeStampsParameters(
+ const FileReaderTestWithTimeStampsParameters&) = default;
+ FileReaderTestWithTimeStampsParameters& operator=(
+ const FileReaderTestWithTimeStampsParameters&) = delete;
+ FileReaderTestWithTimeStampsParameters(
+ FileReaderTestWithTimeStampsParameters&&) = default;
+ FileReaderTestWithTimeStampsParameters& operator=(
+ FileReaderTestWithTimeStampsParameters&&) = default;
+ ~FileReaderTestWithTimeStampsParameters() = default;
+
+ FileReaderFactory::OpenFunction open_function = nullptr;
+ const char* file_name = nullptr;
+ int64_t expected_last_timestamp = 0;
+};
+
+std::ostream& operator<<(std::ostream& stream,
+ const FileReaderTestParameters& parameters);
+
+std::ostream& operator<<(
+ std::ostream& stream,
+ const FileReaderTestWithTimeStampsParameters& parameters);
+
+class FileReaderTestWithTimeStamps
+ : public FileReaderTestBase,
+ public testing::TestWithParam<FileReaderTestWithTimeStampsParameters> {
+ public:
+ FileReaderTestWithTimeStamps() = default;
+ FileReaderTestWithTimeStamps(const FileReaderTestWithTimeStamps&) = delete;
+ FileReaderTestWithTimeStamps& operator=(const FileReaderTestWithTimeStamps&) =
+ delete;
+ ~FileReaderTestWithTimeStamps() override = default;
+
+ protected:
+ void SetUp() override {
+ FileReaderTestWithTimeStampsParameters parameters = GetParam();
+ OpenReader(parameters.file_name, parameters.open_function);
+ expected_last_timestamp_ = parameters.expected_last_timestamp;
+ }
+
+ int64_t last_timestamp_ = 0;
+ int64_t expected_last_timestamp_ = 0;
+};
+
+} // namespace libgav1
+#endif // LIBGAV1_EXAMPLES_FILE_READER_TEST_COMMON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_writer.h"
+
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <new>
+#include <string>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
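+// On Windows stdout defaults to text mode, which would translate 0x0A bytes
+// into CR/LF pairs and corrupt binary output; force binary mode instead.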
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+ _setmode(_fileno(stream), _O_BINARY);
+#endif
+ return stream;
+}
+
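+// Maps Y4M parameters to a YUV4MPEG2 colorspace token. For example, 8-bit
+// 4:2:0 with an unknown chroma sample position yields "420jpeg", and
+// bitdepths above 8 append "p<bitdepth>" for non-monochrome formats
+// (e.g. "420p10").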
+std::string GetY4mColorSpaceString(
+ const FileWriter::Y4mParameters& y4m_parameters) {
+ std::string color_space_string;
+ switch (y4m_parameters.image_format) {
+ case kImageFormatMonochrome400:
+ color_space_string = "mono";
+ break;
+ case kImageFormatYuv420:
+ if (y4m_parameters.bitdepth == 8) {
+ if (y4m_parameters.chroma_sample_position ==
+ kChromaSamplePositionVertical) {
+ color_space_string = "420mpeg2";
+ } else if (y4m_parameters.chroma_sample_position ==
+ kChromaSamplePositionColocated) {
+ color_space_string = "420";
+ } else {
+ color_space_string = "420jpeg";
+ }
+ } else {
+ color_space_string = "420";
+ }
+ break;
+ case kImageFormatYuv422:
+ color_space_string = "422";
+ break;
+ case kImageFormatYuv444:
+ color_space_string = "444";
+ break;
+ }
+
+ if (y4m_parameters.bitdepth > 8) {
+ const bool monochrome =
+ y4m_parameters.image_format == kImageFormatMonochrome400;
+ if (!monochrome) color_space_string += "p";
+ color_space_string += std::to_string(y4m_parameters.bitdepth);
+ }
+
+ return color_space_string;
+}
+
+} // namespace
+
+FileWriter::~FileWriter() { fclose(file_); }
+
+std::unique_ptr<FileWriter> FileWriter::Open(
+ const std::string& file_name, FileType file_type,
+ const Y4mParameters* const y4m_parameters) {
+ if (file_name.empty() ||
+ (file_type == kFileTypeY4m && y4m_parameters == nullptr) ||
+ (file_type != kFileTypeRaw && file_type != kFileTypeY4m)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Invalid parameters");
+ return nullptr;
+ }
+
+ FILE* raw_file_ptr;
+
+ if (file_name == "-") {
+ raw_file_ptr = SetBinaryMode(stdout);
+ } else {
+ raw_file_ptr = fopen(file_name.c_str(), "wb");
+ }
+
+ if (raw_file_ptr == nullptr) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unable to open output file");
+ return nullptr;
+ }
+
+ std::unique_ptr<FileWriter> file(new (std::nothrow) FileWriter(raw_file_ptr));
+ if (file == nullptr) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+ fclose(raw_file_ptr);
+ return nullptr;
+ }
+
+ if (file_type == kFileTypeY4m && !file->WriteY4mFileHeader(*y4m_parameters)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M file header");
+ return nullptr;
+ }
+
+ file->file_type_ = file_type;
+ return file;
+}
+
+bool FileWriter::WriteFrame(const DecoderBuffer& frame_buffer) {
+ if (file_type_ == kFileTypeY4m) {
+ const char kY4mFrameHeader[] = "FRAME\n";
+ if (fwrite(kY4mFrameHeader, 1, strlen(kY4mFrameHeader), file_) !=
+ strlen(kY4mFrameHeader)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M frame header");
+ return false;
+ }
+ }
+
+ const size_t pixel_size =
+ (frame_buffer.bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+ for (int plane_index = 0; plane_index < frame_buffer.NumPlanes();
+ ++plane_index) {
+ const int height = frame_buffer.displayed_height[plane_index];
+ const int width = frame_buffer.displayed_width[plane_index];
+ const int stride = frame_buffer.stride[plane_index];
+ const uint8_t* const plane_pointer = frame_buffer.plane[plane_index];
+ for (int row = 0; row < height; ++row) {
+ const uint8_t* const row_pointer = &plane_pointer[row * stride];
+ if (fwrite(row_pointer, pixel_size, width, file_) !=
+ static_cast<size_t>(width)) {
+ char error_string[256];
+ snprintf(error_string, sizeof(error_string),
+ "File write failed: %s (errno=%d)", strerror(errno), errno);
+ LIBGAV1_EXAMPLES_LOG_ERROR(error_string);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+// Writes Y4M file header to |file_| and returns true when successful.
+//
+// A Y4M file begins with a plaintext file signature of 'YUV4MPEG2 '.
+//
+// Following the signature is any number of optional parameters preceded by a
+// space. We always write:
+//
+// Width: 'W' followed by image width in pixels.
+// Height: 'H' followed by image height in pixels.
+// Frame Rate: 'F' followed by frames/second in the form numerator:denominator.
+// Interlacing: 'I' followed by 'p' for progressive.
+// Color space: 'C' followed by a string representation of the color space.
+//
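+// For example, 352x288 8-bit 4:2:0 video at 30 frames/second with an unknown
+// chroma sample position produces the header:
+//
+//   YUV4MPEG2 W352 H288 F30:1 Ip C420jpeg
+//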
+// More info here: https://wiki.multimedia.cx/index.php/YUV4MPEG2
+bool FileWriter::WriteY4mFileHeader(const Y4mParameters& y4m_parameters) {
+ std::string y4m_header = "YUV4MPEG2";
+ y4m_header += " W" + std::to_string(y4m_parameters.width);
+ y4m_header += " H" + std::to_string(y4m_parameters.height);
+ y4m_header += " F" + std::to_string(y4m_parameters.frame_rate_numerator) +
+ ":" + std::to_string(y4m_parameters.frame_rate_denominator);
+ y4m_header += " Ip C" + GetY4mColorSpaceString(y4m_parameters);
+ y4m_header += "\n";
+ return fwrite(y4m_header.c_str(), 1, y4m_header.length(), file_) ==
+ y4m_header.length();
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_WRITER_H_
+#define LIBGAV1_EXAMPLES_FILE_WRITER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "gav1/decoder_buffer.h"
+
+namespace libgav1 {
+
+// Frame based file writer class. Supports only Y4M (YUV4MPEG2) and RAW output.
+class FileWriter {
+ public:
+ enum FileType : uint8_t {
+ kFileTypeRaw,
+ kFileTypeY4m,
+ };
+
+ struct Y4mParameters {
+ Y4mParameters() = default;
+ Y4mParameters(size_t width, size_t height, size_t frame_rate_numerator,
+ size_t frame_rate_denominator,
+ ChromaSamplePosition chroma_sample_position,
+ ImageFormat image_format, size_t bitdepth)
+ : width(width),
+ height(height),
+ frame_rate_numerator(frame_rate_numerator),
+ frame_rate_denominator(frame_rate_denominator),
+ chroma_sample_position(chroma_sample_position),
+ image_format(image_format),
+ bitdepth(bitdepth) {}
+
+ Y4mParameters(const Y4mParameters& rhs) = default;
+ Y4mParameters& operator=(const Y4mParameters& rhs) = default;
+ Y4mParameters(Y4mParameters&& rhs) = default;
+ Y4mParameters& operator=(Y4mParameters&& rhs) = default;
+
+ size_t width = 0;
+ size_t height = 0;
+ size_t frame_rate_numerator = 30;
+ size_t frame_rate_denominator = 1;
+ ChromaSamplePosition chroma_sample_position = kChromaSamplePositionUnknown;
+ ImageFormat image_format = kImageFormatYuv420;
+ size_t bitdepth = 8;
+ };
+
+ // Opens |file_name|. When |file_type| is kFileTypeY4m the Y4M file header is
+ // written out to |file_| before this method returns.
+ //
+ // Returns a FileWriter instance after the file is opened successfully for
+ // kFileTypeRaw files, and after the Y4M file header bytes are written for
+ // kFileTypeY4m files. Returns nullptr upon failure.
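+ //
+ // Example (illustrative):
+ //   FileWriter::Y4mParameters y4m(352, 288, 30, 1,
+ //       kChromaSamplePositionUnknown, kImageFormatYuv420, 8);
+ //   auto writer =
+ //       FileWriter::Open("out.y4m", FileWriter::kFileTypeY4m, &y4m);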
+ static std::unique_ptr<FileWriter> Open(const std::string& file_name,
+ FileType type,
+ const Y4mParameters* y4m_parameters);
+
+ FileWriter() = delete;
+ FileWriter(const FileWriter&) = delete;
+ FileWriter& operator=(const FileWriter&) = delete;
+
+ // Not movable: the defaulted moves would leave both objects owning |file_|
+ // and lead to a double fclose(); transfer ownership through the
+ // std::unique_ptr<FileWriter> returned by Open() instead.
+ FileWriter(FileWriter&&) = delete;
+ FileWriter& operator=(FileWriter&&) = delete;
+
+ // Closes |file_|.
+ ~FileWriter();
+
+ // Writes the frame data in |frame_buffer| to |file_|. Returns true after
+ // successful write of |frame_buffer| data.
+ /*LIBGAV1_MUST_USE_RESULT*/ bool WriteFrame(
+ const DecoderBuffer& frame_buffer);
+
+ private:
+ explicit FileWriter(FILE* file) : file_(file) {}
+
+ bool WriteY4mFileHeader(const Y4mParameters& y4m_parameters);
+
+ FILE* file_ = nullptr;
+ FileType file_type_ = kFileTypeRaw;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_WRITER_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_writer.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <ostream>
+#include <string>
+#include <utility>
+
+#include "absl/memory/memory.h"
+#include "gav1/decoder_buffer.h"
+#include "gtest/gtest.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace {
+
+const char kExpectedY4mHeader8bit[] = "YUV4MPEG2 W352 H288 F30:1 Ip C420jpeg\n";
+const char kExpectedY4mHeader10bit[] = "YUV4MPEG2 W352 H288 F30:1 Ip C420p10\n";
+const char kExpectedY4mHeader8bitMonochrome[] =
+ "YUV4MPEG2 W352 H288 F30:1 Ip Cmono\n";
+const char kExpectedY4mHeader10bitMonochrome[] =
+ "YUV4MPEG2 W352 H288 F30:1 Ip Cmono10\n";
+
+// Note: These are non-const because DecoderBuffer.plane is non-const.
+char fake_plane0[] = "PLANE0\n";
+char fake_plane1[] = "PLANE1\n";
+char fake_plane2[] = "PLANE2\n";
+
+constexpr size_t kExpectedRawDataBufferCount = 3;
+const char* kExpectedRawData[kExpectedRawDataBufferCount] = {
+ fake_plane0, fake_plane1, fake_plane2};
+
+const char* const kExpectedRawDataMonochrome = fake_plane0;
+
+constexpr size_t kExpectedY4mDataBufferCount = 5;
+const char* const kExpectedY4mFileData8bit[kExpectedY4mDataBufferCount] = {
+ kExpectedY4mHeader8bit, "FRAME\n", fake_plane0, fake_plane1, fake_plane2};
+const char* const kExpectedY4mFileData10bit[kExpectedY4mDataBufferCount] = {
+ kExpectedY4mHeader10bit, "FRAME\n", fake_plane0, fake_plane1, fake_plane2};
+
+constexpr size_t kExpectedY4mDataBufferCountMonochrome = 3;
+const char* const
+ kExpectedY4mFileData8bitMonochrome[kExpectedY4mDataBufferCountMonochrome] =
+ {kExpectedY4mHeader8bitMonochrome, "FRAME\n", fake_plane0};
+const char* const
+ kExpectedY4mFileData10bitMonochrome[kExpectedY4mDataBufferCountMonochrome] =
+ {kExpectedY4mHeader10bitMonochrome, "FRAME\n", fake_plane0};
+
+// TODO(tomfinegan): Add a bitdepth arg, and test writing 10 bit frame buffers.
+std::unique_ptr<DecoderBuffer> GetFakeDecoderBuffer(ImageFormat image_format) {
+ auto buffer = absl::WrapUnique(new (std::nothrow) DecoderBuffer);
+ if (buffer == nullptr) return nullptr;
+ buffer->chroma_sample_position = kChromaSamplePositionUnknown;
+ buffer->image_format = image_format;
+ buffer->bitdepth = 8;
+ buffer->displayed_width[0] = static_cast<int>(strlen(fake_plane0));
+ buffer->displayed_width[1] = static_cast<int>(strlen(fake_plane1));
+ buffer->displayed_width[2] = static_cast<int>(strlen(fake_plane2));
+ buffer->displayed_height[0] = 1;
+ buffer->displayed_height[1] = 1;
+ buffer->displayed_height[2] = 1;
+ buffer->stride[0] = static_cast<int>(strlen(fake_plane0));
+ buffer->stride[1] = static_cast<int>(strlen(fake_plane1));
+ buffer->stride[2] = static_cast<int>(strlen(fake_plane2));
+ buffer->plane[0] = reinterpret_cast<uint8_t*>(fake_plane0);
+ buffer->plane[1] = reinterpret_cast<uint8_t*>(fake_plane1);
+ buffer->plane[2] = reinterpret_cast<uint8_t*>(fake_plane2);
+ buffer->user_private_data = 0;
+ buffer->buffer_private_data = nullptr;
+ return buffer;
+}
+
+TEST(FileWriterTest, FailOpen) {
+ EXPECT_EQ(FileWriter::Open(test_utils::GetTestOutputFilePath("fail_open"),
+ static_cast<FileWriter::FileType>(3), nullptr),
+ nullptr);
+ EXPECT_EQ(FileWriter::Open(test_utils::GetTestOutputFilePath("fail_open"),
+ FileWriter::kFileTypeY4m, nullptr),
+ nullptr);
+}
+
+struct FileWriterY4mHeaderTestParameters {
+ FileWriterY4mHeaderTestParameters() = default;
+ FileWriterY4mHeaderTestParameters(const FileWriterY4mHeaderTestParameters&) =
+ default;
+ FileWriterY4mHeaderTestParameters& operator=(
+ const FileWriterY4mHeaderTestParameters&) = default;
+ FileWriterY4mHeaderTestParameters(FileWriterY4mHeaderTestParameters&&) =
+ default;
+ FileWriterY4mHeaderTestParameters& operator=(
+ FileWriterY4mHeaderTestParameters&&) = default;
+ ~FileWriterY4mHeaderTestParameters() = default;
+
+ FileWriterY4mHeaderTestParameters(std::string file_name,
+ ChromaSamplePosition chroma_sample_position,
+ ImageFormat image_format, int bitdepth,
+ const char* expected_header_string)
+ : file_name(std::move(file_name)),
+ chroma_sample_position(chroma_sample_position),
+ image_format(image_format),
+ bitdepth(bitdepth),
+ expected_header_string(expected_header_string) {}
+ std::string file_name;
+ ChromaSamplePosition chroma_sample_position = kChromaSamplePositionUnknown;
+ ImageFormat image_format = kImageFormatMonochrome400;
+ int bitdepth = 8;
+ const char* expected_header_string = nullptr;
+};
+
+std::ostream& operator<<(std::ostream& stream,
+ const FileWriterY4mHeaderTestParameters& parameters) {
+ stream << "file_name=" << parameters.file_name << "\n"
+ << "chroma_sample_position=" << parameters.chroma_sample_position
+ << "\n"
+ << "image_format=" << parameters.image_format << "\n"
+ << "bitdepth=" << parameters.bitdepth << "\n"
+ << "expected_header_string=" << parameters.expected_header_string
+ << "\n";
+ return stream;
+}
+
+class FileWriterY4mHeaderTest
+ : public testing::TestWithParam<FileWriterY4mHeaderTestParameters> {
+ public:
+ FileWriterY4mHeaderTest() {
+ test_parameters_ = GetParam();
+ y4m_parameters_.width = 352;
+ y4m_parameters_.height = 288;
+ y4m_parameters_.frame_rate_numerator = 30;
+ y4m_parameters_.frame_rate_denominator = 1;
+ y4m_parameters_.chroma_sample_position =
+ test_parameters_.chroma_sample_position;
+ y4m_parameters_.image_format = test_parameters_.image_format;
+ y4m_parameters_.bitdepth = test_parameters_.bitdepth;
+ }
+ FileWriterY4mHeaderTest(const FileWriterY4mHeaderTest&) = delete;
+ FileWriterY4mHeaderTest& operator=(const FileWriterY4mHeaderTest&) = delete;
+ ~FileWriterY4mHeaderTest() override = default;
+
+ protected:
+ FileWriterY4mHeaderTestParameters test_parameters_;
+ FileWriter::Y4mParameters y4m_parameters_;
+};
+
+TEST_P(FileWriterY4mHeaderTest, WriteY4mHeader) {
+ const std::string file_name =
+ test_utils::GetTestOutputFilePath(test_parameters_.file_name);
+ EXPECT_NE(
+ FileWriter::Open(file_name, FileWriter::kFileTypeY4m, &y4m_parameters_),
+ nullptr);
+ std::string y4m_header_string;
+ test_utils::GetTestData(test_parameters_.file_name, true, &y4m_header_string);
+ EXPECT_STREQ(y4m_header_string.c_str(),
+ test_parameters_.expected_header_string);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ WriteY4mHeader, FileWriterY4mHeaderTest,
+ testing::Values(
+ FileWriterY4mHeaderTestParameters(
+ "y4m_header_8bit", kChromaSamplePositionUnknown, kImageFormatYuv420,
+ /*bitdepth=*/8, kExpectedY4mHeader8bit),
+ FileWriterY4mHeaderTestParameters("y4m_header_10bit",
+ kChromaSamplePositionUnknown,
+ kImageFormatYuv420, /*bitdepth=*/10,
+ kExpectedY4mHeader10bit),
+ FileWriterY4mHeaderTestParameters("y4m_header_8bit_monochrome",
+ kChromaSamplePositionUnknown,
+ kImageFormatMonochrome400,
+ /*bitdepth=*/8,
+ kExpectedY4mHeader8bitMonochrome),
+ FileWriterY4mHeaderTestParameters("y4m_header_10bit_monochrome",
+ kChromaSamplePositionUnknown,
+ kImageFormatMonochrome400,
+ /*bitdepth=*/10,
+ kExpectedY4mHeader10bitMonochrome)));
+
+struct FileWriterTestParameters {
+ FileWriterTestParameters() = default;
+ FileWriterTestParameters(const FileWriterTestParameters&) = default;
+ FileWriterTestParameters& operator=(const FileWriterTestParameters&) =
+ default;
+ FileWriterTestParameters(FileWriterTestParameters&&) = default;
+ FileWriterTestParameters& operator=(FileWriterTestParameters&&) = default;
+ ~FileWriterTestParameters() = default;
+
+ FileWriterTestParameters(std::string file_name,
+ FileWriter::FileType file_type,
+ const FileWriter::Y4mParameters* y4m_parameters,
+ size_t num_frames)
+ : file_name(std::move(file_name)),
+ file_type(file_type),
+ y4m_parameters(y4m_parameters),
+ num_frames(num_frames) {}
+ std::string file_name;
+ FileWriter::FileType file_type = FileWriter::kFileTypeRaw;
+ const FileWriter::Y4mParameters* y4m_parameters = nullptr;
+ size_t num_frames = 1;
+};
+
+std::ostream& operator<<(std::ostream& stream,
+ const ChromaSamplePosition& position) {
+ switch (position) {
+ case kChromaSamplePositionUnknown:
+ stream << "kCromaSamplePositionUnknown";
+ break;
+ case kChromaSamplePositionVertical:
+ stream << "kChromaSamplePositionVertical";
+ break;
+ case kChromaSamplePositionColocated:
+ stream << "kChromaSamplePositionColocated";
+ break;
+ case kChromaSamplePositionReserved:
+ stream << "kChromaSamplePositionReserved";
+ break;
+ }
+ return stream;
+}
+
+std::ostream& operator<<(std::ostream& stream,
+ const ImageFormat& image_format) {
+ switch (image_format) {
+ case kImageFormatMonochrome400:
+ stream << "kImageFormatMonochrome400";
+ break;
+ case kImageFormatYuv420:
+ stream << "kImageFormatYuv420";
+ break;
+ case kImageFormatYuv422:
+ stream << "kImageFormatYuv422";
+ break;
+ case kImageFormatYuv444:
+ stream << "kImageFormatYuv444";
+ break;
+ }
+ return stream;
+}
+
+std::ostream& operator<<(std::ostream& stream,
+ const FileWriter::Y4mParameters& parameters) {
+ stream << "y4m_parameters:\n"
+ << " width=" << parameters.width << "\n"
+ << " height=" << parameters.height << "\n"
+ << " frame_rate_numerator=" << parameters.frame_rate_numerator << "\n"
+ << " frame_rate_denominator=" << parameters.frame_rate_denominator
+ << "\n"
+ << " chroma_sample_position=" << parameters.chroma_sample_position
+ << "\n"
+ << " image_format=" << parameters.image_format << "\n"
+ << " bitdepth=" << parameters.bitdepth << "\n";
+
+ return stream;
+}
+
+std::ostream& operator<<(std::ostream& stream,
+ const FileWriterTestParameters& parameters) {
+ stream << "file_name=" << parameters.file_name << "\n"
+ << "file_type="
+ << (parameters.file_type == FileWriter::kFileTypeRaw ? "kFileTypeRaw"
+ : "kFileTypeY4m")
+ << "\n";
+ if (parameters.y4m_parameters != nullptr) {
+ stream << *parameters.y4m_parameters;
+ } else {
+ stream << "y4m_parameters: <nullptr>\n";
+ }
+ stream << "num_frames=" << parameters.num_frames << "\n";
+ return stream;
+}
+
+class FileWriterTestBase
+ : public testing::TestWithParam<FileWriterTestParameters> {
+ public:
+ FileWriterTestBase() = default;
+ FileWriterTestBase(const FileWriterTestBase&) = delete;
+ FileWriterTestBase& operator=(const FileWriterTestBase&) = delete;
+ ~FileWriterTestBase() override = default;
+
+ protected:
+ void SetUp() override { OpenWriter(GetParam()); }
+
+ void OpenWriter(const FileWriterTestParameters& parameters) {
+ parameters_ = parameters;
+ file_writer_ = FileWriter::Open(
+ test_utils::GetTestOutputFilePath(parameters.file_name),
+ parameters_.file_type, parameters_.y4m_parameters);
+ ASSERT_NE(file_writer_, nullptr);
+ }
+
+ void WriteFramesAndCloseFile() {
+ if (parameters_.y4m_parameters != nullptr) {
+ image_format_ = parameters_.y4m_parameters->image_format;
+ }
+ decoder_buffer_ = GetFakeDecoderBuffer(image_format_);
+ for (size_t frame_num = 0; frame_num < parameters_.num_frames;
+ ++frame_num) {
+ ASSERT_TRUE(file_writer_->WriteFrame(*decoder_buffer_));
+ }
+ file_writer_ = nullptr;
+ }
+
+ ImageFormat image_format_ = kImageFormatYuv420;
+ FileWriterTestParameters parameters_;
+ std::unique_ptr<FileWriter> file_writer_;
+ std::unique_ptr<DecoderBuffer> decoder_buffer_;
+};
+
+class FileWriterTestRaw : public FileWriterTestBase {
+ public:
+ FileWriterTestRaw() = default;
+ FileWriterTestRaw(const FileWriterTestRaw&) = delete;
+ FileWriterTestRaw& operator=(const FileWriterTestRaw&) = delete;
+ ~FileWriterTestRaw() override = default;
+
+ protected:
+ void SetUp() override { FileWriterTestBase::SetUp(); }
+};
+
+class FileWriterTestY4m : public FileWriterTestBase {
+ public:
+ FileWriterTestY4m() = default;
+ FileWriterTestY4m(const FileWriterTestY4m&) = delete;
+ FileWriterTestY4m& operator=(const FileWriterTestY4m&) = delete;
+ ~FileWriterTestY4m() override = default;
+
+ protected:
+ void SetUp() override { FileWriterTestBase::SetUp(); }
+};
+
+TEST_P(FileWriterTestRaw, WriteRawFrames) {
+ WriteFramesAndCloseFile();
+
+ std::string actual_file_data;
+ test_utils::GetTestData(parameters_.file_name, true, &actual_file_data);
+
+ std::string expected_file_data;
+ for (size_t frame_num = 0; frame_num < parameters_.num_frames; ++frame_num) {
+ if (image_format_ == kImageFormatMonochrome400) {
+ expected_file_data += kExpectedRawDataMonochrome;
+ } else {
+ for (const auto& buffer : kExpectedRawData) {
+ expected_file_data += buffer;
+ }
+ }
+ }
+
+ ASSERT_EQ(actual_file_data, expected_file_data);
+}
+
+TEST_P(FileWriterTestY4m, WriteY4mFrames) {
+ WriteFramesAndCloseFile();
+
+ std::string actual_file_data;
+ test_utils::GetTestData(parameters_.file_name, true, &actual_file_data);
+
+ std::string expected_file_data;
+ for (size_t frame_num = 0; frame_num < parameters_.num_frames; ++frame_num) {
+ if (image_format_ == kImageFormatMonochrome400) {
+ const char* const* expected_data_planes =
+ (parameters_.y4m_parameters->bitdepth == 8)
+ ? kExpectedY4mFileData8bitMonochrome
+ : kExpectedY4mFileData10bitMonochrome;
+ // Skip the Y4M file header "plane" after frame 0.
+ for (size_t buffer_num = (frame_num == 0) ? 0 : 1;
+ buffer_num < kExpectedY4mDataBufferCountMonochrome; ++buffer_num) {
+ expected_file_data += expected_data_planes[buffer_num];
+ }
+ } else {
+ const char* const* expected_data_planes =
+ (parameters_.y4m_parameters->bitdepth == 8)
+ ? kExpectedY4mFileData8bit
+ : kExpectedY4mFileData10bit;
+
+ // Skip the Y4M file header "plane" after frame 0.
+ for (size_t buffer_num = (frame_num == 0) ? 0 : 1;
+ buffer_num < kExpectedY4mDataBufferCount; ++buffer_num) {
+ expected_file_data += expected_data_planes[buffer_num];
+ }
+ }
+ }
+
+ ASSERT_EQ(actual_file_data, expected_file_data);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ WriteRawFrames, FileWriterTestRaw,
+ testing::Values(
+ FileWriterTestParameters("raw_frames_test_1frame",
+ FileWriter::kFileTypeRaw,
+ /*y4m_parameters=*/nullptr,
+ /*num_frames=*/1),
+ FileWriterTestParameters("raw_frames_test_5frames",
+ FileWriter::kFileTypeRaw,
+ /*y4m_parameters=*/nullptr,
+ /*num_frames=*/5),
+ FileWriterTestParameters("raw_frames_test_1frame_monochrome",
+ FileWriter::kFileTypeRaw,
+ /*y4m_parameters=*/nullptr,
+ /*num_frames=*/1),
+ FileWriterTestParameters("raw_frames_test_5frames_monochrome",
+ FileWriter::kFileTypeRaw,
+ /*y4m_parameters=*/nullptr,
+ /*num_frames=*/5)));
+
+const FileWriter::Y4mParameters kY4mParameters8Bit = {
+ 352, // width
+ 288, // height
+ 30, // frame_rate_numerator
+ 1, // frame_rate_denominator
+ kChromaSamplePositionUnknown,
+ kImageFormatYuv420,
+ 8 // bitdepth
+};
+
+const FileWriter::Y4mParameters kY4mParameters10Bit = {
+ 352, // width
+ 288, // height
+ 30, // frame_rate_numerator
+ 1, // frame_rate_denominator
+ kChromaSamplePositionUnknown,
+ kImageFormatYuv420,
+ 10 // bitdepth
+};
+
+const FileWriter::Y4mParameters kY4mParameters8BitMonochrome = {
+ 352, // width
+ 288, // height
+ 30, // frame_rate_numerator
+ 1, // frame_rate_denominator
+ kChromaSamplePositionUnknown,
+ kImageFormatMonochrome400,
+ 8 // bitdepth
+};
+
+const FileWriter::Y4mParameters kY4mParameters10BitMonochrome = {
+ 352, // width
+ 288, // height
+ 30, // frame_rate_numerator
+ 1, // frame_rate_denominator
+ kChromaSamplePositionUnknown,
+ kImageFormatMonochrome400,
+ 10 // bitdepth
+};
+
+INSTANTIATE_TEST_SUITE_P(
+ WriteY4mFrames, FileWriterTestY4m,
+ testing::Values(
+ FileWriterTestParameters("y4m_frames_test_8bit_1frame",
+ FileWriter::kFileTypeY4m, &kY4mParameters8Bit,
+ /*num_frames=*/1),
+ FileWriterTestParameters("y4m_frames_test_8bit_5frames",
+ FileWriter::kFileTypeY4m, &kY4mParameters8Bit,
+ /*num_frames=*/5),
+ FileWriterTestParameters("y4m_frames_test_10bit_1frame",
+ FileWriter::kFileTypeY4m, &kY4mParameters10Bit,
+ /*num_frames=*/1),
+ FileWriterTestParameters("y4m_frames_test_10bit_5frames",
+ FileWriter::kFileTypeY4m, &kY4mParameters10Bit,
+ /*num_frames=*/5),
+ FileWriterTestParameters("y4m_frames_test_8bit_1frame_monochrome",
+ FileWriter::kFileTypeY4m,
+ &kY4mParameters8BitMonochrome,
+ /*num_frames=*/1),
+ FileWriterTestParameters("y4m_frames_test_8bit_5frames_monochrome",
+ FileWriter::kFileTypeY4m,
+ &kY4mParameters8BitMonochrome,
+ /*num_frames=*/5),
+ FileWriterTestParameters("y4m_frames_test_10bit_1frame_monochrome",
+ FileWriter::kFileTypeY4m,
+ &kY4mParameters10BitMonochrome,
+ /*num_frames=*/1),
+ FileWriterTestParameters("y4m_frames_test_10bit_5frames_monochrome",
+ FileWriter::kFileTypeY4m,
+ &kY4mParameters10BitMonochrome,
+ /*num_frames=*/5)));
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <memory>
+#include <new>
+#include <vector>
+
+#include "absl/strings/numbers.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/file_writer.h"
+#include "gav1/decoder.h"
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+#endif
+
+namespace {
+
+struct Options {
+ const char* input_file_name = nullptr;
+ const char* output_file_name = nullptr;
+ const char* frame_timing_file_name = nullptr;
+ libgav1::FileWriter::FileType output_file_type =
+ libgav1::FileWriter::kFileTypeRaw;
+ uint8_t post_filter_mask = 0x1f;
+ int threads = 1;
+ bool frame_parallel = false;
+ bool output_all_layers = false;
+ int operating_point = 0;
+ int limit = 0;
+ int skip = 0;
+ int verbose = 0;
+};
+
+struct Timing {
+ absl::Duration input;
+ absl::Duration dequeue;
+};
+
+struct FrameTiming {
+ absl::Time enqueue;
+ absl::Time dequeue;
+};
+
+void PrintHelp(FILE* const fout) {
+ fprintf(fout,
+ "Usage: gav1_decode [options] <input file>"
+ " [-o <output file>]\n");
+ fprintf(fout, "\n");
+ fprintf(fout, "Options:\n");
+ fprintf(fout, " -h, --help This help message.\n");
+ fprintf(fout, " --threads <positive integer> (Default 1).\n");
+ fprintf(fout, " --frame_parallel.\n");
+ fprintf(fout,
+ " --limit <integer> Stop decoding after N frames (0 = all).\n");
+ fprintf(fout, " --skip <integer> Skip initial N frames (Default 0).\n");
+ fprintf(fout, " --version.\n");
+ fprintf(fout, " --y4m (Default false).\n");
+ fprintf(fout, " --raw (Default true).\n");
+ fprintf(fout, " -v logging verbosity, can be used multiple times.\n");
+ fprintf(fout, " --all_layers.\n");
+ fprintf(fout,
+ " --operating_point <integer between 0 and 31> (Default 0).\n");
+ fprintf(fout,
+ " --frame_timing <file> Output per-frame timing to <file> in tsv"
+ " format.\n Yields meaningful results only when frame parallel is"
+ " off.\n");
+ fprintf(fout, "\nAdvanced settings:\n");
+ fprintf(fout, " --post_filter_mask <integer> (Default 0x1f).\n");
+ fprintf(fout,
+ " Mask indicating which post filters should be applied to the"
+ " reconstructed\n frame. This may be given as octal, decimal or"
+ " hexadecimal. From LSB:\n");
+ fprintf(fout, " Bit 0: Loop filter (deblocking filter)\n");
+ fprintf(fout, " Bit 1: Cdef\n");
+ fprintf(fout, " Bit 2: SuperRes\n");
+ fprintf(fout, " Bit 3: Loop Restoration\n");
+ fprintf(fout, " Bit 4: Film Grain Synthesis\n");
+}
+
+void ParseOptions(int argc, char* argv[], Options* const options) {
+ for (int i = 1; i < argc; ++i) {
+ int32_t value;
+ if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
+ PrintHelp(stdout);
+ exit(EXIT_SUCCESS);
+ } else if (strcmp(argv[i], "-o") == 0) {
+ if (++i >= argc) {
+ fprintf(stderr, "Missing argument for '-o'\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->output_file_name = argv[i];
+ } else if (strcmp(argv[i], "--frame_timing") == 0) {
+ if (++i >= argc) {
+ fprintf(stderr, "Missing argument for '--frame_timing'\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->frame_timing_file_name = argv[i];
+ } else if (strcmp(argv[i], "--version") == 0) {
+ printf("gav1_decode, a libgav1 based AV1 decoder\n");
+ printf("libgav1 %s\n", libgav1::GetVersionString());
+ printf("max bitdepth: %d\n", libgav1::Decoder::GetMaxBitdepth());
+ printf("build configuration: %s\n", libgav1::GetBuildConfiguration());
+ exit(EXIT_SUCCESS);
+ } else if (strcmp(argv[i], "-v") == 0) {
+ ++options->verbose;
+ } else if (strcmp(argv[i], "--raw") == 0) {
+ options->output_file_type = libgav1::FileWriter::kFileTypeRaw;
+ } else if (strcmp(argv[i], "--y4m") == 0) {
+ options->output_file_type = libgav1::FileWriter::kFileTypeY4m;
+ } else if (strcmp(argv[i], "--threads") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value)) {
+ fprintf(stderr, "Missing/Invalid value for --threads.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->threads = value;
+ } else if (strcmp(argv[i], "--frame_parallel") == 0) {
+ options->frame_parallel = true;
+ } else if (strcmp(argv[i], "--all_layers") == 0) {
+ options->output_all_layers = true;
+ } else if (strcmp(argv[i], "--operating_point") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0 ||
+ value >= 32) {
+ fprintf(stderr, "Missing/Invalid value for --operating_point.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->operating_point = value;
+ } else if (strcmp(argv[i], "--limit") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+ fprintf(stderr, "Missing/Invalid value for --limit.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->limit = value;
+ } else if (strcmp(argv[i], "--skip") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+ fprintf(stderr, "Missing/Invalid value for --skip.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->skip = value;
+ } else if (strcmp(argv[i], "--post_filter_mask") == 0) {
+ errno = 0;
+ char* endptr = nullptr;
+ value = (++i >= argc) ? -1
+ // NOLINTNEXTLINE(runtime/deprecated_fn)
+ : static_cast<int32_t>(strtol(argv[i], &endptr, 0));
+ // Only the last 5 bits of the mask can be set.
+ if ((value & ~31) != 0 || errno != 0 || endptr == argv[i]) {
+ fprintf(stderr, "Invalid value for --post_filter_mask.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->post_filter_mask = value;
+ } else if (strlen(argv[i]) > 1 && argv[i][0] == '-') {
+ fprintf(stderr, "Unknown option '%s'!\n", argv[i]);
+ exit(EXIT_FAILURE);
+ } else {
+ if (options->input_file_name == nullptr) {
+ options->input_file_name = argv[i];
+ } else {
+ fprintf(stderr, "Found invalid parameter: \"%s\".\n", argv[i]);
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ }
+ }
+
+ if (argc < 2 || options->input_file_name == nullptr) {
+ fprintf(stderr, "Input file is required!\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+}
+
+using InputBuffer = std::vector<uint8_t>;
+
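+// A minimal recycling pool for input buffers: GetFreeBuffer() hands out (or
+// allocates) a buffer, and buffers come back through ReleaseInputBuffer(),
+// typically via the decoder's release_input_buffer callback below.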
+class InputBuffers {
+ public:
+ ~InputBuffers() {
+ for (auto buffer : free_buffers_) {
+ delete buffer;
+ }
+ }
+ InputBuffer* GetFreeBuffer() {
+ if (free_buffers_.empty()) {
+ auto* const buffer = new (std::nothrow) InputBuffer();
+ if (buffer == nullptr) {
+ fprintf(stderr, "Failed to create input buffer.\n");
+ return nullptr;
+ }
+ free_buffers_.push_back(buffer);
+ }
+ InputBuffer* const buffer = free_buffers_.front();
+ free_buffers_.pop_front();
+ return buffer;
+ }
+
+ void ReleaseInputBuffer(InputBuffer* buffer) {
+ free_buffers_.push_back(buffer);
+ }
+
+ private:
+ std::deque<InputBuffer*> free_buffers_;
+};
+
+void ReleaseInputBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* const input_buffers = static_cast<InputBuffers*>(callback_private_data);
+ input_buffers->ReleaseInputBuffer(
+ static_cast<InputBuffer*>(buffer_private_data));
+}
+
+int CloseFile(FILE* stream) { return (stream == nullptr) ? 0 : fclose(stream); }
+
+} // namespace
+
+int main(int argc, char* argv[]) {
+ Options options;
+ ParseOptions(argc, argv, &options);
+
+ auto file_reader =
+ libgav1::FileReaderFactory::OpenReader(options.input_file_name);
+ if (file_reader == nullptr) {
+ fprintf(stderr, "Cannot open input file!\n");
+ return EXIT_FAILURE;
+ }
+
+ std::unique_ptr<FILE, decltype(&CloseFile)> frame_timing_file(nullptr,
+ &CloseFile);
+ if (options.frame_timing_file_name != nullptr) {
+ frame_timing_file.reset(fopen(options.frame_timing_file_name, "wb"));
+ if (frame_timing_file == nullptr) {
+ fprintf(stderr, "Cannot open frame timing file '%s'!\n",
+ options.frame_timing_file_name);
+ return EXIT_FAILURE;
+ }
+ }
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+ // Reference frames + 1 scratch frame (for either the current frame or the
+ // film grain frame).
+ constexpr int kNumBuffers = 8 + 1;
+ std::unique_ptr<Gav1DecodeCVPixelBufferPool> cv_pixel_buffers =
+ Gav1DecodeCVPixelBufferPool::Create(kNumBuffers);
+ if (cv_pixel_buffers == nullptr) {
+ fprintf(stderr, "Cannot create Gav1DecodeCVPixelBufferPool!\n");
+ return EXIT_FAILURE;
+ }
+#endif
+
+ InputBuffers input_buffers;
+ libgav1::Decoder decoder;
+ libgav1::DecoderSettings settings;
+ settings.post_filter_mask = options.post_filter_mask;
+ settings.threads = options.threads;
+ settings.frame_parallel = options.frame_parallel;
+ settings.output_all_layers = options.output_all_layers;
+ settings.operating_point = options.operating_point;
+ settings.blocking_dequeue = true;
+ settings.callback_private_data = &input_buffers;
+ settings.release_input_buffer = ReleaseInputBuffer;
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+ settings.on_frame_buffer_size_changed = Gav1DecodeOnCVPixelBufferSizeChanged;
+ settings.get_frame_buffer = Gav1DecodeGetCVPixelBuffer;
+ settings.release_frame_buffer = Gav1DecodeReleaseCVPixelBuffer;
+ settings.callback_private_data = cv_pixel_buffers.get();
+ settings.release_input_buffer = nullptr;
+ // TODO(vigneshv): Support frame parallel mode to be used with
+ // CVPixelBufferPool.
+ settings.frame_parallel = false;
+#endif
+ libgav1::StatusCode status = decoder.Init(&settings);
+ if (status != libgav1::kStatusOk) {
+ fprintf(stderr, "Error initializing decoder: %s\n",
+ libgav1::GetErrorString(status));
+ return EXIT_FAILURE;
+ }
+
+ fprintf(stderr, "decoding '%s'\n", options.input_file_name);
+ if (options.verbose > 0 && options.skip > 0) {
+ fprintf(stderr, "skipping %d frame(s).\n", options.skip);
+ }
+
+ int input_frames = 0;
+ int decoded_frames = 0;
+ Timing timing = {};
+ std::vector<FrameTiming> frame_timing;
+ const bool record_frame_timing = frame_timing_file != nullptr;
+ std::unique_ptr<libgav1::FileWriter> file_writer;
+ InputBuffer* input_buffer = nullptr;
+ bool limit_reached = false;
+ bool dequeue_finished = false;
+ const absl::Time decode_loop_start = absl::Now();
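+ // Decode loop: read one temporal unit when none is pending, enqueue until
+ // the decoder returns kStatusTryAgain, then dequeue a frame and write it
+ // out. The loop exits once no input is pending, the input is exhausted (or
+ // the frame limit was hit), and a dequeue reports nothing left to output.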
+ do {
+ if (input_buffer == nullptr && !file_reader->IsEndOfFile() &&
+ !limit_reached) {
+ input_buffer = input_buffers.GetFreeBuffer();
+ if (input_buffer == nullptr) return EXIT_FAILURE;
+ const absl::Time read_start = absl::Now();
+ if (!file_reader->ReadTemporalUnit(input_buffer,
+ /*timestamp=*/nullptr)) {
+ fprintf(stderr, "Error reading input file.\n");
+ return EXIT_FAILURE;
+ }
+ timing.input += absl::Now() - read_start;
+ }
+
+ if (++input_frames <= options.skip) {
+ // input_buffer may be null here if the file hit EOF before |skip| frames;
+ // avoid pushing a null pointer into the free list.
+ if (input_buffer != nullptr) {
+ input_buffers.ReleaseInputBuffer(input_buffer);
+ input_buffer = nullptr;
+ }
+ continue;
+ }
+
+ if (input_buffer != nullptr) {
+ if (input_buffer->empty()) {
+ input_buffers.ReleaseInputBuffer(input_buffer);
+ input_buffer = nullptr;
+ continue;
+ }
+
+ const absl::Time enqueue_start = absl::Now();
+ status = decoder.EnqueueFrame(input_buffer->data(), input_buffer->size(),
+ static_cast<int64_t>(frame_timing.size()),
+ /*buffer_private_data=*/input_buffer);
+ if (status == libgav1::kStatusOk) {
+ if (options.verbose > 1) {
+ fprintf(stderr, "enqueue frame (length %zu)\n", input_buffer->size());
+ }
+ if (record_frame_timing) {
+ FrameTiming enqueue_time = {enqueue_start, absl::UnixEpoch()};
+ frame_timing.emplace_back(enqueue_time);
+ }
+
+ input_buffer = nullptr;
+ // Continue to enqueue frames until we get a kStatusTryAgain status.
+ continue;
+ }
+ if (status != libgav1::kStatusTryAgain) {
+ fprintf(stderr, "Unable to enqueue frame: %s\n",
+ libgav1::GetErrorString(status));
+ return EXIT_FAILURE;
+ }
+ }
+
+ const libgav1::DecoderBuffer* buffer;
+ status = decoder.DequeueFrame(&buffer);
+ if (status == libgav1::kStatusNothingToDequeue) {
+ dequeue_finished = true;
+ continue;
+ }
+ if (status != libgav1::kStatusOk) {
+ fprintf(stderr, "Unable to dequeue frame: %s\n",
+ libgav1::GetErrorString(status));
+ return EXIT_FAILURE;
+ }
+ dequeue_finished = false;
+ if (buffer == nullptr) continue;
+ ++decoded_frames;
+ if (options.verbose > 1) {
+ fprintf(stderr, "buffer dequeued\n");
+ }
+
+ if (record_frame_timing) {
+ frame_timing[static_cast<int>(buffer->user_private_data)].dequeue =
+ absl::Now();
+ }
+
+ if (options.output_file_name != nullptr && file_writer == nullptr) {
+ libgav1::FileWriter::Y4mParameters y4m_parameters;
+ y4m_parameters.width = buffer->displayed_width[0];
+ y4m_parameters.height = buffer->displayed_height[0];
+ y4m_parameters.frame_rate_numerator = file_reader->frame_rate();
+ y4m_parameters.frame_rate_denominator = file_reader->time_scale();
+ y4m_parameters.chroma_sample_position = buffer->chroma_sample_position;
+ y4m_parameters.image_format = buffer->image_format;
+ y4m_parameters.bitdepth = static_cast<size_t>(buffer->bitdepth);
+ file_writer = libgav1::FileWriter::Open(
+ options.output_file_name, options.output_file_type, &y4m_parameters);
+ if (file_writer == nullptr) {
+ fprintf(stderr, "Cannot open output file!\n");
+ return EXIT_FAILURE;
+ }
+ }
+
+ if (!limit_reached && file_writer != nullptr &&
+ !file_writer->WriteFrame(*buffer)) {
+ fprintf(stderr, "Error writing output file.\n");
+ return EXIT_FAILURE;
+ }
+ if (options.limit > 0 && options.limit == decoded_frames) {
+ limit_reached = true;
+ if (input_buffer != nullptr) {
+ input_buffers.ReleaseInputBuffer(input_buffer);
+ }
+ input_buffer = nullptr;
+ // Clear any in progress frames to ensure the output frame limit is
+ // respected.
+ decoder.SignalEOS();
+ }
+ } while (input_buffer != nullptr ||
+ (!file_reader->IsEndOfFile() && !limit_reached) ||
+ !dequeue_finished);
+ timing.dequeue = absl::Now() - decode_loop_start - timing.input;
+
+ if (record_frame_timing) {
+ // Note: in frame parallel mode, the timing will be skewed by the time
+ // spent queueing additional frames and waiting in the output queue behind
+ // previous frames, so the values reported won't be that meaningful.
+ fprintf(frame_timing_file.get(), "frame number\tdecode time us\n");
+ for (size_t i = 0; i < frame_timing.size(); ++i) {
+ const int decode_time_us = static_cast<int>(absl::ToInt64Microseconds(
+ frame_timing[i].dequeue - frame_timing[i].enqueue));
+ fprintf(frame_timing_file.get(), "%zu\t%d\n", i, decode_time_us);
+ }
+ }
+
+ if (options.verbose > 0) {
+ fprintf(stderr, "time to read input: %d us\n",
+ static_cast<int>(absl::ToInt64Microseconds(timing.input)));
+ const int decode_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(timing.dequeue));
+ const double decode_fps =
+ (decode_time_us == 0) ? 0.0 : 1.0e6 * decoded_frames / decode_time_us;
+ fprintf(stderr, "time to decode input: %d us (%d frames, %.2f fps)\n",
+ decode_time_us, decoded_frames, decode_fps);
+ }
+
+ return EXIT_SUCCESS;
+}
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+namespace {
+
+struct CFTypeDeleter {
+ void operator()(CFTypeRef cf) const { CFRelease(cf); }
+};
+
+using UniqueCFNumberRef =
+ std::unique_ptr<std::remove_pointer<CFNumberRef>::type, CFTypeDeleter>;
+
+using UniqueCFDictionaryRef =
+ std::unique_ptr<std::remove_pointer<CFDictionaryRef>::type, CFTypeDeleter>;
+
+} // namespace
+
+extern "C" {
+
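+// C-linkage trampolines registered as libgav1 frame buffer callbacks. Each
+// recovers the Gav1DecodeCVPixelBufferPool instance from
+// callback_private_data and forwards to the corresponding member function.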
+libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment) {
+ auto* buffer_pool =
+ static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+ return buffer_pool->OnCVPixelBufferSizeChanged(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment);
+}
+
+libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment,
+ libgav1::FrameBuffer* frame_buffer) {
+ auto* buffer_pool =
+ static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+ return buffer_pool->GetCVPixelBuffer(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment, frame_buffer);
+}
+
+void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* buffer_pool =
+ static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+ buffer_pool->ReleaseCVPixelBuffer(buffer_private_data);
+}
+
+} // extern "C"
+
+// static
+std::unique_ptr<Gav1DecodeCVPixelBufferPool>
+Gav1DecodeCVPixelBufferPool::Create(size_t num_buffers) {
+ std::unique_ptr<Gav1DecodeCVPixelBufferPool> buffer_pool(
+ new (std::nothrow) Gav1DecodeCVPixelBufferPool(num_buffers));
+ return buffer_pool;
+}
+
+Gav1DecodeCVPixelBufferPool::Gav1DecodeCVPixelBufferPool(size_t num_buffers)
+ : num_buffers_(static_cast<int>(num_buffers)) {}
+
+Gav1DecodeCVPixelBufferPool::~Gav1DecodeCVPixelBufferPool() {
+ CVPixelBufferPoolRelease(pool_);
+}
+
+libgav1::StatusCode Gav1DecodeCVPixelBufferPool::OnCVPixelBufferSizeChanged(
+ int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment) {
+ if (bitdepth != 8 || (image_format != libgav1::kImageFormatYuv420 &&
+ image_format != libgav1::kImageFormatMonochrome400)) {
+ fprintf(stderr,
+ "Only bitdepth 8, 4:2:0 videos are supported: bitdepth %d, "
+ "image_format: %d.\n",
+ bitdepth, image_format);
+ return libgav1::kStatusUnimplemented;
+ }
+
+ // stride_alignment must be a power of 2.
+ assert((stride_alignment & (stride_alignment - 1)) == 0);
+
+ // The possible keys for CVPixelBufferPool are:
+ // kCVPixelBufferPoolMinimumBufferCountKey
+ // kCVPixelBufferPoolMaximumBufferAgeKey
+ // kCVPixelBufferPoolAllocationThresholdKey
+ const void* pool_keys[] = {kCVPixelBufferPoolMinimumBufferCountKey};
+ const int min_buffer_count = 10;
+ UniqueCFNumberRef cf_min_buffer_count(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &min_buffer_count));
+ if (cf_min_buffer_count == nullptr) {
+ fprintf(stderr, "CFNumberCreate failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+ const void* pool_values[] = {cf_min_buffer_count.get()};
+ UniqueCFDictionaryRef pool_attributes(CFDictionaryCreate(
+ nullptr, pool_keys, pool_values, 1, &kCFTypeDictionaryKeyCallBacks,
+ &kCFTypeDictionaryValueCallBacks));
+ if (pool_attributes == nullptr) {
+ fprintf(stderr, "CFDictionaryCreate failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+
+ // The pixelBufferAttributes argument to CVPixelBufferPoolCreate() cannot be
+ // null and must contain the pixel format, width, and height, otherwise
+ // CVPixelBufferPoolCreate() fails with kCVReturnInvalidPixelBufferAttributes
+ // (-6682).
+
+ // I420: kCVPixelFormatType_420YpCbCr8PlanarFullRange (full range).
+ const int pixel_format = (image_format == libgav1::kImageFormatYuv420)
+ ? kCVPixelFormatType_420YpCbCr8PlanarFullRange
+ : kCVPixelFormatType_OneComponent8;
+ UniqueCFNumberRef cf_pixel_format(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &pixel_format));
+ UniqueCFNumberRef cf_width(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &width));
+ UniqueCFNumberRef cf_height(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &height));
+ UniqueCFNumberRef cf_left_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &left_border));
+ UniqueCFNumberRef cf_right_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &right_border));
+ UniqueCFNumberRef cf_top_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &top_border));
+ UniqueCFNumberRef cf_bottom_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &bottom_border));
+ UniqueCFNumberRef cf_stride_alignment(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &stride_alignment));
+
+ const void* buffer_keys[] = {
+ kCVPixelBufferPixelFormatTypeKey,
+ kCVPixelBufferWidthKey,
+ kCVPixelBufferHeightKey,
+ kCVPixelBufferExtendedPixelsLeftKey,
+ kCVPixelBufferExtendedPixelsRightKey,
+ kCVPixelBufferExtendedPixelsTopKey,
+ kCVPixelBufferExtendedPixelsBottomKey,
+ kCVPixelBufferBytesPerRowAlignmentKey,
+ };
+ const void* buffer_values[] = {
+ cf_pixel_format.get(), cf_width.get(),
+ cf_height.get(), cf_left_border.get(),
+ cf_right_border.get(), cf_top_border.get(),
+ cf_bottom_border.get(), cf_stride_alignment.get(),
+ };
+ UniqueCFDictionaryRef buffer_attributes(CFDictionaryCreate(
+ kCFAllocatorDefault, buffer_keys, buffer_values, 8,
+ &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks));
+ if (buffer_attributes == nullptr) {
+ fprintf(stderr, "CFDictionaryCreate of buffer_attributes failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+ CVPixelBufferPoolRef cv_pool;
+ CVReturn ret = CVPixelBufferPoolCreate(
+ /*allocator=*/nullptr, pool_attributes.get(), buffer_attributes.get(),
+ &cv_pool);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr, "CVPixelBufferPoolCreate failed: %d.\n",
+ static_cast<int>(ret));
+ return libgav1::kStatusOutOfMemory;
+ }
+ CVPixelBufferPoolRelease(pool_);
+ pool_ = cv_pool;
+ return libgav1::kStatusOk;
+}
+
+libgav1::StatusCode Gav1DecodeCVPixelBufferPool::GetCVPixelBuffer(
+ int bitdepth, libgav1::ImageFormat image_format, int /*width*/,
+ int /*height*/, int /*left_border*/, int /*right_border*/,
+ int /*top_border*/, int /*bottom_border*/, int /*stride_alignment*/,
+ libgav1::FrameBuffer* frame_buffer) {
+ static_cast<void>(bitdepth);
+ assert(bitdepth == 8 && (image_format == libgav1::kImageFormatYuv420 ||
+ image_format == libgav1::kImageFormatMonochrome400));
+ const bool is_monochrome =
+ (image_format == libgav1::kImageFormatMonochrome400);
+
+ // The dictionary must have kCVPixelBufferPoolAllocationThresholdKey,
+ // otherwise CVPixelBufferPoolCreatePixelBufferWithAuxAttributes() fails with
+ // kCVReturnWouldExceedAllocationThreshold (-6689).
+ UniqueCFNumberRef cf_num_buffers(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &num_buffers_));
+
+ const void* buffer_keys[] = {
+ kCVPixelBufferPoolAllocationThresholdKey,
+ };
+ const void* buffer_values[] = {
+ cf_num_buffers.get(),
+ };
+ UniqueCFDictionaryRef aux_attributes(CFDictionaryCreate(
+ kCFAllocatorDefault, buffer_keys, buffer_values, 1,
+ &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks));
+ if (aux_attributes == nullptr) {
+ fprintf(stderr, "CFDictionaryCreate of aux_attributes failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+
+ CVPixelBufferRef pixel_buffer;
+ CVReturn ret = CVPixelBufferPoolCreatePixelBufferWithAuxAttributes(
+ /*allocator=*/nullptr, pool_, aux_attributes.get(), &pixel_buffer);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr,
+ "CVPixelBufferPoolCreatePixelBufferWithAuxAttributes failed: %d.\n",
+ static_cast<int>(ret));
+ return libgav1::kStatusOutOfMemory;
+ }
+
+ ret = CVPixelBufferLockBaseAddress(pixel_buffer, /*lockFlags=*/0);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr, "CVPixelBufferLockBaseAddress failed: %d.\n",
+ static_cast<int>(ret));
+ CFRelease(pixel_buffer);
+ return libgav1::kStatusUnknownError;
+ }
+
+ // If the pixel format type is kCVPixelFormatType_OneComponent8, the pixel
+ // buffer is nonplanar (CVPixelBufferIsPlanar returns false and
+ // CVPixelBufferGetPlaneCount returns 0), but
+ // CVPixelBufferGetBytesPerRowOfPlane and CVPixelBufferGetBaseAddressOfPlane
+ // still work for plane index 0, even though the documentation says they
+ // return NULL for nonplanar pixel buffers.
+ frame_buffer->stride[0] =
+ static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 0));
+ frame_buffer->plane[0] = static_cast<uint8_t*>(
+ CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 0));
+ if (is_monochrome) {
+ frame_buffer->stride[1] = 0;
+ frame_buffer->stride[2] = 0;
+ frame_buffer->plane[1] = nullptr;
+ frame_buffer->plane[2] = nullptr;
+ } else {
+ frame_buffer->stride[1] =
+ static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 1));
+ frame_buffer->stride[2] =
+ static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 2));
+ frame_buffer->plane[1] = static_cast<uint8_t*>(
+ CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 1));
+ frame_buffer->plane[2] = static_cast<uint8_t*>(
+ CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 2));
+ }
+ frame_buffer->private_data = pixel_buffer;
+
+ return libgav1::kStatusOk;
+}
+
+void Gav1DecodeCVPixelBufferPool::ReleaseCVPixelBuffer(
+ void* buffer_private_data) {
+ auto const pixel_buffer = static_cast<CVPixelBufferRef>(buffer_private_data);
+ CVReturn ret =
+ CVPixelBufferUnlockBaseAddress(pixel_buffer, /*unlockFlags=*/0);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr, "%s:%d: CVPixelBufferUnlockBaseAddress failed: %d.\n",
+ __FILE__, __LINE__, static_cast<int>(ret));
+ abort();
+ }
+ CFRelease(pixel_buffer);
+}
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+#define LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+
+#include <CoreVideo/CoreVideo.h>
+
+#include <cstddef>
+#include <memory>
+
+#include "gav1/frame_buffer.h"
+
+extern "C" libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment);
+
+extern "C" libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment,
+ libgav1::FrameBuffer* frame_buffer);
+
+extern "C" void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+ void* buffer_private_data);
+
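+// Manages a CVPixelBufferPool whose CVPixelBuffers back libgav1 frame
+// buffers. The pool is (re)created in OnCVPixelBufferSizeChanged(), buffers
+// are handed out by GetCVPixelBuffer(), and each buffer is returned through
+// ReleaseCVPixelBuffer().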
+class Gav1DecodeCVPixelBufferPool {
+ public:
+ static std::unique_ptr<Gav1DecodeCVPixelBufferPool> Create(
+ size_t num_buffers);
+
+ // Not copyable or movable.
+ Gav1DecodeCVPixelBufferPool(const Gav1DecodeCVPixelBufferPool&) = delete;
+ Gav1DecodeCVPixelBufferPool& operator=(const Gav1DecodeCVPixelBufferPool&) =
+ delete;
+
+ ~Gav1DecodeCVPixelBufferPool();
+
+ libgav1::StatusCode OnCVPixelBufferSizeChanged(
+ int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment);
+
+ libgav1::StatusCode GetCVPixelBuffer(int bitdepth,
+ libgav1::ImageFormat image_format,
+ int width, int height, int left_border,
+ int right_border, int top_border,
+ int bottom_border, int stride_alignment,
+ libgav1::FrameBuffer* frame_buffer);
+ void ReleaseCVPixelBuffer(void* buffer_private_data);
+
+ private:
+ Gav1DecodeCVPixelBufferPool(size_t num_buffers);
+
+ CVPixelBufferPoolRef pool_ = nullptr;
+ const int num_buffers_;
+};
+
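+// Example wiring (a minimal sketch, not part of this header): the extern "C"
+// functions above are intended to be installed as libgav1 frame buffer
+// callbacks, with a pool instance as the callback private data. |settings| is
+// assumed to be a libgav1::DecoderSettings and |num_buffers| is illustrative.
+//
+//   std::unique_ptr<Gav1DecodeCVPixelBufferPool> pool =
+//       Gav1DecodeCVPixelBufferPool::Create(/*num_buffers=*/10);
+//   settings.on_frame_buffer_size_changed =
+//       Gav1DecodeOnCVPixelBufferSizeChanged;
+//   settings.get_frame_buffer = Gav1DecodeGetCVPixelBuffer;
+//   settings.release_frame_buffer = Gav1DecodeReleaseCVPixelBuffer;
+//   settings.callback_private_data = pool.get();
+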
+#endif // LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/ivf_parser.h"
+
+#include <cstdio>
+#include <cstring>
+
+#include "examples/file_reader_constants.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+size_t ReadLittleEndian16(const uint8_t* const buffer) {
+ size_t value = buffer[1] << 8;
+ value |= buffer[0];
+ return value;
+}
+
+size_t ReadLittleEndian32(const uint8_t* const buffer) {
+  // Cast before shifting so the shift is performed on an unsigned type;
+  // shifting a promoted int left by 24 would overflow for values >= 0x80.
+  size_t value = static_cast<size_t>(buffer[3]) << 24;
+ value |= buffer[2] << 16;
+ value |= buffer[1] << 8;
+ value |= buffer[0];
+ return value;
+}
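+
+// For illustration: with bytes {0x78, 0x56, 0x34, 0x12} in |buffer|,
+// ReadLittleEndian32() returns 0x12345678, and with bytes {0x34, 0x12},
+// ReadLittleEndian16() returns 0x1234.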
+
+} // namespace
+
+bool ParseIvfFileHeader(const uint8_t* const header_buffer,
+ IvfFileHeader* const ivf_file_header) {
+ if (header_buffer == nullptr || ivf_file_header == nullptr) return false;
+
+ if (memcmp(kIvfSignature, header_buffer, 4) != 0) {
+ return false;
+ }
+
+  // Verify the header version and length. A version mismatch is logged but
+  // is not treated as a fatal error.
+ const size_t ivf_header_version = ReadLittleEndian16(&header_buffer[4]);
+ if (ivf_header_version != kIvfHeaderVersion) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unexpected IVF version");
+ }
+
+ const size_t ivf_header_size = ReadLittleEndian16(&header_buffer[6]);
+ if (ivf_header_size != kIvfFileHeaderSize) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Invalid IVF file header size");
+ return false;
+ }
+
+ if (memcmp(kAv1FourCcLower, &header_buffer[8], 4) != 0 &&
+ memcmp(kAv1FourCcUpper, &header_buffer[8], 4) != 0) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported codec 4CC");
+ return false;
+ }
+
+ ivf_file_header->width = ReadLittleEndian16(&header_buffer[12]);
+ ivf_file_header->height = ReadLittleEndian16(&header_buffer[14]);
+ ivf_file_header->frame_rate_numerator =
+ ReadLittleEndian32(&header_buffer[16]);
+ ivf_file_header->frame_rate_denominator =
+ ReadLittleEndian32(&header_buffer[20]);
+
+ return true;
+}
+
+bool ParseIvfFrameHeader(const uint8_t* const header_buffer,
+ IvfFrameHeader* const ivf_frame_header) {
+ if (header_buffer == nullptr || ivf_frame_header == nullptr) return false;
+
+ ivf_frame_header->frame_size = ReadLittleEndian32(header_buffer);
+ if (ivf_frame_header->frame_size > kMaxTemporalUnitSize) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Temporal Unit size exceeds maximum");
+ return false;
+ }
+
+ ivf_frame_header->timestamp = ReadLittleEndian32(&header_buffer[4]);
+ const uint64_t timestamp_hi =
+ static_cast<uint64_t>(ReadLittleEndian32(&header_buffer[8])) << 32;
+ ivf_frame_header->timestamp |= timestamp_hi;
+
+ return true;
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_IVF_PARSER_H_
+#define LIBGAV1_EXAMPLES_IVF_PARSER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+namespace libgav1 {
+
+struct IvfFileHeader {
+ IvfFileHeader() = default;
+ IvfFileHeader(const IvfFileHeader& rhs) = default;
+ IvfFileHeader& operator=(const IvfFileHeader& rhs) = default;
+ IvfFileHeader(IvfFileHeader&& rhs) = default;
+ IvfFileHeader& operator=(IvfFileHeader&& rhs) = default;
+
+ size_t width = 0;
+ size_t height = 0;
+ size_t frame_rate_numerator = 0;
+ size_t frame_rate_denominator = 0;
+};
+
+struct IvfFrameHeader {
+ IvfFrameHeader() = default;
+ IvfFrameHeader(const IvfFrameHeader& rhs) = default;
+ IvfFrameHeader& operator=(const IvfFrameHeader& rhs) = default;
+ IvfFrameHeader(IvfFrameHeader&& rhs) = default;
+ IvfFrameHeader& operator=(IvfFrameHeader&& rhs) = default;
+
+ size_t frame_size = 0;
+ int64_t timestamp = 0;
+};
+
+bool ParseIvfFileHeader(const uint8_t* header_buffer,
+ IvfFileHeader* ivf_file_header);
+
+bool ParseIvfFrameHeader(const uint8_t* header_buffer,
+ IvfFrameHeader* ivf_frame_header);
+
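+// Example usage (an illustrative sketch; |file_data| is a hypothetical buffer
+// holding the start of an IVF file, and 32 is the standard IVF file header
+// size checked against kIvfFileHeaderSize from file_reader_constants.h):
+//
+//   IvfFileHeader file_header;
+//   if (!ParseIvfFileHeader(file_data, &file_header)) {
+//     // Not an IVF file carrying an AV1 stream.
+//   }
+//   IvfFrameHeader frame_header;
+//   if (!ParseIvfFrameHeader(file_data + 32, &frame_header)) {
+//     // Invalid or oversized temporal unit.
+//   }
+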
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_IVF_PARSER_H_
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_)
+ return()
+endif() # LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_
+set(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_ 1)
+
+if(NOT LIBGAV1_ENABLE_EXAMPLES)
+ macro(libgav1_add_examples_targets)
+
+ endmacro()
+ return()
+endif()
+
+set(libgav1_file_reader_sources "${libgav1_examples}/file_reader.cc"
+ "${libgav1_examples}/file_reader.h"
+ "${libgav1_examples}/file_reader_constants.cc"
+ "${libgav1_examples}/file_reader_constants.h"
+ "${libgav1_examples}/file_reader_factory.cc"
+ "${libgav1_examples}/file_reader_factory.h"
+ "${libgav1_examples}/file_reader_interface.h"
+ "${libgav1_examples}/ivf_parser.cc"
+ "${libgav1_examples}/ivf_parser.h"
+ "${libgav1_examples}/logging.h")
+
+set(libgav1_file_writer_sources "${libgav1_examples}/file_writer.cc"
+ "${libgav1_examples}/file_writer.h"
+ "${libgav1_examples}/logging.h")
+
+set(libgav1_decode_sources "${libgav1_examples}/gav1_decode.cc")
+
+macro(libgav1_add_examples_targets)
+ libgav1_add_library(NAME libgav1_file_reader TYPE OBJECT SOURCES
+ ${libgav1_file_reader_sources} DEFINES ${libgav1_defines}
+ INCLUDES ${libgav1_include_paths})
+
+ libgav1_add_library(NAME libgav1_file_writer TYPE OBJECT SOURCES
+ ${libgav1_file_writer_sources} DEFINES ${libgav1_defines}
+ INCLUDES ${libgav1_include_paths})
+
+ libgav1_add_executable(NAME
+ gav1_decode
+ SOURCES
+ ${libgav1_decode_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ ${libgav1_gtest_include_paths}
+ OBJLIB_DEPS
+ libgav1_file_reader
+ libgav1_file_writer
+ LIB_DEPS
+ absl::strings
+ absl::str_format_internal
+ absl::time
+ ${libgav1_dependency})
+endmacro()
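+
+# For context (a sketch; the actual call site is the project's top-level
+# CMakeLists.txt): once the helper functions are available, the build invokes
+#
+#   libgav1_add_examples_targets()
+#
+# which is a no-op when LIBGAV1_ENABLE_EXAMPLES is off, thanks to the empty
+# macro defined above.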
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_LOGGING_H_
+#define LIBGAV1_EXAMPLES_LOGGING_H_
+
+#include <cstddef>
+#include <cstdio>
+
+namespace libgav1 {
+namespace examples {
+
+#if !defined(LIBGAV1_EXAMPLES_ENABLE_LOGGING)
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
+constexpr const char* Basename(const char* file_name, size_t offset) {
+ return (offset == 0 || file_name[offset - 1] == '/' ||
+ file_name[offset - 1] == '\\')
+ ? file_name + offset
+ : Basename(file_name, offset - 1);
+}
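+
+// For example, given file_name = "examples/logging.h" and offset equal to the
+// string's length, Basename() returns a pointer to the "logging.h" suffix.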
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \
+ do { \
+ constexpr const char* libgav1_examples_basename = \
+ libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1); \
+ fprintf(stderr, "%s:%d (%s): %s.\n", libgav1_examples_basename, __LINE__, \
+ __func__, error_string); \
+ } while (false)
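+
+// For example (illustrative), a call such as
+//   LIBGAV1_EXAMPLES_LOG_ERROR("Unexpected IVF version");
+// writes a line of the form
+//   ivf_parser.cc:42 (ParseIvfFileHeader): Unexpected IVF version.
+// to stderr, where the file, line, and function names come from __FILE__,
+// __LINE__, and __func__ at the call site.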
+
+#else // !LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \
+ do { \
+ } while (false)
+
+#endif // LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+} // namespace examples
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_LOGGING_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/buffer_pool.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+
+namespace {
+
+// Copies the feature_enabled, feature_data, segment_id_pre_skip, and
+// last_active_segment_id fields of Segmentation.
+void CopySegmentationParameters(const Segmentation& from, Segmentation* to) {
+ memcpy(to->feature_enabled, from.feature_enabled,
+ sizeof(to->feature_enabled));
+ memcpy(to->feature_data, from.feature_data, sizeof(to->feature_data));
+ to->segment_id_pre_skip = from.segment_id_pre_skip;
+ to->last_active_segment_id = from.last_active_segment_id;
+}
+
+} // namespace
+
+RefCountedBuffer::RefCountedBuffer() = default;
+
+RefCountedBuffer::~RefCountedBuffer() = default;
+
+bool RefCountedBuffer::Realloc(int bitdepth, bool is_monochrome, int width,
+ int height, int subsampling_x, int subsampling_y,
+ int left_border, int right_border,
+ int top_border, int bottom_border) {
+  // YuvBuffer::Realloc() may invoke the get frame buffer callback, which
+  // would otherwise need to be thread safe. So we ensure that at most one
+  // Realloc() call is in progress at any given time.
+ std::lock_guard<std::mutex> lock(pool_->mutex_);
+ assert(!buffer_private_data_valid_);
+ if (!yuv_buffer_.Realloc(
+ bitdepth, is_monochrome, width, height, subsampling_x, subsampling_y,
+ left_border, right_border, top_border, bottom_border,
+ pool_->get_frame_buffer_, pool_->callback_private_data_,
+ &buffer_private_data_)) {
+ return false;
+ }
+ buffer_private_data_valid_ = true;
+ return true;
+}
+
+bool RefCountedBuffer::SetFrameDimensions(const ObuFrameHeader& frame_header) {
+ upscaled_width_ = frame_header.upscaled_width;
+ frame_width_ = frame_header.width;
+ frame_height_ = frame_header.height;
+ render_width_ = frame_header.render_width;
+ render_height_ = frame_header.render_height;
+ rows4x4_ = frame_header.rows4x4;
+ columns4x4_ = frame_header.columns4x4;
+ if (frame_header.refresh_frame_flags != 0 &&
+ !IsIntraFrame(frame_header.frame_type)) {
+ const int rows4x4_half = DivideBy2(rows4x4_);
+ const int columns4x4_half = DivideBy2(columns4x4_);
+ if (!reference_info_.Reset(rows4x4_half, columns4x4_half)) {
+ return false;
+ }
+ }
+ return segmentation_map_.Allocate(rows4x4_, columns4x4_);
+}
+
+void RefCountedBuffer::SetGlobalMotions(
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motions) {
+ for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+ static_assert(sizeof(global_motion_[ref].params) ==
+ sizeof(global_motions[ref].params),
+ "");
+ memcpy(global_motion_[ref].params, global_motions[ref].params,
+ sizeof(global_motion_[ref].params));
+ }
+}
+
+void RefCountedBuffer::SetFrameContext(const SymbolDecoderContext& context) {
+ frame_context_ = context;
+ frame_context_.ResetIntraFrameYModeCdf();
+ frame_context_.ResetCounters();
+}
+
+void RefCountedBuffer::GetSegmentationParameters(
+ Segmentation* segmentation) const {
+ CopySegmentationParameters(/*from=*/segmentation_, /*to=*/segmentation);
+}
+
+void RefCountedBuffer::SetSegmentationParameters(
+ const Segmentation& segmentation) {
+ CopySegmentationParameters(/*from=*/segmentation, /*to=*/&segmentation_);
+}
+
+void RefCountedBuffer::SetBufferPool(BufferPool* pool) { pool_ = pool; }
+
+void RefCountedBuffer::ReturnToBufferPool(RefCountedBuffer* ptr) {
+ ptr->pool_->ReturnUnusedBuffer(ptr);
+}
+
+BufferPool::BufferPool(
+ FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+ GetFrameBufferCallback get_frame_buffer,
+ ReleaseFrameBufferCallback release_frame_buffer,
+ void* callback_private_data) {
+ if (get_frame_buffer != nullptr) {
+ // on_frame_buffer_size_changed may be null.
+ assert(release_frame_buffer != nullptr);
+ on_frame_buffer_size_changed_ = on_frame_buffer_size_changed;
+ get_frame_buffer_ = get_frame_buffer;
+ release_frame_buffer_ = release_frame_buffer;
+ callback_private_data_ = callback_private_data;
+ } else {
+ on_frame_buffer_size_changed_ = OnInternalFrameBufferSizeChanged;
+ get_frame_buffer_ = GetInternalFrameBuffer;
+ release_frame_buffer_ = ReleaseInternalFrameBuffer;
+ callback_private_data_ = &internal_frame_buffers_;
+ }
+}
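+
+// For reference, a pool constructed with all-null callbacks falls back to the
+// internal frame buffer implementation, while one constructed with
+// application callbacks (hypothetical names below) uses those instead:
+//
+//   BufferPool internal_pool(nullptr, nullptr, nullptr, nullptr);
+//   BufferPool external_pool(AppOnFrameBufferSizeChanged, AppGetFrameBuffer,
+//                            AppReleaseFrameBuffer, &app_private_data);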
+
+BufferPool::~BufferPool() {
+ for (const auto* buffer : buffers_) {
+ if (buffer->in_use_) {
+ assert(false && "RefCountedBuffer still in use at destruction time.");
+      LIBGAV1_DLOG(ERROR,
+                   "RefCountedBuffer still in use at destruction time.");
+ }
+ delete buffer;
+ }
+}
+
+bool BufferPool::OnFrameBufferSizeChanged(int bitdepth,
+ Libgav1ImageFormat image_format,
+ int width, int height,
+ int left_border, int right_border,
+ int top_border, int bottom_border) {
+ if (on_frame_buffer_size_changed_ == nullptr) return true;
+ return on_frame_buffer_size_changed_(callback_private_data_, bitdepth,
+ image_format, width, height, left_border,
+ right_border, top_border, bottom_border,
+ /*stride_alignment=*/16) == kStatusOk;
+}
+
+RefCountedBufferPtr BufferPool::GetFreeBuffer() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ for (auto buffer : buffers_) {
+ if (!buffer->in_use_) {
+ buffer->in_use_ = true;
+ buffer->progress_row_ = -1;
+ buffer->frame_state_ = kFrameStateUnknown;
+ buffer->hdr_cll_set_ = false;
+ buffer->hdr_mdcv_set_ = false;
+ buffer->itut_t35_set_ = false;
+ lock.unlock();
+ return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
+ }
+ }
+ lock.unlock();
+ auto* const buffer = new (std::nothrow) RefCountedBuffer();
+ if (buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate a new reference counted buffer.");
+ return RefCountedBufferPtr();
+ }
+ buffer->SetBufferPool(this);
+ buffer->in_use_ = true;
+ buffer->progress_row_ = -1;
+ buffer->frame_state_ = kFrameStateUnknown;
+ lock.lock();
+ const bool ok = buffers_.push_back(buffer);
+ lock.unlock();
+ if (!ok) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Failed to push the new reference counted buffer into the vector.");
+ delete buffer;
+ return RefCountedBufferPtr();
+ }
+ return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
+}
+
+void BufferPool::Abort() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ for (auto buffer : buffers_) {
+ if (buffer->in_use_) {
+ buffer->Abort();
+ }
+ }
+}
+
+void BufferPool::ReturnUnusedBuffer(RefCountedBuffer* buffer) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ assert(buffer->in_use_);
+ buffer->in_use_ = false;
+ if (buffer->buffer_private_data_valid_) {
+ release_frame_buffer_(callback_private_data_, buffer->buffer_private_data_);
+ buffer->buffer_private_data_valid_ = false;
+ }
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_BUFFER_POOL_H_
+#define LIBGAV1_SRC_BUFFER_POOL_H_
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <cstdint>
+#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/dsp/common.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/frame_buffer.h"
+#include "src/internal_frame_buffer_list.h"
+#include "src/symbol_decoder_context.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+#include "src/utils/vector.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+class BufferPool;
+
+enum FrameState : uint8_t {
+ kFrameStateUnknown,
+ kFrameStateStarted,
+ kFrameStateParsed,
+ kFrameStateDecoded
+};
+
+// A reference-counted frame buffer. Clients should access it via
+// RefCountedBufferPtr, which manages reference counting transparently.
+// The alignment requirement is due to the SymbolDecoderContext member
+// frame_context_.
+class RefCountedBuffer : public MaxAlignedAllocable {
+ public:
+ // Not copyable or movable.
+ RefCountedBuffer(const RefCountedBuffer&) = delete;
+ RefCountedBuffer& operator=(const RefCountedBuffer&) = delete;
+
+  // Allocates the YUV buffer. Returns true on success and false on failure.
+  // This function ensures the thread safety of the |get_frame_buffer_| call,
+  // i.e., only one |get_frame_buffer_| call will happen at a given time.
+  // TODO(b/142583029): In frame parallel mode, we can require the callbacks to
+  // be thread safe, so that this function no longer needs to provide thread
+  // safety and applications can use fine-grained locks.
+ //
+ // * |width| and |height| are the image dimensions in pixels.
+ // * |subsampling_x| and |subsampling_y| (either 0 or 1) specify the
+ // subsampling of the width and height of the chroma planes, respectively.
+ // * |left_border|, |right_border|, |top_border|, and |bottom_border| are
+ // the sizes (in pixels) of the borders on the left, right, top, and
+ // bottom sides, respectively.
+ //
+ // NOTE: The strides are a multiple of 16. Since the first row in each plane
+ // is 16-byte aligned, subsequent rows are also 16-byte aligned.
+ bool Realloc(int bitdepth, bool is_monochrome, int width, int height,
+ int subsampling_x, int subsampling_y, int left_border,
+ int right_border, int top_border, int bottom_border);
+
+ YuvBuffer* buffer() { return &yuv_buffer_; }
+
+ // Returns the buffer private data set by the get frame buffer callback when
+ // it allocated the YUV buffer.
+ void* buffer_private_data() const {
+ assert(buffer_private_data_valid_);
+ return buffer_private_data_;
+ }
+
+ // NOTE: In the current frame, this is the frame_type syntax element in the
+ // frame header. In a reference frame, this implements the RefFrameType array
+ // in the spec.
+ FrameType frame_type() const { return frame_type_; }
+ void set_frame_type(FrameType frame_type) { frame_type_ = frame_type; }
+
+ // The sample position for subsampled streams. This is the
+ // chroma_sample_position syntax element in the sequence header.
+ //
+ // NOTE: The decoder does not use chroma_sample_position, but it needs to be
+ // passed on to the client in DecoderBuffer.
+ ChromaSamplePosition chroma_sample_position() const {
+ return chroma_sample_position_;
+ }
+ void set_chroma_sample_position(ChromaSamplePosition chroma_sample_position) {
+ chroma_sample_position_ = chroma_sample_position;
+ }
+
+ // Whether the frame can be used as show existing frame in the future.
+ bool showable_frame() const { return showable_frame_; }
+ void set_showable_frame(bool value) { showable_frame_ = value; }
+
+ // Sets upscaled_width_, frame_width_, frame_height_, render_width_,
+ // render_height_, rows4x4_ and columns4x4_ from the corresponding fields
+ // in frame_header. Allocates reference_info_.motion_field_reference_frame,
+ // reference_info_.motion_field_mv_, and segmentation_map_. Returns true on
+ // success, false on failure.
+ bool SetFrameDimensions(const ObuFrameHeader& frame_header);
+
+ int32_t upscaled_width() const { return upscaled_width_; }
+ int32_t frame_width() const { return frame_width_; }
+ int32_t frame_height() const { return frame_height_; }
+ // RenderWidth() and RenderHeight() return the render size, which is a hint
+ // to the application about the desired display size.
+ int32_t render_width() const { return render_width_; }
+ int32_t render_height() const { return render_height_; }
+ int32_t rows4x4() const { return rows4x4_; }
+ int32_t columns4x4() const { return columns4x4_; }
+
+ int spatial_id() const { return spatial_id_; }
+ void set_spatial_id(int value) { spatial_id_ = value; }
+ int temporal_id() const { return temporal_id_; }
+ void set_temporal_id(int value) { temporal_id_ = value; }
+
+ ObuMetadataHdrCll hdr_cll() const { return hdr_cll_; }
+ void set_hdr_cll(const ObuMetadataHdrCll& hdr_cll) {
+ hdr_cll_set_ = true;
+ hdr_cll_ = hdr_cll;
+ }
+ bool hdr_cll_set() const { return hdr_cll_set_; }
+
+ ObuMetadataHdrMdcv hdr_mdcv() const { return hdr_mdcv_; }
+ void set_hdr_mdcv(const ObuMetadataHdrMdcv& hdr_mdcv) {
+ hdr_mdcv_set_ = true;
+ hdr_mdcv_ = hdr_mdcv;
+ }
+ bool hdr_mdcv_set() const { return hdr_mdcv_set_; }
+
+ ObuMetadataItutT35 itut_t35() const { return itut_t35_; }
+ bool set_itut_t35(const ObuMetadataItutT35& itut_t35,
+ const uint8_t* const payload) {
+ itut_t35_ = itut_t35;
+ if (itut_t35.payload_size > 0) {
+ if (!itut_t35_payload_.Resize(itut_t35.payload_size)) return false;
+ memcpy(itut_t35_payload_.get(), payload, itut_t35.payload_size);
+ itut_t35_.payload_bytes = itut_t35_payload_.get();
+ } else {
+ itut_t35_.payload_bytes = nullptr;
+ }
+ itut_t35_set_ = true;
+ return true;
+ }
+ bool itut_t35_set() const { return itut_t35_set_; }
+
+ SegmentationMap* segmentation_map() { return &segmentation_map_; }
+ const SegmentationMap* segmentation_map() const { return &segmentation_map_; }
+
+ // Only the |params| field of each GlobalMotion struct should be used.
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& GlobalMotions()
+ const {
+ return global_motion_;
+ }
+ // Saves the GlobalMotion array. Only the |params| field of each GlobalMotion
+ // struct is saved.
+ void SetGlobalMotions(
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motions);
+
+ // Returns the saved CDF tables.
+ const SymbolDecoderContext& FrameContext() const { return frame_context_; }
+ // Saves the CDF tables. The intra_frame_y_mode_cdf table is reset to the
+ // default. The last entry in each table, representing the symbol count for
+ // that context, is set to 0.
+ void SetFrameContext(const SymbolDecoderContext& context);
+
+ const std::array<int8_t, kNumReferenceFrameTypes>& loop_filter_ref_deltas()
+ const {
+ return loop_filter_ref_deltas_;
+ }
+ const std::array<int8_t, kLoopFilterMaxModeDeltas>& loop_filter_mode_deltas()
+ const {
+ return loop_filter_mode_deltas_;
+ }
+ // Saves the ref_deltas and mode_deltas arrays in loop_filter.
+ void SetLoopFilterDeltas(const LoopFilter& loop_filter) {
+ loop_filter_ref_deltas_ = loop_filter.ref_deltas;
+ loop_filter_mode_deltas_ = loop_filter.mode_deltas;
+ }
+
+ // Copies the saved values of the following fields to the Segmentation
+ // struct: feature_enabled, feature_data, segment_id_pre_skip, and
+ // last_active_segment_id. The other fields are left unchanged.
+ void GetSegmentationParameters(Segmentation* segmentation) const;
+ // Saves the feature_enabled, feature_data, segment_id_pre_skip, and
+ // last_active_segment_id fields of the Segmentation struct.
+ void SetSegmentationParameters(const Segmentation& segmentation);
+
+ const FilmGrainParams& film_grain_params() const {
+ return film_grain_params_;
+ }
+ void set_film_grain_params(const FilmGrainParams& params) {
+ film_grain_params_ = params;
+ }
+
+ const ReferenceInfo* reference_info() const { return &reference_info_; }
+ ReferenceInfo* reference_info() { return &reference_info_; }
+
+ // This will wake up the WaitUntil*() functions and make them return false.
+ void Abort() {
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ abort_ = true;
+ }
+ parsed_condvar_.notify_all();
+ decoded_condvar_.notify_all();
+ progress_row_condvar_.notify_all();
+ }
+
+ void SetFrameState(FrameState frame_state) {
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ frame_state_ = frame_state;
+ }
+ if (frame_state == kFrameStateParsed) {
+ parsed_condvar_.notify_all();
+ } else if (frame_state == kFrameStateDecoded) {
+ decoded_condvar_.notify_all();
+ progress_row_condvar_.notify_all();
+ }
+ }
+
+ // Sets the progress of this frame to |progress_row| and notifies any threads
+ // that may be waiting on rows <= |progress_row|.
+ void SetProgress(int progress_row) {
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (progress_row_ >= progress_row) return;
+ progress_row_ = progress_row;
+ }
+ progress_row_condvar_.notify_all();
+ }
+
+ void MarkFrameAsStarted() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (frame_state_ != kFrameStateUnknown) return;
+ frame_state_ = kFrameStateStarted;
+ }
+
+ // All the WaitUntil* functions will return true if the desired wait state was
+ // reached successfully. If the return value is false, then the caller must
+ // assume that the wait was not successful and try to stop whatever they are
+ // doing as early as possible.
+
+ // Waits until the frame has been parsed.
+ bool WaitUntilParsed() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ while (frame_state_ < kFrameStateParsed && !abort_) {
+ parsed_condvar_.wait(lock);
+ }
+ return !abort_;
+ }
+
+ // Waits until the |progress_row| has been decoded (as indicated either by
+ // |progress_row_| or |frame_state_|). |progress_row_cache| must not be
+ // nullptr and will be populated with the value of |progress_row_| after the
+ // wait.
+ //
+ // Typical usage of |progress_row_cache| is as follows:
+ // * Initialize |*progress_row_cache| to INT_MIN.
+ // * Call WaitUntil only if |*progress_row_cache| < |progress_row|.
+ bool WaitUntil(int progress_row, int* progress_row_cache) {
+ // If |progress_row| is negative, it means that the wait is on the top
+ // border to be available. The top border will be available when row 0 has
+ // been decoded. So we can simply wait on row 0 instead.
+ progress_row = std::max(progress_row, 0);
+ std::unique_lock<std::mutex> lock(mutex_);
+ while (progress_row_ < progress_row && frame_state_ != kFrameStateDecoded &&
+ !abort_) {
+ progress_row_condvar_.wait(lock);
+ }
+ // Once |frame_state_| reaches kFrameStateDecoded, |progress_row_| may no
+ // longer be updated. So we set |*progress_row_cache| to INT_MAX in that
+ // case.
+ *progress_row_cache =
+ (frame_state_ != kFrameStateDecoded) ? progress_row_ : INT_MAX;
+ return !abort_;
+ }
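+
+  // A minimal sketch of the caching pattern described above (hypothetical
+  // caller code; |rows_needed| is illustrative):
+  //
+  //   int progress_row_cache = INT_MIN;
+  //   for (const int row : rows_needed) {
+  //     if (progress_row_cache < row &&
+  //         !buffer->WaitUntil(row, &progress_row_cache)) {
+  //       return false;  // The wait was aborted; stop as early as possible.
+  //     }
+  //   }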
+
+ // Waits until the entire frame has been decoded.
+ bool WaitUntilDecoded() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ while (frame_state_ != kFrameStateDecoded && !abort_) {
+ decoded_condvar_.wait(lock);
+ }
+ return !abort_;
+ }
+
+ private:
+ friend class BufferPool;
+
+ // Methods for BufferPool:
+ RefCountedBuffer();
+ ~RefCountedBuffer();
+ void SetBufferPool(BufferPool* pool);
+ static void ReturnToBufferPool(RefCountedBuffer* ptr);
+
+ BufferPool* pool_ = nullptr;
+ bool buffer_private_data_valid_ = false;
+ void* buffer_private_data_ = nullptr;
+ YuvBuffer yuv_buffer_;
+ bool in_use_ = false; // Only used by BufferPool.
+
+ std::mutex mutex_;
+  FrameState frame_state_ LIBGAV1_GUARDED_BY(mutex_) = kFrameStateUnknown;
+  int progress_row_ LIBGAV1_GUARDED_BY(mutex_) = -1;
+ // Signaled when progress_row_ is updated or when frame_state_ is set to
+ // kFrameStateDecoded.
+ std::condition_variable progress_row_condvar_;
+ // Signaled when the frame state is set to kFrameStateParsed.
+ std::condition_variable parsed_condvar_;
+ // Signaled when the frame state is set to kFrameStateDecoded.
+ std::condition_variable decoded_condvar_;
+  bool abort_ LIBGAV1_GUARDED_BY(mutex_) = false;
+
+ FrameType frame_type_ = kFrameKey;
+ ChromaSamplePosition chroma_sample_position_ = kChromaSamplePositionUnknown;
+ bool showable_frame_ = false;
+
+ int32_t upscaled_width_ = 0;
+ int32_t frame_width_ = 0;
+ int32_t frame_height_ = 0;
+ int32_t render_width_ = 0;
+ int32_t render_height_ = 0;
+ int32_t columns4x4_ = 0;
+ int32_t rows4x4_ = 0;
+ int spatial_id_ = 0;
+ int temporal_id_ = 0;
+
+ ObuMetadataHdrCll hdr_cll_ = {};
+ bool hdr_cll_set_ = false; // Set to true when set_hdr_cll() is called.
+ ObuMetadataHdrMdcv hdr_mdcv_ = {};
+ bool hdr_mdcv_set_ = false; // Set to true when set_hdr_mdcv() is called.
+ ObuMetadataItutT35 itut_t35_ = {};
+ DynamicBuffer<uint8_t> itut_t35_payload_;
+ bool itut_t35_set_ = false; // Set to true when set_itut_t35() is called.
+
+ // segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array.
+ SegmentationMap segmentation_map_;
+
+ // Only the |params| field of each GlobalMotion struct is used.
+ // global_motion_[0] (for kReferenceFrameIntra) is not used.
+ std::array<GlobalMotion, kNumReferenceFrameTypes> global_motion_ = {};
+ SymbolDecoderContext frame_context_;
+ std::array<int8_t, kNumReferenceFrameTypes> loop_filter_ref_deltas_;
+ std::array<int8_t, kLoopFilterMaxModeDeltas> loop_filter_mode_deltas_;
+ // Only the feature_enabled, feature_data, segment_id_pre_skip, and
+ // last_active_segment_id fields of the Segmentation struct are used.
+ //
+ // Note: The spec only requires that we save feature_enabled and
+ // feature_data. Since segment_id_pre_skip and last_active_segment_id depend
+ // on feature_enabled only, we also save their values as an optimization.
+ Segmentation segmentation_ = {};
+ FilmGrainParams film_grain_params_ = {};
+ ReferenceInfo reference_info_;
+};
+
+// RefCountedBufferPtr contains a reference to a RefCountedBuffer.
+//
+// Note: For simplicity, RefCountedBufferPtr is implemented as a
+// std::shared_ptr<RefCountedBuffer>. This requires a heap allocation of the
+// control block for std::shared_ptr. To avoid that heap allocation, we can
+// add a |ref_count_| field to RefCountedBuffer and implement a custom
+// RefCountedBufferPtr class.
+using RefCountedBufferPtr = std::shared_ptr<RefCountedBuffer>;
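+
+// An illustrative sketch of the reference counting behavior (exercised by the
+// BufferPoolTest.RefCountedBufferPtr test):
+//
+//   RefCountedBufferPtr frame = pool.GetFreeBuffer();  // use_count() == 1
+//   RefCountedBufferPtr alias = frame;                 // use_count() == 2
+//   alias = nullptr;                                   // use_count() == 1
+//   frame = nullptr;  // The buffer is returned to the pool.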
+
+// BufferPool maintains a pool of RefCountedBuffers.
+class BufferPool {
+ public:
+ BufferPool(FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+ GetFrameBufferCallback get_frame_buffer,
+ ReleaseFrameBufferCallback release_frame_buffer,
+ void* callback_private_data);
+
+ // Not copyable or movable.
+ BufferPool(const BufferPool&) = delete;
+ BufferPool& operator=(const BufferPool&) = delete;
+
+ ~BufferPool();
+
+ LIBGAV1_MUST_USE_RESULT bool OnFrameBufferSizeChanged(
+ int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border);
+
+ // Finds a free buffer in the buffer pool and returns a reference to the free
+ // buffer. If there is no free buffer, returns a null pointer. This function
+ // is thread safe.
+ RefCountedBufferPtr GetFreeBuffer();
+
+ // Aborts all the buffers that are in use.
+ void Abort();
+
+ private:
+ friend class RefCountedBuffer;
+
+ // Returns an unused buffer to the buffer pool. Called by RefCountedBuffer
+ // only. This function is thread safe.
+ void ReturnUnusedBuffer(RefCountedBuffer* buffer);
+
+ // Used to make the following functions thread safe: GetFreeBuffer(),
+ // ReturnUnusedBuffer(), RefCountedBuffer::Realloc().
+ std::mutex mutex_;
+
+ // Storing a RefCountedBuffer object in a Vector is complicated because of the
+ // copy/move semantics. So the simplest way around that is to store a list of
+ // pointers in the vector.
+ Vector<RefCountedBuffer*> buffers_ LIBGAV1_GUARDED_BY(mutex_);
+ InternalFrameBufferList internal_frame_buffers_;
+
+ // Frame buffer callbacks.
+ FrameBufferSizeChangedCallback on_frame_buffer_size_changed_;
+ GetFrameBufferCallback get_frame_buffer_;
+ ReleaseFrameBufferCallback release_frame_buffer_;
+ // Private data associated with the frame buffer callbacks.
+ void* callback_private_data_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_BUFFER_POOL_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/buffer_pool.h"
+
+#include <climits>
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <tuple>
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "src/frame_buffer_utils.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/frame_buffer.h"
+#include "src/internal_frame_buffer_list.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(BufferPoolTest, RefCountedBufferPtr) {
+ InternalFrameBufferList buffer_list;
+ BufferPool buffer_pool(OnInternalFrameBufferSizeChanged,
+ GetInternalFrameBuffer, ReleaseInternalFrameBuffer,
+ &buffer_list);
+ RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+ EXPECT_NE(buffer_ptr, nullptr);
+ EXPECT_EQ(buffer_ptr.use_count(), 1);
+
+ RefCountedBufferPtr buffer_ptr2 = buffer_ptr;
+ RefCountedBufferPtr buffer_ptr3 = buffer_ptr;
+ EXPECT_EQ(buffer_ptr.use_count(), 3);
+ EXPECT_EQ(buffer_ptr2.use_count(), 3);
+ EXPECT_EQ(buffer_ptr3.use_count(), 3);
+
+ buffer_ptr2 = nullptr;
+ EXPECT_EQ(buffer_ptr.use_count(), 2);
+ EXPECT_EQ(buffer_ptr2.use_count(), 0);
+ EXPECT_EQ(buffer_ptr3.use_count(), 2);
+
+ RefCountedBufferPtr buffer_ptr4 = std::move(buffer_ptr);
+ EXPECT_EQ(buffer_ptr.use_count(), 0);
+ EXPECT_EQ(buffer_ptr2.use_count(), 0);
+ EXPECT_EQ(buffer_ptr3.use_count(), 2);
+ EXPECT_EQ(buffer_ptr4.use_count(), 2);
+}
+
+TEST(RefCountedBufferTest, SetFrameDimensions) {
+ InternalFrameBufferList buffer_list;
+ BufferPool buffer_pool(OnInternalFrameBufferSizeChanged,
+ GetInternalFrameBuffer, ReleaseInternalFrameBuffer,
+ &buffer_list);
+ RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+ EXPECT_NE(buffer_ptr, nullptr);
+
+ // Test the undocumented default values of rows4x4() and columns4x4(). (Not
+ // sure if this is a good idea.)
+ EXPECT_EQ(buffer_ptr->rows4x4(), 0);
+ EXPECT_EQ(buffer_ptr->columns4x4(), 0);
+
+ // Test the side effects of SetFrameDimensions().
+ ObuFrameHeader frame_header = {};
+ frame_header.rows4x4 = 20;
+ frame_header.columns4x4 = 30;
+ EXPECT_TRUE(buffer_ptr->SetFrameDimensions(frame_header));
+ EXPECT_EQ(buffer_ptr->rows4x4(), 20);
+ EXPECT_EQ(buffer_ptr->columns4x4(), 30);
+}
+
+TEST(RefCountedBufferTest, WaitUntil) {
+ InternalFrameBufferList buffer_list;
+ BufferPool buffer_pool(OnInternalFrameBufferSizeChanged,
+ GetInternalFrameBuffer, ReleaseInternalFrameBuffer,
+ &buffer_list);
+ RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+ EXPECT_NE(buffer_ptr, nullptr);
+
+ int progress_row_cache;
+ buffer_ptr->SetProgress(10);
+ EXPECT_TRUE(buffer_ptr->WaitUntil(5, &progress_row_cache));
+ EXPECT_EQ(progress_row_cache, 10);
+
+ buffer_ptr->SetFrameState(kFrameStateDecoded);
+ EXPECT_TRUE(buffer_ptr->WaitUntil(500, &progress_row_cache));
+ EXPECT_EQ(progress_row_cache, INT_MAX);
+
+ buffer_ptr->Abort();
+ EXPECT_FALSE(buffer_ptr->WaitUntil(50, &progress_row_cache));
+}
+
+constexpr struct Params {
+ int width;
+ int height;
+ int8_t subsampling_x;
+ int8_t subsampling_y;
+ int border;
+} kParams[] = {
+ {1920, 1080, 1, 1, 96}, //
+ {1920, 1080, 1, 1, 64}, //
+ {1920, 1080, 1, 1, 32}, //
+ {1920, 1080, 1, 1, 160}, //
+ {1920, 1080, 1, 0, 160}, //
+ {1920, 1080, 0, 0, 160}, //
+};
+
+std::ostream& operator<<(std::ostream& os, const Params& param) {
+ return os << param.width << "x" << param.height
+ << ", subsampling(x/y): " << static_cast<int>(param.subsampling_x)
+ << "/" << static_cast<int>(param.subsampling_y)
+ << ", border: " << param.border;
+}
+
+class RefCountedBufferReallocTest
+ : public testing::TestWithParam<std::tuple<bool, Params>> {
+ protected:
+ const bool use_external_callbacks_ = std::get<0>(GetParam());
+ const Params& param_ = std::get<1>(GetParam());
+};
+
+TEST_P(RefCountedBufferReallocTest, 8Bit) {
+ InternalFrameBufferList buffer_list;
+ FrameBufferSizeChangedCallback on_frame_buffer_size_changed = nullptr;
+ GetFrameBufferCallback get_frame_buffer = nullptr;
+ ReleaseFrameBufferCallback release_frame_buffer = nullptr;
+ void* callback_private_data = nullptr;
+ if (use_external_callbacks_) {
+ on_frame_buffer_size_changed = OnInternalFrameBufferSizeChanged;
+ get_frame_buffer = GetInternalFrameBuffer;
+ release_frame_buffer = ReleaseInternalFrameBuffer;
+ callback_private_data = &buffer_list;
+ }
+
+ BufferPool buffer_pool(on_frame_buffer_size_changed, get_frame_buffer,
+ release_frame_buffer, callback_private_data);
+
+ RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+ EXPECT_NE(buffer_ptr, nullptr);
+
+ const Libgav1ImageFormat image_format = ComposeImageFormat(
+ /*is_monochrome=*/false, param_.subsampling_x, param_.subsampling_y);
+ EXPECT_TRUE(buffer_pool.OnFrameBufferSizeChanged(
+ /*bitdepth=*/8, image_format, param_.width, param_.height, param_.border,
+ param_.border, param_.border, param_.border));
+
+ EXPECT_TRUE(buffer_ptr->Realloc(
+ /*bitdepth=*/8, /*is_monochrome=*/false, param_.width, param_.height,
+ param_.subsampling_x, param_.subsampling_y, param_.border, param_.border,
+ param_.border, param_.border));
+
+ // The first row of each plane is aligned at 16-byte boundaries.
+ EXPECT_EQ(
+ reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneY)) % 16, 0);
+ EXPECT_EQ(
+ reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneU)) % 16, 0);
+ EXPECT_EQ(
+ reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneV)) % 16, 0);
+
+ // Subsequent rows are aligned at 16-byte boundaries.
+ EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneY) % 16, 0);
+ EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneU) % 16, 0);
+ EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneV) % 16, 0);
+
+ // Check the borders.
+ EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneY), param_.border);
+ EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneY), param_.border);
+ EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneY), param_.border);
+ EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneY), param_.border);
+ EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneU),
+ param_.border >> param_.subsampling_x);
+ EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneU),
+ param_.border >> param_.subsampling_x);
+ EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneU),
+ param_.border >> param_.subsampling_y);
+ EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneU),
+ param_.border >> param_.subsampling_y);
+ EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneV),
+ param_.border >> param_.subsampling_x);
+ EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneV),
+ param_.border >> param_.subsampling_x);
+ EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneV),
+ param_.border >> param_.subsampling_y);
+ EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneV),
+ param_.border >> param_.subsampling_y);
+
+ // Write to the upper-left corner of the border.
+ uint8_t* y_buffer = buffer_ptr->buffer()->data(kPlaneY);
+ int y_stride = buffer_ptr->buffer()->stride(kPlaneY);
+ y_buffer[-buffer_ptr->buffer()->left_border(kPlaneY) -
+ buffer_ptr->buffer()->top_border(kPlaneY) * y_stride] = 0;
+ // Write to the lower-right corner of the border.
+ uint8_t* v_buffer = buffer_ptr->buffer()->data(kPlaneV);
+ int v_stride = buffer_ptr->buffer()->stride(kPlaneV);
+ v_buffer[(buffer_ptr->buffer()->height(kPlaneV) +
+ buffer_ptr->buffer()->bottom_border(kPlaneV) - 1) *
+ v_stride +
+ buffer_ptr->buffer()->width(kPlaneV) +
+ buffer_ptr->buffer()->right_border(kPlaneV) - 1] = 0;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+TEST_P(RefCountedBufferReallocTest, 10Bit) {
+ InternalFrameBufferList buffer_list;
+ FrameBufferSizeChangedCallback on_frame_buffer_size_changed = nullptr;
+ GetFrameBufferCallback get_frame_buffer = nullptr;
+ ReleaseFrameBufferCallback release_frame_buffer = nullptr;
+ void* callback_private_data = nullptr;
+ if (use_external_callbacks_) {
+ on_frame_buffer_size_changed = OnInternalFrameBufferSizeChanged;
+ get_frame_buffer = GetInternalFrameBuffer;
+ release_frame_buffer = ReleaseInternalFrameBuffer;
+ callback_private_data = &buffer_list;
+ }
+
+ BufferPool buffer_pool(on_frame_buffer_size_changed, get_frame_buffer,
+ release_frame_buffer, callback_private_data);
+
+ RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+ EXPECT_NE(buffer_ptr, nullptr);
+
+ const Libgav1ImageFormat image_format = ComposeImageFormat(
+ /*is_monochrome=*/false, param_.subsampling_x, param_.subsampling_y);
+ EXPECT_TRUE(buffer_pool.OnFrameBufferSizeChanged(
+      /*bitdepth=*/10, image_format, param_.width, param_.height,
+      param_.border, param_.border, param_.border, param_.border));
+
+ EXPECT_TRUE(buffer_ptr->Realloc(
+ /*bitdepth=*/10, /*is_monochrome=*/false, param_.width, param_.height,
+ param_.subsampling_x, param_.subsampling_y, param_.border, param_.border,
+ param_.border, param_.border));
+
+ // The first row of each plane is aligned at 16-byte boundaries.
+ EXPECT_EQ(
+ reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneY)) % 16, 0);
+ EXPECT_EQ(
+ reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneU)) % 16, 0);
+ EXPECT_EQ(
+ reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneV)) % 16, 0);
+
+ // Subsequent rows are aligned at 16-byte boundaries.
+ EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneY) % 16, 0);
+ EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneU) % 16, 0);
+ EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneV) % 16, 0);
+
+ // Check the borders.
+ EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneY), param_.border);
+ EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneY), param_.border);
+ EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneY), param_.border);
+ EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneY), param_.border);
+ EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneU),
+ param_.border >> param_.subsampling_x);
+ EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneU),
+ param_.border >> param_.subsampling_x);
+ EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneU),
+ param_.border >> param_.subsampling_y);
+ EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneU),
+ param_.border >> param_.subsampling_y);
+ EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneV),
+ param_.border >> param_.subsampling_x);
+ EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneV),
+ param_.border >> param_.subsampling_x);
+ EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneV),
+ param_.border >> param_.subsampling_y);
+ EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneV),
+ param_.border >> param_.subsampling_y);
+
+ // Write to the upper-left corner of the border.
+ auto* y_buffer =
+ reinterpret_cast<uint16_t*>(buffer_ptr->buffer()->data(kPlaneY));
+ int y_stride = buffer_ptr->buffer()->stride(kPlaneY) / sizeof(uint16_t);
+ y_buffer[-buffer_ptr->buffer()->left_border(kPlaneY) -
+ buffer_ptr->buffer()->top_border(kPlaneY) * y_stride] = 0;
+ // Write to the lower-right corner of the border.
+ auto* v_buffer =
+ reinterpret_cast<uint16_t*>(buffer_ptr->buffer()->data(kPlaneV));
+ int v_stride = buffer_ptr->buffer()->stride(kPlaneV) / sizeof(uint16_t);
+ v_buffer[(buffer_ptr->buffer()->height(kPlaneV) +
+ buffer_ptr->buffer()->bottom_border(kPlaneV) - 1) *
+ v_stride +
+ buffer_ptr->buffer()->width(kPlaneV) +
+ buffer_ptr->buffer()->right_border(kPlaneV) - 1] = 0;
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+INSTANTIATE_TEST_SUITE_P(
+ Default, RefCountedBufferReallocTest,
+ testing::Combine(testing::Bool(), // use_external_callbacks
+ testing::ValuesIn(kParams)));
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __cplusplus
+#error Do not compile this file with a C++ compiler
+#endif
+
+// clang-format off
+#include "src/gav1/decoder.h"
+
+// Import the test frame #defines.
+#include "src/decoder_test_data.h"
+// clang-format on
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define ASSERT_EQ(a, b) \
+ do { \
+ if ((a) != (b)) { \
+ fprintf(stderr, "Assertion failure: (%s) == (%s), at %s:%d\n", #a, #b, \
+ __FILE__, __LINE__); \
+ fprintf(stderr, "C DecoderTest failed\n"); \
+ exit(1); \
+ } \
+ } while (0)
+
+#define ASSERT_NE(a, b) \
+ do { \
+ if ((a) == (b)) { \
+ fprintf(stderr, "Assertion failure: (%s) != (%s), at %s:%d\n", #a, #b, \
+ __FILE__, __LINE__); \
+ fprintf(stderr, "C DecoderTest failed\n"); \
+ exit(1); \
+ } \
+ } while (0)
+
+#define ASSERT_TRUE(a) \
+ do { \
+ if (!(a)) { \
+ fprintf(stderr, "Assertion failure: %s, at %s:%d\n", #a, __FILE__, \
+ __LINE__); \
+ fprintf(stderr, "C DecoderTest failed\n"); \
+ exit(1); \
+ } \
+ } while (0)
+
+#define ASSERT_FALSE(a) \
+ do { \
+ if (a) { \
+ fprintf(stderr, "Assertion failure: !(%s), at %s:%d\n", #a, __FILE__, \
+ __LINE__); \
+ fprintf(stderr, "C DecoderTest failed\n"); \
+ exit(1); \
+ } \
+ } while (0)
+
+static const uint8_t kFrame1[] = {OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER,
+ OBU_FRAME_1};
+
+static const uint8_t kFrame2[] = {OBU_TEMPORAL_DELIMITER, OBU_FRAME_2};
+
+static const uint8_t kFrame1WithHdrCllAndHdrMdcv[] = {
+ OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER, OBU_METADATA_HDR_CLL,
+ OBU_METADATA_HDR_MDCV, OBU_FRAME_1};
+
+static const uint8_t kFrame2WithItutT35[] = {
+ OBU_TEMPORAL_DELIMITER, OBU_METADATA_ITUT_T35, OBU_FRAME_2};
+
+typedef struct DecoderTest {
+ Libgav1Decoder* decoder;
+ int frames_in_use;
+ void* buffer_private_data;
+ void* released_input_buffer;
+} DecoderTest;
+
+static void DecoderTestInit(DecoderTest* test) {
+ test->decoder = NULL;
+ test->frames_in_use = 0;
+ test->buffer_private_data = NULL;
+ test->released_input_buffer = NULL;
+}
+
+static void DecoderTestIncrementFramesInUse(DecoderTest* test) {
+ ++test->frames_in_use;
+}
+
+static void DecoderTestDecrementFramesInUse(DecoderTest* test) {
+ --test->frames_in_use;
+}
+
+static void DecoderTestSetReleasedInputBuffer(DecoderTest* test,
+ void* released_input_buffer) {
+ test->released_input_buffer = released_input_buffer;
+}
+
+static void DecoderTestSetBufferPrivateData(DecoderTest* test,
+ void* buffer_private_data) {
+ test->buffer_private_data = buffer_private_data;
+}
+
+typedef struct FrameBufferPrivate {
+ uint8_t* data[3];
+} FrameBufferPrivate;
+
+static Libgav1StatusCode GetFrameBuffer(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+ Libgav1FrameBufferInfo info;
+ Libgav1StatusCode status = Libgav1ComputeFrameBufferInfo(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment, &info);
+ if (status != kLibgav1StatusOk) return status;
+
+ FrameBufferPrivate* buffer_private =
+ (FrameBufferPrivate*)malloc(sizeof(FrameBufferPrivate));
+ if (buffer_private == NULL) return kLibgav1StatusOutOfMemory;
+
+ for (int i = 0; i < 3; ++i) {
+ const size_t size = (i == 0) ? info.y_buffer_size : info.uv_buffer_size;
+ buffer_private->data[i] = (uint8_t*)malloc(sizeof(uint8_t) * size);
+ if (buffer_private->data[i] == NULL) {
+ for (int j = 0; j < i; j++) {
+ free(buffer_private->data[j]);
+ }
+ free(buffer_private);
+ return kLibgav1StatusOutOfMemory;
+ }
+ }
+
+ uint8_t* const y_buffer = buffer_private->data[0];
+ uint8_t* const u_buffer =
+ (info.uv_buffer_size != 0) ? buffer_private->data[1] : NULL;
+ uint8_t* const v_buffer =
+ (info.uv_buffer_size != 0) ? buffer_private->data[2] : NULL;
+
+ status = Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer,
+ buffer_private, frame_buffer);
+ if (status != kLibgav1StatusOk) return status;
+
+ DecoderTest* const decoder_test = (DecoderTest*)callback_private_data;
+ DecoderTestIncrementFramesInUse(decoder_test);
+ DecoderTestSetBufferPrivateData(decoder_test, frame_buffer->private_data);
+ return kLibgav1StatusOk;
+}
+
+static void ReleaseFrameBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ FrameBufferPrivate* buffer_private = (FrameBufferPrivate*)buffer_private_data;
+ for (int i = 0; i < 3; ++i) {
+ free(buffer_private->data[i]);
+ }
+ free(buffer_private);
+ DecoderTest* const decoder_test = (DecoderTest*)callback_private_data;
+ DecoderTestDecrementFramesInUse(decoder_test);
+}
+
+static void ReleaseInputBuffer(void* private_data, void* input_buffer) {
+ DecoderTestSetReleasedInputBuffer((DecoderTest*)private_data, input_buffer);
+}
+
+static void DecoderTestSetUp(DecoderTest* test) {
+ Libgav1DecoderSettings settings;
+ Libgav1DecoderSettingsInitDefault(&settings);
+ settings.frame_parallel = 0; // false
+ settings.get_frame_buffer = GetFrameBuffer;
+ settings.release_frame_buffer = ReleaseFrameBuffer;
+ settings.callback_private_data = test;
+ settings.release_input_buffer = ReleaseInputBuffer;
+ ASSERT_EQ(test->decoder, NULL);
+ ASSERT_EQ(Libgav1DecoderCreate(&settings, &test->decoder), kLibgav1StatusOk);
+ ASSERT_NE(test->decoder, NULL);
+}
+
+static void DecoderTestAPIFlowForNonFrameParallelMode(void) {
+ DecoderTest test;
+ DecoderTestInit(&test);
+ DecoderTestSetUp(&test);
+
+ Libgav1StatusCode status;
+ const Libgav1DecoderBuffer* buffer;
+
+ // Enqueue frame1 for decoding.
+ status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0,
+ (uint8_t*)&kFrame1);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+
+ // In non-frame-parallel mode, decoding happens only in the DequeueFrame call.
+ // So there should be no frames in use yet.
+ ASSERT_EQ(test.frames_in_use, 0);
+
+ // Dequeue the output of frame1.
+ status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+ ASSERT_NE(buffer, NULL);
+ ASSERT_EQ(test.released_input_buffer, &kFrame1);
+
+ // libgav1 has decoded frame1 and is holding a reference to it.
+ ASSERT_EQ(test.frames_in_use, 1);
+ ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data);
+
+ // Enqueue frame2 for decoding.
+ status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0,
+ (uint8_t*)&kFrame2);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+
+ ASSERT_EQ(test.frames_in_use, 1);
+
+ // Dequeue the output of frame2.
+ status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+ ASSERT_NE(buffer, NULL);
+ ASSERT_EQ(test.released_input_buffer, &kFrame2);
+
+ ASSERT_EQ(test.frames_in_use, 2);
+ ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data);
+
+ // Signal end of stream (method 1). This should ensure that all the references
+ // are released.
+ status = Libgav1DecoderSignalEOS(test.decoder);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+
+ // libgav1 should have released all the reference frames now.
+ ASSERT_EQ(test.frames_in_use, 0);
+
+ // Now, the decoder is ready to accept a new coded video sequence.
+
+ // Enqueue frame1 for decoding.
+ status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0,
+ (uint8_t*)&kFrame1);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+
+ ASSERT_EQ(test.frames_in_use, 0);
+
+ // Dequeue the output of frame1.
+ status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+ ASSERT_NE(buffer, NULL);
+ ASSERT_EQ(test.released_input_buffer, &kFrame1);
+
+ ASSERT_EQ(test.frames_in_use, 1);
+ ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data);
+
+ // Enqueue frame2 for decoding.
+ status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0,
+ (uint8_t*)&kFrame2);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+
+ ASSERT_EQ(test.frames_in_use, 1);
+
+ // Dequeue the output of frame2.
+ status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+ ASSERT_NE(buffer, NULL);
+ ASSERT_EQ(test.released_input_buffer, &kFrame2);
+
+ ASSERT_EQ(test.frames_in_use, 2);
+ ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data);
+
+ // Signal end of stream (method 2). This should ensure that all the references
+ // are released.
+ Libgav1DecoderDestroy(test.decoder);
+ test.decoder = NULL;
+
+ // libgav1 should have released all the frames now.
+ ASSERT_EQ(test.frames_in_use, 0);
+}
+
+static void
+DecoderTestNonFrameParallelModeEnqueueMultipleFramesWithoutDequeuing(void) {
+ DecoderTest test;
+ DecoderTestInit(&test);
+ DecoderTestSetUp(&test);
+
+ Libgav1StatusCode status;
+ const Libgav1DecoderBuffer* buffer;
+
+ // Enqueue frame1 for decoding.
+ status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0,
+ (uint8_t*)&kFrame1);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+
+ // Until the output of frame1 is dequeued, no other frames can be enqueued.
+ status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0,
+ (uint8_t*)&kFrame2);
+ ASSERT_EQ(status, kLibgav1StatusTryAgain);
+
+ ASSERT_EQ(test.frames_in_use, 0);
+
+ // Dequeue the output of frame1.
+ status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+ ASSERT_NE(buffer, NULL);
+ ASSERT_EQ(test.released_input_buffer, &kFrame1);
+
+ ASSERT_EQ(test.frames_in_use, 1);
+
+ // Delete the decoder instance.
+ Libgav1DecoderDestroy(test.decoder);
+ test.decoder = NULL;
+
+ ASSERT_EQ(test.frames_in_use, 0);
+}
+
+static void DecoderTestNonFrameParallelModeEOSBeforeDequeuingLastFrame(void) {
+ DecoderTest test;
+ DecoderTestInit(&test);
+ DecoderTestSetUp(&test);
+
+ Libgav1StatusCode status;
+ const Libgav1DecoderBuffer* buffer;
+
+ // Enqueue frame1 for decoding.
+ status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0,
+ (uint8_t*)&kFrame1);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+
+ ASSERT_EQ(test.frames_in_use, 0);
+
+ // Dequeue the output of frame1.
+ status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+ ASSERT_NE(buffer, NULL);
+ ASSERT_EQ(test.released_input_buffer, &kFrame1);
+
+ // Enqueue frame2 for decoding.
+ status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0,
+ (uint8_t*)&kFrame2);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+
+ ASSERT_EQ(test.frames_in_use, 1);
+
+ // Signal end of stream before dequeuing the output of frame2.
+ status = Libgav1DecoderSignalEOS(test.decoder);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+
+  // In this case, the output of the last frame that was enqueued is lost
+  // (which is intentional since end of stream was signaled without dequeuing
+  // it).
+ ASSERT_EQ(test.frames_in_use, 0);
+
+ Libgav1DecoderDestroy(test.decoder);
+ test.decoder = NULL;
+}
+
+static void DecoderTestNonFrameParallelModeInvalidFrameAfterEOS(void) {
+ DecoderTest test;
+ DecoderTestInit(&test);
+ DecoderTestSetUp(&test);
+
+ Libgav1StatusCode status;
+ const Libgav1DecoderBuffer* buffer = NULL;
+
+ // Enqueue frame1 for decoding.
+ status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0,
+ (uint8_t*)&kFrame1);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+
+ ASSERT_EQ(test.frames_in_use, 0);
+
+ // Dequeue the output of frame1.
+ status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+ ASSERT_NE(buffer, NULL);
+ ASSERT_EQ(test.released_input_buffer, &kFrame1);
+
+ ASSERT_EQ(test.frames_in_use, 1);
+
+ // Signal end of stream.
+ status = Libgav1DecoderSignalEOS(test.decoder);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+
+ // libgav1 should have released all the reference frames now.
+ ASSERT_EQ(test.frames_in_use, 0);
+
+  // Now the decoder is ready to accept a new coded video sequence, but we try
+  // to enqueue a frame that does not have a sequence header (which is not
+  // allowed).
+
+ // Enqueue frame2 for decoding.
+ status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0,
+ (uint8_t*)&kFrame2);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+
+ ASSERT_EQ(test.frames_in_use, 0);
+
+ // Dequeue the output of frame2 (this will fail since no sequence header has
+ // been seen since the last EOS signal).
+ status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+ ASSERT_EQ(status, kLibgav1StatusBitstreamError);
+ ASSERT_EQ(test.released_input_buffer, &kFrame2);
+
+ ASSERT_EQ(test.frames_in_use, 0);
+
+ Libgav1DecoderDestroy(test.decoder);
+ test.decoder = NULL;
+}
+
+static void DecoderTestMetadataObu(void) {
+ DecoderTest test;
+ DecoderTestInit(&test);
+ DecoderTestSetUp(&test);
+
+ Libgav1StatusCode status;
+ const Libgav1DecoderBuffer* buffer;
+
+ // Enqueue frame1 for decoding.
+ status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1WithHdrCllAndHdrMdcv,
+ sizeof(kFrame1WithHdrCllAndHdrMdcv), 0,
+ (uint8_t*)&kFrame1WithHdrCllAndHdrMdcv);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+ ASSERT_EQ(test.frames_in_use, 0);
+
+ // Dequeue the output of frame1.
+ status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+ ASSERT_NE(buffer, NULL);
+ ASSERT_EQ(buffer->has_hdr_cll, 1);
+ ASSERT_EQ(buffer->has_hdr_mdcv, 1);
+ ASSERT_EQ(buffer->has_itut_t35, 0);
+ ASSERT_EQ(test.released_input_buffer, &kFrame1WithHdrCllAndHdrMdcv);
+
+ ASSERT_EQ(test.frames_in_use, 1);
+ ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data);
+
+ // Enqueue frame2 for decoding.
+ status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2WithItutT35,
+ sizeof(kFrame2WithItutT35), 0,
+ (uint8_t*)&kFrame2WithItutT35);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+
+ ASSERT_EQ(test.frames_in_use, 1);
+
+ // Dequeue the output of frame2.
+ status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+ ASSERT_NE(buffer, NULL);
+ ASSERT_EQ(buffer->has_hdr_cll, 0);
+ ASSERT_EQ(buffer->has_hdr_mdcv, 0);
+ ASSERT_EQ(buffer->has_itut_t35, 1);
+ ASSERT_NE(buffer->itut_t35.payload_bytes, NULL);
+ ASSERT_NE(buffer->itut_t35.payload_size, 0);
+ ASSERT_EQ(test.released_input_buffer, &kFrame2WithItutT35);
+
+ ASSERT_EQ(test.frames_in_use, 2);
+ ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data);
+
+ status = Libgav1DecoderSignalEOS(test.decoder);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+ ASSERT_EQ(test.frames_in_use, 0);
+
+ Libgav1DecoderDestroy(test.decoder);
+}
+
+int main(void) {
+ fprintf(stderr, "C DecoderTest started\n");
+ DecoderTestAPIFlowForNonFrameParallelMode();
+ DecoderTestNonFrameParallelModeEnqueueMultipleFramesWithoutDequeuing();
+ DecoderTestNonFrameParallelModeEOSBeforeDequeuingLastFrame();
+ DecoderTestNonFrameParallelModeInvalidFrameAfterEOS();
+ DecoderTestMetadataObu();
+ fprintf(stderr, "C DecoderTest passed\n");
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __cplusplus
+#error Do not compile this file with a C++ compiler
+#endif
+
+// clang-format off
+#include "src/gav1/version.h"
+// clang-format on
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define ASSERT_EQ(a, b) \
+ do { \
+ if ((a) != (b)) { \
+ fprintf(stderr, "Assertion failure: (%s) == (%s), at %s:%d\n", #a, #b, \
+ __FILE__, __LINE__); \
+ fprintf(stderr, "C VersionTest failed\n"); \
+ exit(1); \
+ } \
+ } while (0)
+
+#define ASSERT_NE(a, b) \
+ do { \
+ if ((a) == (b)) { \
+ fprintf(stderr, "Assertion failure: (%s) != (%s), at %s:%d\n", #a, #b, \
+ __FILE__, __LINE__); \
+ fprintf(stderr, "C VersionTest failed\n"); \
+ exit(1); \
+ } \
+ } while (0)
+
+#define ASSERT_TRUE(a) \
+ do { \
+ if (!(a)) { \
+ fprintf(stderr, "Assertion failure: %s, at %s:%d\n", #a, __FILE__, \
+ __LINE__); \
+ fprintf(stderr, "C VersionTest failed\n"); \
+ exit(1); \
+ } \
+ } while (0)
+
+#define ASSERT_FALSE(a) \
+ do { \
+ if (a) { \
+ fprintf(stderr, "Assertion failure: !(%s), at %s:%d\n", #a, __FILE__, \
+ __LINE__); \
+ fprintf(stderr, "C VersionTest failed\n"); \
+ exit(1); \
+ } \
+ } while (0)
+
+static void VersionTestGetVersion(void) {
+ const int library_version = Libgav1GetVersion();
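+  // The packed layout is 0x00MMmmpp: major version in bits 16-23, minor in
+  // bits 8-15, patch in bits 0-7, and the top byte always zero, as the
+  // assertions below verify.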
+ ASSERT_EQ((library_version >> 24) & 0xff, 0);
+  // Note that if we link against a shared object, there is potential for a
+  // mismatch if a different library version is loaded at runtime.
+ ASSERT_EQ((library_version >> 16) & 0xff, LIBGAV1_MAJOR_VERSION);
+ ASSERT_EQ((library_version >> 8) & 0xff, LIBGAV1_MINOR_VERSION);
+ ASSERT_EQ(library_version & 0xff, LIBGAV1_PATCH_VERSION);
+
+ const int header_version = LIBGAV1_VERSION;
+ ASSERT_EQ((header_version >> 24) & 0xff, 0);
+ ASSERT_EQ((header_version >> 16) & 0xff, LIBGAV1_MAJOR_VERSION);
+ ASSERT_EQ((header_version >> 8) & 0xff, LIBGAV1_MINOR_VERSION);
+ ASSERT_EQ(header_version & 0xff, LIBGAV1_PATCH_VERSION);
+}
+
+static void VersionTestGetVersionString(void) {
+ const char* version = Libgav1GetVersionString();
+ ASSERT_NE(version, NULL);
+}
+
+static void VersionTestGetBuildConfiguration(void) {
+ const char* config = Libgav1GetBuildConfiguration();
+ ASSERT_NE(config, NULL);
+}
+
+int main(void) {
+ fprintf(stderr, "C VersionTest started\n");
+ VersionTestGetVersion();
+ VersionTestGetVersionString();
+ VersionTestGetBuildConfiguration();
+ fprintf(stderr, "C VersionTest passed\n");
+ return 0;
+}
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder.h"
+
+#include <memory>
+#include <new>
+
+#include "src/decoder_impl.h"
+
+extern "C" {
+
+Libgav1StatusCode Libgav1DecoderCreate(const Libgav1DecoderSettings* settings,
+ Libgav1Decoder** decoder_out) {
+ std::unique_ptr<libgav1::Decoder> cxx_decoder(new (std::nothrow)
+ libgav1::Decoder());
+ if (cxx_decoder == nullptr) return kLibgav1StatusOutOfMemory;
+
+ libgav1::DecoderSettings cxx_settings;
+ cxx_settings.threads = settings->threads;
+ cxx_settings.frame_parallel = settings->frame_parallel != 0;
+ cxx_settings.blocking_dequeue = settings->blocking_dequeue != 0;
+ cxx_settings.on_frame_buffer_size_changed =
+ settings->on_frame_buffer_size_changed;
+ cxx_settings.get_frame_buffer = settings->get_frame_buffer;
+ cxx_settings.release_frame_buffer = settings->release_frame_buffer;
+ cxx_settings.release_input_buffer = settings->release_input_buffer;
+ cxx_settings.callback_private_data = settings->callback_private_data;
+ cxx_settings.output_all_layers = settings->output_all_layers != 0;
+ cxx_settings.operating_point = settings->operating_point;
+ cxx_settings.post_filter_mask = settings->post_filter_mask;
+
+ const Libgav1StatusCode status = cxx_decoder->Init(&cxx_settings);
+ if (status == kLibgav1StatusOk) {
+ *decoder_out = reinterpret_cast<Libgav1Decoder*>(cxx_decoder.release());
+ }
+ return status;
+}
+
+void Libgav1DecoderDestroy(Libgav1Decoder* decoder) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ delete cxx_decoder;
+}
+
+Libgav1StatusCode Libgav1DecoderEnqueueFrame(Libgav1Decoder* decoder,
+ const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ return cxx_decoder->EnqueueFrame(data, size, user_private_data,
+ buffer_private_data);
+}
+
+Libgav1StatusCode Libgav1DecoderDequeueFrame(
+ Libgav1Decoder* decoder, const Libgav1DecoderBuffer** out_ptr) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ return cxx_decoder->DequeueFrame(out_ptr);
+}
+
+Libgav1StatusCode Libgav1DecoderSignalEOS(Libgav1Decoder* decoder) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ return cxx_decoder->SignalEOS();
+}
+
+int Libgav1DecoderGetMaxBitdepth() {
+ return libgav1::Decoder::GetMaxBitdepth();
+}
+
+} // extern "C"
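+
+// A minimal usage sketch of the C API above (illustrative only; error
+// handling is omitted, and |data|/|size| are assumed to hold a single
+// temporal unit):
+//
+//   Libgav1DecoderSettings settings;
+//   Libgav1DecoderSettingsInitDefault(&settings);
+//   Libgav1Decoder* decoder;
+//   Libgav1StatusCode status = Libgav1DecoderCreate(&settings, &decoder);
+//   status = Libgav1DecoderEnqueueFrame(decoder, data, size,
+//                                       /*user_private_data=*/0,
+//                                       /*buffer_private_data=*/NULL);
+//   const Libgav1DecoderBuffer* buffer;
+//   status = Libgav1DecoderDequeueFrame(decoder, &buffer);
+//   Libgav1DecoderSignalEOS(decoder);
+//   Libgav1DecoderDestroy(decoder);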
+
+namespace libgav1 {
+
+Decoder::Decoder() = default;
+
+Decoder::~Decoder() = default;
+
+StatusCode Decoder::Init(const DecoderSettings* const settings) {
+ if (impl_ != nullptr) return kStatusAlready;
+ if (settings != nullptr) settings_ = *settings;
+ return DecoderImpl::Create(&settings_, &impl_);
+}
+
+StatusCode Decoder::EnqueueFrame(const uint8_t* data, const size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ if (impl_ == nullptr) return kStatusNotInitialized;
+ return impl_->EnqueueFrame(data, size, user_private_data,
+ buffer_private_data);
+}
+
+StatusCode Decoder::DequeueFrame(const DecoderBuffer** out_ptr) {
+ if (impl_ == nullptr) return kStatusNotInitialized;
+ return impl_->DequeueFrame(out_ptr);
+}
+
+StatusCode Decoder::SignalEOS() {
+ if (impl_ == nullptr) return kStatusNotInitialized;
+ // In non-frame-parallel mode, we have to release all the references. This
+ // simply means replacing the |impl_| with a new instance so that all the
+ // existing references are released and the state is cleared.
+ impl_ = nullptr;
+ return DecoderImpl::Create(&settings_, &impl_);
+}
+
+// static
+int Decoder::GetMaxBitdepth() { return DecoderImpl::GetMaxBitdepth(); }
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder_buffer.h"
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+// Tests the emulation of C++ enumerators by constexpr constants.
+TEST(DecoderBufferTest, EnumTest) {
+ ColorRange color_range = kLibgav1ColorRangeFull;
+
+ // Verify that we get the -Wswitch warning unless the switch statement
+ // handles both kColorRangeStudio and kColorRangeFull:
+ // enumeration value 'kLibgav1ColorRangeFull' not handled in switch
+ switch (color_range) {
+ case kColorRangeStudio:
+ break;
+ case kColorRangeFull:
+ break;
+ }
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/decoder_impl.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <condition_variable>
+#include <cstring>
+#include <iterator>
+#include <memory>
+#include <mutex>
+#include <new>
+#include <utility>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/film_grain.h"
+#include "src/frame_buffer_utils.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/post_filter.h"
+#include "src/prediction_mask.h"
+#include "src/threading_strategy.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/raw_bit_reader.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/threadpool.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kMaxBlockWidth4x4 = 32;
+constexpr int kMaxBlockHeight4x4 = 32;
+
+// Computes the bottom border size in pixels. If CDEF, loop restoration or
+// SuperRes is enabled, adds extra border pixels so that those steps can
+// happen nearly in-place (a few extra rows instead of an entire frame
+// buffer). The logic in this function should match the corresponding logic
+// for |vertical_shift| in the PostFilter constructor.
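+// For example, with CDEF and SuperRes enabled (so the standalone restoration
+// border is not added) and subsampling_y == 1, this returns
+// Align(kBorderPixels + ((kCdefBorder + kSuperResVerticalBorder) << 1), 2).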
+int GetBottomBorderPixels(const bool do_cdef, const bool do_restoration,
+ const bool do_superres, const int subsampling_y) {
+ int extra_border = 0;
+ if (do_cdef) {
+ extra_border += kCdefBorder;
+ } else if (do_restoration) {
+    // When CDEF is enabled, its border is also sufficient for loop
+    // restoration, so the restoration border is added only when CDEF is off.
+ extra_border += kRestorationVerticalBorder;
+ }
+ if (do_superres) extra_border += kSuperResVerticalBorder;
+ // Double the number of extra bottom border pixels if the bottom border will
+ // be subsampled.
+ extra_border <<= subsampling_y;
+ return Align(kBorderPixels + extra_border, 2); // Must be a multiple of 2.
+}
+
+// Sets |frame_scratch_buffer->tile_decoding_failed| to true (while holding on
+// to |frame_scratch_buffer->superblock_row_mutex|) and notifies the first
+// |count| condition variables in
+// |frame_scratch_buffer->superblock_row_progress_condvar|.
+void SetFailureAndNotifyAll(FrameScratchBuffer* const frame_scratch_buffer,
+ int count) {
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ frame_scratch_buffer->tile_decoding_failed = true;
+ }
+ std::condition_variable* const condvars =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ for (int i = 0; i < count; ++i) {
+ condvars[i].notify_one();
+ }
+}
+
+// Helper class that releases the frame scratch buffer in the destructor.
+class FrameScratchBufferReleaser {
+ public:
+ FrameScratchBufferReleaser(
+ FrameScratchBufferPool* frame_scratch_buffer_pool,
+ std::unique_ptr<FrameScratchBuffer>* frame_scratch_buffer)
+ : frame_scratch_buffer_pool_(frame_scratch_buffer_pool),
+ frame_scratch_buffer_(frame_scratch_buffer) {}
+ ~FrameScratchBufferReleaser() {
+ frame_scratch_buffer_pool_->Release(std::move(*frame_scratch_buffer_));
+ }
+
+ private:
+ FrameScratchBufferPool* const frame_scratch_buffer_pool_;
+ std::unique_ptr<FrameScratchBuffer>* const frame_scratch_buffer_;
+};
+
+// Sets the |frame|'s segmentation map for two cases. The third case is handled
+// in Tile::DecodeBlock().
+void SetSegmentationMap(const ObuFrameHeader& frame_header,
+ const SegmentationMap* prev_segment_ids,
+ RefCountedBuffer* const frame) {
+ if (!frame_header.segmentation.enabled) {
+ // All segment_id's are 0.
+ frame->segmentation_map()->Clear();
+ } else if (!frame_header.segmentation.update_map) {
+ // Copy from prev_segment_ids.
+ if (prev_segment_ids == nullptr) {
+ // Treat a null prev_segment_ids pointer as if it pointed to a
+ // segmentation map containing all 0s.
+ frame->segmentation_map()->Clear();
+ } else {
+ frame->segmentation_map()->CopyFrom(*prev_segment_ids);
+ }
+ }
+}
+
+StatusCode DecodeTilesNonFrameParallel(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter) {
+ // Decode in superblock row order.
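+  // (A 128x128 superblock spans 32 4x4 units; a 64x64 superblock spans 16.)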
+ const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+ std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
+ frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+  if (tile_scratch_buffer == nullptr) return kStatusOutOfMemory;
+ for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
+ row4x4 += block_width4x4) {
+ for (const auto& tile_ptr : tiles) {
+ if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ row4x4, tile_scratch_buffer.get())) {
+        return kStatusUnknownError;
+ }
+ }
+ post_filter->ApplyFilteringForOneSuperBlockRow(
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/true);
+ }
+ frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+ std::move(tile_scratch_buffer));
+ return kStatusOk;
+}
+
+StatusCode DecodeTilesThreadedNonFrameParallel(
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter,
+ BlockingCounterWithStatus* const pending_tiles) {
+ ThreadingStrategy& threading_strategy =
+ frame_scratch_buffer->threading_strategy;
+ const int num_workers = threading_strategy.tile_thread_count();
+ BlockingCounterWithStatus pending_workers(num_workers);
+ std::atomic<int> tile_counter(0);
+ const int tile_count = static_cast<int>(tiles.size());
+ bool tile_decoding_failed = false;
+ // Submit tile decoding jobs to the thread pool.
+ for (int i = 0; i < num_workers; ++i) {
+ threading_strategy.tile_thread_pool()->Schedule([&tiles, tile_count,
+ &tile_counter,
+ &pending_workers,
+ &pending_tiles]() {
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->ParseAndDecode()) {
+ LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+ failed = true;
+ }
+ } else {
+ pending_tiles->Decrement(false);
+ }
+ }
+ pending_workers.Decrement(!failed);
+ });
+ }
+ // Have the current thread partake in tile decoding.
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!tile_decoding_failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->ParseAndDecode()) {
+ LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+ tile_decoding_failed = true;
+ }
+ } else {
+ pending_tiles->Decrement(false);
+ }
+ }
+ // Wait until all the workers are done. This ensures that all the tiles have
+ // been parsed.
+ tile_decoding_failed |= !pending_workers.Wait();
+ // Wait until all the tiles have been decoded.
+ tile_decoding_failed |= !pending_tiles->Wait();
+ if (tile_decoding_failed) return kStatusUnknownError;
+ assert(threading_strategy.post_filter_thread_pool() != nullptr);
+ post_filter->ApplyFilteringThreaded();
+ return kStatusOk;
+}
+
+StatusCode DecodeTilesFrameParallel(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ const SymbolDecoderContext& saved_symbol_decoder_context,
+ const SegmentationMap* const prev_segment_ids,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+ // Parse the frame.
+ for (const auto& tile : tiles) {
+ if (!tile->Parse()) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse tile number: %d\n", tile->number());
+ return kStatusUnknownError;
+ }
+ }
+ if (frame_header.enable_frame_end_update_cdf) {
+ frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+ }
+ current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+ SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+ // Mark frame as parsed.
+ current_frame->SetFrameState(kFrameStateParsed);
+ std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
+ frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+ if (tile_scratch_buffer == nullptr) {
+ return kStatusOutOfMemory;
+ }
+ const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+ // Decode in superblock row order (inter prediction in the Tile class will
+ // block until the required superblocks in the reference frame are decoded).
+ for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
+ row4x4 += block_width4x4) {
+ for (const auto& tile_ptr : tiles) {
+ if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ row4x4, tile_scratch_buffer.get())) {
+ LIBGAV1_DLOG(ERROR, "Failed to decode tile number: %d\n",
+ tile_ptr->number());
+ return kStatusUnknownError;
+ }
+ }
+ const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/true);
+ if (progress_row >= 0) {
+ current_frame->SetProgress(progress_row);
+ }
+ }
+  // Mark the frame as decoded (row-level progress no longer matters since
+  // the entire frame has been decoded).
+ current_frame->SetFrameState(kFrameStateDecoded);
+ frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+ std::move(tile_scratch_buffer));
+ return kStatusOk;
+}
+
+// Helper function used by DecodeTilesThreadedFrameParallel. Applies the
+// deblocking filter for tile boundaries for the superblock row at |row4x4|.
+void ApplyDeblockingFilterForTileBoundaries(
+ PostFilter* const post_filter, const std::unique_ptr<Tile>* tile_row_base,
+ const ObuFrameHeader& frame_header, int row4x4, int block_width4x4,
+ int tile_columns, bool decode_entire_tiles_in_worker_threads) {
+ // Apply vertical deblock filtering for the first 64 columns of each tile.
+ for (int tile_column = 0; tile_column < tile_columns; ++tile_column) {
+ const Tile& tile = *tile_row_base[tile_column];
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4, tile.column4x4_start(),
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ if (decode_entire_tiles_in_worker_threads &&
+ row4x4 == tile_row_base[0]->row4x4_start()) {
+ // This is the first superblock row of a tile row. In this case, apply
+ // horizontal deblock filtering for the entire superblock row.
+ post_filter->ApplyDeblockFilter(kLoopFilterTypeHorizontal, row4x4, 0,
+ frame_header.columns4x4, block_width4x4);
+ } else {
+ // Apply horizontal deblock filtering for the first 64 columns of the
+ // first tile.
+ const Tile& first_tile = *tile_row_base[0];
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, first_tile.column4x4_start(),
+ first_tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ // Apply horizontal deblock filtering for the last 64 columns of the
+ // previous tile and the first 64 columns of the current tile.
+ for (int tile_column = 1; tile_column < tile_columns; ++tile_column) {
+ const Tile& tile = *tile_row_base[tile_column];
+ // If the previous tile has more than 64 columns, then include those
+ // for the horizontal deblock.
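+      // For example, if the previous tile starts at column4x4 0 and this tile
+      // starts at column4x4 32, the previous tile is 128 pixels wide, so
+      // filtering starts at column4x4 16, 64 pixels before the boundary.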
+ const Tile& previous_tile = *tile_row_base[tile_column - 1];
+ const int column4x4_start =
+ tile.column4x4_start() -
+ ((tile.column4x4_start() - kNum4x4InLoopFilterUnit !=
+ previous_tile.column4x4_start())
+ ? kNum4x4InLoopFilterUnit
+ : 0);
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ // Apply horizontal deblock filtering for the last 64 columns of the
+ // last tile.
+ const Tile& last_tile = *tile_row_base[tile_columns - 1];
+    // Identify the starting column4x4 for the final horizontal filter pass:
+    // the largest multiple of 16 that is strictly below
+    // last_tile.column4x4_end().
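+    // For example, if last_tile.column4x4_end() is 70, the filter below
+    // starts at column4x4 64 ((70 - 1) & ~15 == 64).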
+ const int column4x4_start = (last_tile.column4x4_end() - 1) & ~15;
+    // If column4x4_start is the same as last_tile.column4x4_start(), then
+    // the last tile has <= 64 columns, so there is nothing left to deblock
+    // (it was already deblocked in the loop above).
+ if (column4x4_start != last_tile.column4x4_start()) {
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+ last_tile.column4x4_end(), block_width4x4);
+ }
+ }
+}
+
+// Helper function used by DecodeTilesThreadedFrameParallel. Decodes the
+// superblock row starting at |row4x4| for tile at index |tile_index| in the
+// list of tiles |tiles|. If the decoding is successful, then it does the
+// following:
+// * Schedule the next superblock row in the current tile column for decoding
+// (the next superblock row may be in a different tile than the current
+// one).
+// * If an entire superblock row of the frame has been decoded, it notifies
+// the waiters (if there are any).
+void DecodeSuperBlockRowInTile(
+ const Vector<std::unique_ptr<Tile>>& tiles, size_t tile_index, int row4x4,
+ const int superblock_size4x4, const int tile_columns,
+ const int superblock_rows, FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter, BlockingCounter* const pending_jobs) {
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+ if (scratch_buffer == nullptr) {
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ return;
+ }
+ Tile& tile = *tiles[tile_index];
+ const bool ok = tile.ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ row4x4, scratch_buffer.get());
+ frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+ std::move(scratch_buffer));
+ if (!ok) {
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ return;
+ }
+ if (post_filter->DoDeblock()) {
+ // Apply vertical deblock filtering for all the columns in this tile except
+ // for the first 64 columns.
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, tile.column4x4_end(),
+ superblock_size4x4);
+ // Apply horizontal deblock filtering for all the columns in this tile
+ // except for the first and the last 64 columns.
+ // Note about the last tile of each row: For the last tile, column4x4_end
+ // may not be a multiple of 16. In that case it is still okay to simply
+ // subtract 16 since ApplyDeblockFilter() will only do the filters in
+ // increments of 64 columns (or 32 columns for chroma with subsampling).
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit,
+ tile.column4x4_end() - kNum4x4InLoopFilterUnit, superblock_size4x4);
+ }
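+  // Convert |row4x4| to a frame-level superblock row index. For example, with
+  // 128x128 superblocks (superblock_size4x4 == 32), row4x4 == 96 maps to
+  // superblock row index 3.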
+ const int superblock_size4x4_log2 = FloorLog2(superblock_size4x4);
+ const int index = row4x4 >> superblock_size4x4_log2;
+ int* const superblock_row_progress =
+ frame_scratch_buffer->superblock_row_progress.get();
+ std::condition_variable* const superblock_row_progress_condvar =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ bool notify;
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ notify = ++superblock_row_progress[index] == tile_columns;
+ }
+ if (notify) {
+ // We are done decoding this superblock row. Notify the post filtering
+ // thread.
+ superblock_row_progress_condvar[index].notify_one();
+ }
+ // Schedule the next superblock row (if one exists).
+ ThreadPool& thread_pool =
+ *frame_scratch_buffer->threading_strategy.thread_pool();
+ const int next_row4x4 = row4x4 + superblock_size4x4;
+ if (!tile.IsRow4x4Inside(next_row4x4)) {
+ tile_index += tile_columns;
+ }
+ if (tile_index >= tiles.size()) return;
+ pending_jobs->IncrementBy(1);
+ thread_pool.Schedule([&tiles, tile_index, next_row4x4, superblock_size4x4,
+ tile_columns, superblock_rows, frame_scratch_buffer,
+ post_filter, pending_jobs]() {
+ DecodeSuperBlockRowInTile(tiles, tile_index, next_row4x4,
+ superblock_size4x4, tile_columns, superblock_rows,
+ frame_scratch_buffer, post_filter, pending_jobs);
+ pending_jobs->Decrement();
+ });
+}
+
+StatusCode DecodeTilesThreadedFrameParallel(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ const SymbolDecoderContext& saved_symbol_decoder_context,
+ const SegmentationMap* const prev_segment_ids,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+ // Parse the frame.
+ ThreadPool& thread_pool =
+ *frame_scratch_buffer->threading_strategy.thread_pool();
+ std::atomic<int> tile_counter(0);
+ const int tile_count = static_cast<int>(tiles.size());
+ const int num_workers = thread_pool.num_threads();
+ BlockingCounterWithStatus parse_workers(num_workers);
+ // Submit tile parsing jobs to the thread pool.
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool.Schedule([&tiles, tile_count, &tile_counter, &parse_workers]() {
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Parse()) {
+ LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+ failed = true;
+ }
+ }
+ }
+ parse_workers.Decrement(!failed);
+ });
+ }
+
+ // Have the current thread participate in parsing.
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Parse()) {
+ LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+ failed = true;
+ }
+ }
+ }
+
+ // Wait until all the parse workers are done. This ensures that all the tiles
+ // have been parsed.
+ if (!parse_workers.Wait() || failed) {
+    return kStatusUnknownError;
+ }
+ if (frame_header.enable_frame_end_update_cdf) {
+ frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+ }
+ current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+ SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+ current_frame->SetFrameState(kFrameStateParsed);
+
+ // Decode the frame.
+ const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+ const int block_width4x4_log2 =
+ sequence_header.use_128x128_superblock ? 5 : 4;
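+  // Ceiling division: for example, rows4x4 == 100 with 64x64 superblocks
+  // (block_width4x4 == 16) yields (100 + 15) >> 4 == 7 superblock rows.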
+ const int superblock_rows =
+ (frame_header.rows4x4 + block_width4x4 - 1) >> block_width4x4_log2;
+ if (!frame_scratch_buffer->superblock_row_progress.Resize(superblock_rows) ||
+ !frame_scratch_buffer->superblock_row_progress_condvar.Resize(
+ superblock_rows)) {
+    return kStatusOutOfMemory;
+ }
+ int* const superblock_row_progress =
+ frame_scratch_buffer->superblock_row_progress.get();
+ memset(superblock_row_progress, 0,
+ superblock_rows * sizeof(superblock_row_progress[0]));
+ frame_scratch_buffer->tile_decoding_failed = false;
+ const int tile_columns = frame_header.tile_info.tile_columns;
+ const bool decode_entire_tiles_in_worker_threads =
+ num_workers >= tile_columns;
+ BlockingCounter pending_jobs(
+ decode_entire_tiles_in_worker_threads ? num_workers : tile_columns);
+ if (decode_entire_tiles_in_worker_threads) {
+ // Submit tile decoding jobs to the thread pool.
+ tile_counter = 0;
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool.Schedule([&tiles, tile_count, &tile_counter, &pending_jobs,
+ frame_scratch_buffer, superblock_rows]() {
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (failed) continue;
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Decode(
+ &frame_scratch_buffer->superblock_row_mutex,
+ frame_scratch_buffer->superblock_row_progress.get(),
+ frame_scratch_buffer->superblock_row_progress_condvar
+ .get())) {
+ LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+ failed = true;
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ }
+ }
+ pending_jobs.Decrement();
+ });
+ }
+ } else {
+ // Schedule the jobs for first tile row.
+ for (int tile_index = 0; tile_index < tile_columns; ++tile_index) {
+ thread_pool.Schedule([&tiles, tile_index, block_width4x4, tile_columns,
+ superblock_rows, frame_scratch_buffer, post_filter,
+ &pending_jobs]() {
+ DecodeSuperBlockRowInTile(
+ tiles, tile_index, 0, block_width4x4, tile_columns, superblock_rows,
+ frame_scratch_buffer, post_filter, &pending_jobs);
+ pending_jobs.Decrement();
+ });
+ }
+ }
+
+ // Current thread will do the post filters.
+ std::condition_variable* const superblock_row_progress_condvar =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ const std::unique_ptr<Tile>* tile_row_base = &tiles[0];
+ for (int row4x4 = 0, index = 0; row4x4 < frame_header.rows4x4;
+ row4x4 += block_width4x4, ++index) {
+ if (!tile_row_base[0]->IsRow4x4Inside(row4x4)) {
+ tile_row_base += tile_columns;
+ }
+ {
+ std::unique_lock<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ while (superblock_row_progress[index] != tile_columns &&
+ !frame_scratch_buffer->tile_decoding_failed) {
+ superblock_row_progress_condvar[index].wait(lock);
+ }
+ if (frame_scratch_buffer->tile_decoding_failed) break;
+ }
+ if (post_filter->DoDeblock()) {
+      // Apply the deblocking filter for the tile boundaries of this
+      // superblock row. The deblocking filter for the internal blocks is
+      // applied in the tile worker threads; this thread only handles the
+      // tile boundaries.
+ ApplyDeblockingFilterForTileBoundaries(
+ post_filter, tile_row_base, frame_header, row4x4, block_width4x4,
+ tile_columns, decode_entire_tiles_in_worker_threads);
+ }
+ // Apply all the post filters other than deblocking.
+ const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/false);
+ if (progress_row >= 0) {
+ current_frame->SetProgress(progress_row);
+ }
+ }
+ // Wait until all the pending jobs are done. This ensures that all the tiles
+ // have been decoded and wrapped up.
+ pending_jobs.Wait();
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ if (frame_scratch_buffer->tile_decoding_failed) {
+      return kStatusUnknownError;
+ }
+ }
+
+ current_frame->SetFrameState(kFrameStateDecoded);
+ return kStatusOk;
+}
+
+} // namespace
+
+// static
+StatusCode DecoderImpl::Create(const DecoderSettings* settings,
+ std::unique_ptr<DecoderImpl>* output) {
+ if (settings->threads <= 0) {
+ LIBGAV1_DLOG(ERROR, "Invalid settings->threads: %d.", settings->threads);
+ return kStatusInvalidArgument;
+ }
+ if (settings->frame_parallel) {
+ if (settings->release_input_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR,
+ "release_input_buffer callback must not be null when "
+ "frame_parallel is true.");
+ return kStatusInvalidArgument;
+ }
+ }
+ std::unique_ptr<DecoderImpl> impl(new (std::nothrow) DecoderImpl(settings));
+ if (impl == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate DecoderImpl.");
+ return kStatusOutOfMemory;
+ }
+ const StatusCode status = impl->Init();
+ if (status != kStatusOk) return status;
+ *output = std::move(impl);
+ return kStatusOk;
+}
+
+DecoderImpl::DecoderImpl(const DecoderSettings* settings)
+ : buffer_pool_(settings->on_frame_buffer_size_changed,
+ settings->get_frame_buffer, settings->release_frame_buffer,
+ settings->callback_private_data),
+ settings_(*settings) {
+ dsp::DspInit();
+}
+
+DecoderImpl::~DecoderImpl() {
+ // Clean up and wait until all the threads have stopped. We just have to pass
+ // in a dummy status that is not kStatusOk or kStatusTryAgain to trigger the
+ // path that clears all the threads and structs.
+ SignalFailure(kStatusUnknownError);
+ // Release any other frame buffer references that we may be holding on to.
+ ReleaseOutputFrame();
+ output_frame_queue_.Clear();
+ for (auto& reference_frame : state_.reference_frame) {
+ reference_frame = nullptr;
+ }
+}
+
+StatusCode DecoderImpl::Init() {
+ if (!output_frame_queue_.Init(kMaxLayers)) {
+ LIBGAV1_DLOG(ERROR, "output_frame_queue_.Init() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::InitializeFrameThreadPoolAndTemporalUnitQueue(
+ const uint8_t* data, size_t size) {
+ is_frame_parallel_ = false;
+ if (settings_.frame_parallel) {
+ DecoderState state;
+ std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+ data, size, settings_.operating_point, &buffer_pool_, &state));
+ if (obu == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+ return kStatusOutOfMemory;
+ }
+ RefCountedBufferPtr current_frame;
+ const StatusCode status = obu->ParseOneFrame(¤t_frame);
+ if (status != kStatusOk) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+ return status;
+ }
+ current_frame = nullptr;
+    // We assume that the first frame that was parsed will contain the frame
+    // header. This assumption is usually true in practice, so we simply fall
+    // back to non-frame-parallel mode when it is not.
+ if (settings_.threads > 1 &&
+ !InitializeThreadPoolsForFrameParallel(
+ settings_.threads, obu->frame_header().tile_info.tile_count,
+ obu->frame_header().tile_info.tile_columns, &frame_thread_pool_,
+ &frame_scratch_buffer_pool_)) {
+ return kStatusOutOfMemory;
+ }
+ }
+ const int max_allowed_frames =
+ (frame_thread_pool_ != nullptr) ? frame_thread_pool_->num_threads() : 1;
+ assert(max_allowed_frames > 0);
+ if (!temporal_units_.Init(max_allowed_frames)) {
+ LIBGAV1_DLOG(ERROR, "temporal_units_.Init() failed.");
+ return kStatusOutOfMemory;
+ }
+ is_frame_parallel_ = frame_thread_pool_ != nullptr;
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ if (data == nullptr || size == 0) return kStatusInvalidArgument;
+ if (HasFailure()) return kStatusUnknownError;
+ if (!seen_first_frame_) {
+ seen_first_frame_ = true;
+ const StatusCode status =
+ InitializeFrameThreadPoolAndTemporalUnitQueue(data, size);
+ if (status != kStatusOk) {
+ return SignalFailure(status);
+ }
+ }
+ if (temporal_units_.Full()) {
+ return kStatusTryAgain;
+ }
+ if (is_frame_parallel_) {
+ return ParseAndSchedule(data, size, user_private_data, buffer_private_data);
+ }
+ TemporalUnit temporal_unit(data, size, user_private_data,
+ buffer_private_data);
+ temporal_units_.Push(std::move(temporal_unit));
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::SignalFailure(StatusCode status) {
+ if (status == kStatusOk || status == kStatusTryAgain) return status;
+ // Set the |failure_status_| first so that any pending jobs in
+ // |frame_thread_pool_| will exit right away when the thread pool is being
+ // released below.
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ failure_status_ = status;
+ }
+ // Make sure all waiting threads exit.
+ buffer_pool_.Abort();
+ frame_thread_pool_ = nullptr;
+ while (!temporal_units_.Empty()) {
+ if (settings_.release_input_buffer != nullptr) {
+ settings_.release_input_buffer(
+ settings_.callback_private_data,
+ temporal_units_.Front().buffer_private_data);
+ }
+ temporal_units_.Pop();
+ }
+ return status;
+}
+
+// DequeueFrame() follows this policy to avoid holding unnecessary frame
+// buffer references in output_frame_: output_frame_ must be null whenever
+// DequeueFrame() returns a status other than kStatusOk.
+StatusCode DecoderImpl::DequeueFrame(const DecoderBuffer** out_ptr) {
+ if (out_ptr == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Invalid argument: out_ptr == nullptr.");
+ return kStatusInvalidArgument;
+ }
+ // We assume a call to DequeueFrame() indicates that the caller is no longer
+ // using the previous output frame, so we can release it.
+ ReleaseOutputFrame();
+ if (temporal_units_.Empty()) {
+ // No input frames to decode.
+ *out_ptr = nullptr;
+ return kStatusNothingToDequeue;
+ }
+ TemporalUnit& temporal_unit = temporal_units_.Front();
+ if (!is_frame_parallel_) {
+ // If |output_frame_queue_| is not empty, then return the first frame from
+ // that queue.
+ if (!output_frame_queue_.Empty()) {
+ RefCountedBufferPtr frame = std::move(output_frame_queue_.Front());
+ output_frame_queue_.Pop();
+ buffer_.user_private_data = temporal_unit.user_private_data;
+ if (output_frame_queue_.Empty()) {
+ temporal_units_.Pop();
+ }
+ const StatusCode status = CopyFrameToOutputBuffer(frame);
+ if (status != kStatusOk) {
+ return status;
+ }
+ *out_ptr = &buffer_;
+ return kStatusOk;
+ }
+ // Decode the next available temporal unit and return.
+ const StatusCode status = DecodeTemporalUnit(temporal_unit, out_ptr);
+ if (status != kStatusOk) {
+      // In case of failure, discard all the output frames that we may be
+      // holding references to.
+ output_frame_queue_.Clear();
+ }
+ if (settings_.release_input_buffer != nullptr) {
+ settings_.release_input_buffer(settings_.callback_private_data,
+ temporal_unit.buffer_private_data);
+ }
+ if (output_frame_queue_.Empty()) {
+ temporal_units_.Pop();
+ }
+ return status;
+ }
+ {
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (settings_.blocking_dequeue) {
+ while (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+ decoded_condvar_.wait(lock);
+ }
+ } else {
+ if (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+ return kStatusTryAgain;
+ }
+ }
+ if (failure_status_ != kStatusOk) {
+ const StatusCode failure_status = failure_status_;
+ lock.unlock();
+ return SignalFailure(failure_status);
+ }
+ }
+ if (settings_.release_input_buffer != nullptr &&
+ !temporal_unit.released_input_buffer) {
+ temporal_unit.released_input_buffer = true;
+ settings_.release_input_buffer(settings_.callback_private_data,
+ temporal_unit.buffer_private_data);
+ }
+ if (temporal_unit.status != kStatusOk) {
+ temporal_units_.Pop();
+ return SignalFailure(temporal_unit.status);
+ }
+ if (!temporal_unit.has_displayable_frame) {
+ *out_ptr = nullptr;
+ temporal_units_.Pop();
+ return kStatusOk;
+ }
+ assert(temporal_unit.output_layer_count > 0);
+ StatusCode status = CopyFrameToOutputBuffer(
+ temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame);
+ temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame =
+ nullptr;
+ if (status != kStatusOk) {
+ temporal_units_.Pop();
+ return SignalFailure(status);
+ }
+ buffer_.user_private_data = temporal_unit.user_private_data;
+ *out_ptr = &buffer_;
+ if (--temporal_unit.output_layer_count == 0) {
+ temporal_units_.Pop();
+ }
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::ParseAndSchedule(const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ TemporalUnit temporal_unit(data, size, user_private_data,
+ buffer_private_data);
+ std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+ temporal_unit.data, temporal_unit.size, settings_.operating_point,
+ &buffer_pool_, &state_));
+ if (obu == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+ return kStatusOutOfMemory;
+ }
+ if (has_sequence_header_) {
+ obu->set_sequence_header(sequence_header_);
+ }
+ StatusCode status;
+ int position_in_temporal_unit = 0;
+ while (obu->HasData()) {
+ RefCountedBufferPtr current_frame;
+ status = obu->ParseOneFrame(¤t_frame);
+ if (status != kStatusOk) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+ return status;
+ }
+ if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) {
+ LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
+ return kStatusOutOfMemory;
+ }
+ if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) {
+ LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed.");
+ return kStatusOutOfMemory;
+ }
+ if (IsNewSequenceHeader(*obu)) {
+ const ObuSequenceHeader& sequence_header = obu->sequence_header();
+ const Libgav1ImageFormat image_format =
+ ComposeImageFormat(sequence_header.color_config.is_monochrome,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y);
+ const int max_bottom_border = GetBottomBorderPixels(
+ /*do_cdef=*/true, /*do_restoration=*/true,
+ /*do_superres=*/true, sequence_header.color_config.subsampling_y);
+ // TODO(vigneshv): This may not be the right place to call this callback
+ // for the frame parallel case. Investigate and fix it.
+ if (!buffer_pool_.OnFrameBufferSizeChanged(
+ sequence_header.color_config.bitdepth, image_format,
+ sequence_header.max_frame_width, sequence_header.max_frame_height,
+ kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
+ LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
+ return kStatusUnknownError;
+ }
+ }
+ // This can happen when there are multiple spatial/temporal layers and if
+ // all the layers are outside the current operating point.
+ if (current_frame == nullptr) {
+ continue;
+ }
+ // Note that we cannot set EncodedFrame.temporal_unit here. It will be set
+ // in the code below after |temporal_unit| is std::move'd into the
+ // |temporal_units_| queue.
+ if (!temporal_unit.frames.emplace_back(obu.get(), state_, current_frame,
+ position_in_temporal_unit++)) {
+ LIBGAV1_DLOG(ERROR, "temporal_unit.frames.emplace_back failed.");
+ return kStatusOutOfMemory;
+ }
+ state_.UpdateReferenceFrames(current_frame,
+ obu->frame_header().refresh_frame_flags);
+ }
+  // This function cannot fail after this point, so it is okay to move
+  // |temporal_unit| into the |temporal_units_| queue.
+ temporal_units_.Push(std::move(temporal_unit));
+ if (temporal_units_.Back().frames.empty()) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ temporal_units_.Back().has_displayable_frame = false;
+ temporal_units_.Back().decoded = true;
+ return kStatusOk;
+ }
+ for (auto& frame : temporal_units_.Back().frames) {
+ EncodedFrame* const encoded_frame = &frame;
+ encoded_frame->temporal_unit = &temporal_units_.Back();
+ frame_thread_pool_->Schedule([this, encoded_frame]() {
+ if (HasFailure()) return;
+ const StatusCode status = DecodeFrame(encoded_frame);
+ encoded_frame->state = {};
+ encoded_frame->frame = nullptr;
+ TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (failure_status_ != kStatusOk) return;
+      // temporal_unit's status defaults to kStatusOk, so we only need to set
+      // it on error. If |failure_status_| is not kStatusOk at this point,
+      // there has already been a failure, so we ignore this subsequent
+      // failure and simply return the error code of the first one.
+ if (status != kStatusOk) {
+ temporal_unit.status = status;
+ if (failure_status_ == kStatusOk) {
+ failure_status_ = status;
+ }
+ }
+ temporal_unit.decoded =
+ ++temporal_unit.decoded_count == temporal_unit.frames.size();
+ if (temporal_unit.decoded && settings_.output_all_layers &&
+ temporal_unit.output_layer_count > 1) {
+ std::sort(
+ temporal_unit.output_layers,
+ temporal_unit.output_layers + temporal_unit.output_layer_count);
+ }
+ if (temporal_unit.decoded || failure_status_ != kStatusOk) {
+ decoded_condvar_.notify_one();
+ }
+ });
+ }
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) {
+ const ObuSequenceHeader& sequence_header = encoded_frame->sequence_header;
+ const ObuFrameHeader& frame_header = encoded_frame->frame_header;
+ RefCountedBufferPtr current_frame = std::move(encoded_frame->frame);
+
+ std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+ frame_scratch_buffer_pool_.Get();
+ if (frame_scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+ return kStatusOutOfMemory;
+ }
+ // |frame_scratch_buffer| will be released when this local variable goes out
+  // of scope, i.e., on any return path in this function.
+ FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+ &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
+ StatusCode status;
+ if (!frame_header.show_existing_frame) {
+ if (encoded_frame->tile_buffers.empty()) {
+ // This means that the last call to ParseOneFrame() did not actually
+ // have any tile groups. This could happen in rare cases (for example,
+ // if there is a Metadata OBU after the TileGroup OBU). We currently do
+ // not have a reason to handle those cases, so we simply continue.
+ return kStatusOk;
+ }
+ status = DecodeTiles(sequence_header, frame_header,
+ encoded_frame->tile_buffers, encoded_frame->state,
+ frame_scratch_buffer.get(), current_frame.get());
+ if (status != kStatusOk) {
+ return status;
+ }
+ } else {
+ if (!current_frame->WaitUntilDecoded()) {
+ return kStatusUnknownError;
+ }
+ }
+ if (!frame_header.show_frame && !frame_header.show_existing_frame) {
+ // This frame is not displayable. Not an error.
+ return kStatusOk;
+ }
+ RefCountedBufferPtr film_grain_frame;
+ status = ApplyFilmGrain(
+ sequence_header, frame_header, current_frame, &film_grain_frame,
+ frame_scratch_buffer->threading_strategy.thread_pool());
+ if (status != kStatusOk) {
+ return status;
+ }
+
+ TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (temporal_unit.has_displayable_frame && !settings_.output_all_layers) {
+ assert(temporal_unit.output_frame_position >= 0);
+ // A displayable frame was already found in this temporal unit. This can
+ // happen if there are multiple spatial/temporal layers. Since
+ // |settings_.output_all_layers| is false, we will output only the last
+ // displayable frame.
+ if (temporal_unit.output_frame_position >
+ encoded_frame->position_in_temporal_unit) {
+ return kStatusOk;
+ }
+ // Replace any output frame that we may have seen before with the current
+ // frame.
+ assert(temporal_unit.output_layer_count == 1);
+ --temporal_unit.output_layer_count;
+ }
+ temporal_unit.has_displayable_frame = true;
+ temporal_unit.output_layers[temporal_unit.output_layer_count].frame =
+ std::move(film_grain_frame);
+ temporal_unit.output_layers[temporal_unit.output_layer_count]
+ .position_in_temporal_unit = encoded_frame->position_in_temporal_unit;
+ ++temporal_unit.output_layer_count;
+ temporal_unit.output_frame_position =
+ encoded_frame->position_in_temporal_unit;
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit,
+ const DecoderBuffer** out_ptr) {
+ std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+ temporal_unit.data, temporal_unit.size, settings_.operating_point,
+ &buffer_pool_, &state_));
+ if (obu == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+ return kStatusOutOfMemory;
+ }
+ if (has_sequence_header_) {
+ obu->set_sequence_header(sequence_header_);
+ }
+ StatusCode status;
+ std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+ frame_scratch_buffer_pool_.Get();
+ if (frame_scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+ return kStatusOutOfMemory;
+ }
+ // |frame_scratch_buffer| will be released when this local variable goes out
+  // of scope, i.e., on any return path in this function.
+ FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+ &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
+ while (obu->HasData()) {
+ RefCountedBufferPtr current_frame;
+ status = obu->ParseOneFrame(¤t_frame);
+ if (status != kStatusOk) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+ return status;
+ }
+ if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) {
+ LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
+ return kStatusOutOfMemory;
+ }
+ if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) {
+ LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed.");
+ return kStatusOutOfMemory;
+ }
+ if (IsNewSequenceHeader(*obu)) {
+ const ObuSequenceHeader& sequence_header = obu->sequence_header();
+ const Libgav1ImageFormat image_format =
+ ComposeImageFormat(sequence_header.color_config.is_monochrome,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y);
+ const int max_bottom_border = GetBottomBorderPixels(
+ /*do_cdef=*/true, /*do_restoration=*/true,
+ /*do_superres=*/true, sequence_header.color_config.subsampling_y);
+ if (!buffer_pool_.OnFrameBufferSizeChanged(
+ sequence_header.color_config.bitdepth, image_format,
+ sequence_header.max_frame_width, sequence_header.max_frame_height,
+ kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
+ LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
+ return kStatusUnknownError;
+ }
+ }
+ if (!obu->frame_header().show_existing_frame) {
+ if (obu->tile_buffers().empty()) {
+ // This means that the last call to ParseOneFrame() did not actually
+ // have any tile groups. This could happen in rare cases (for example,
+ // if there is a Metadata OBU after the TileGroup OBU). We currently do
+ // not have a reason to handle those cases, so we simply continue.
+ continue;
+ }
+ status = DecodeTiles(obu->sequence_header(), obu->frame_header(),
+ obu->tile_buffers(), state_,
+ frame_scratch_buffer.get(), current_frame.get());
+ if (status != kStatusOk) {
+ return status;
+ }
+ }
+ state_.UpdateReferenceFrames(current_frame,
+ obu->frame_header().refresh_frame_flags);
+ if (obu->frame_header().show_frame ||
+ obu->frame_header().show_existing_frame) {
+ if (!output_frame_queue_.Empty() && !settings_.output_all_layers) {
+ // There is more than one displayable frame in the current operating
+ // point and |settings_.output_all_layers| is false. In this case, we
+ // simply return the last displayable frame as the output frame and
+ // ignore the rest.
+ assert(output_frame_queue_.Size() == 1);
+ output_frame_queue_.Pop();
+ }
+ RefCountedBufferPtr film_grain_frame;
+ status = ApplyFilmGrain(
+ obu->sequence_header(), obu->frame_header(), current_frame,
+ &film_grain_frame,
+ frame_scratch_buffer->threading_strategy.film_grain_thread_pool());
+ if (status != kStatusOk) return status;
+ output_frame_queue_.Push(std::move(film_grain_frame));
+ }
+ }
+ if (output_frame_queue_.Empty()) {
+ // No displayable frame in the temporal unit. Not an error.
+ *out_ptr = nullptr;
+ return kStatusOk;
+ }
+ status = CopyFrameToOutputBuffer(output_frame_queue_.Front());
+ output_frame_queue_.Pop();
+ if (status != kStatusOk) {
+ return status;
+ }
+ buffer_.user_private_data = temporal_unit.user_private_data;
+ *out_ptr = &buffer_;
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::CopyFrameToOutputBuffer(
+ const RefCountedBufferPtr& frame) {
+ YuvBuffer* yuv_buffer = frame->buffer();
+
+ buffer_.chroma_sample_position = frame->chroma_sample_position();
+
+ if (yuv_buffer->is_monochrome()) {
+ buffer_.image_format = kImageFormatMonochrome400;
+ } else {
+ if (yuv_buffer->subsampling_x() == 0 && yuv_buffer->subsampling_y() == 0) {
+ buffer_.image_format = kImageFormatYuv444;
+ } else if (yuv_buffer->subsampling_x() == 1 &&
+ yuv_buffer->subsampling_y() == 0) {
+ buffer_.image_format = kImageFormatYuv422;
+ } else if (yuv_buffer->subsampling_x() == 1 &&
+ yuv_buffer->subsampling_y() == 1) {
+ buffer_.image_format = kImageFormatYuv420;
+ } else {
+      LIBGAV1_DLOG(ERROR,
+                   "Invalid chroma subsampling values: cannot determine "
+                   "buffer image format.");
+ return kStatusInvalidArgument;
+ }
+ }
+ buffer_.color_range = sequence_header_.color_config.color_range;
+ buffer_.color_primary = sequence_header_.color_config.color_primary;
+ buffer_.transfer_characteristics =
+ sequence_header_.color_config.transfer_characteristics;
+ buffer_.matrix_coefficients =
+ sequence_header_.color_config.matrix_coefficients;
+
+ buffer_.bitdepth = yuv_buffer->bitdepth();
+ const int num_planes =
+ yuv_buffer->is_monochrome() ? kMaxPlanesMonochrome : kMaxPlanes;
+ int plane = kPlaneY;
+ for (; plane < num_planes; ++plane) {
+ buffer_.stride[plane] = yuv_buffer->stride(plane);
+ buffer_.plane[plane] = yuv_buffer->data(plane);
+ buffer_.displayed_width[plane] = yuv_buffer->width(plane);
+ buffer_.displayed_height[plane] = yuv_buffer->height(plane);
+ }
+ for (; plane < kMaxPlanes; ++plane) {
+ buffer_.stride[plane] = 0;
+ buffer_.plane[plane] = nullptr;
+ buffer_.displayed_width[plane] = 0;
+ buffer_.displayed_height[plane] = 0;
+ }
+ buffer_.spatial_id = frame->spatial_id();
+ buffer_.temporal_id = frame->temporal_id();
+ buffer_.buffer_private_data = frame->buffer_private_data();
+ if (frame->hdr_cll_set()) {
+ buffer_.has_hdr_cll = 1;
+ buffer_.hdr_cll = frame->hdr_cll();
+ } else {
+ buffer_.has_hdr_cll = 0;
+ }
+ if (frame->hdr_mdcv_set()) {
+ buffer_.has_hdr_mdcv = 1;
+ buffer_.hdr_mdcv = frame->hdr_mdcv();
+ } else {
+ buffer_.has_hdr_mdcv = 0;
+ }
+ if (frame->itut_t35_set()) {
+ buffer_.has_itut_t35 = 1;
+ buffer_.itut_t35 = frame->itut_t35();
+ } else {
+ buffer_.has_itut_t35 = 0;
+ }
+ output_frame_ = frame;
+ return kStatusOk;
+}
+
+void DecoderImpl::ReleaseOutputFrame() {
+ for (auto& plane : buffer_.plane) {
+ plane = nullptr;
+ }
+ output_frame_ = nullptr;
+}
+
+StatusCode DecoderImpl::DecodeTiles(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header, const Vector<TileBuffer>& tile_buffers,
+ const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer,
+ RefCountedBuffer* const current_frame) {
+ frame_scratch_buffer->tile_scratch_buffer_pool.Reset(
+ sequence_header.color_config.bitdepth);
+ if (!frame_scratch_buffer->loop_restoration_info.Reset(
+ &frame_header.loop_restoration, frame_header.upscaled_width,
+ frame_header.height, sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.is_monochrome)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to allocate memory for loop restoration info units.");
+ return kStatusOutOfMemory;
+ }
+ ThreadingStrategy& threading_strategy =
+ frame_scratch_buffer->threading_strategy;
+ if (!is_frame_parallel_ &&
+ !threading_strategy.Reset(frame_header, settings_.threads)) {
+ return kStatusOutOfMemory;
+ }
+ const bool do_cdef =
+ PostFilter::DoCdef(frame_header, settings_.post_filter_mask);
+ const int num_planes = sequence_header.color_config.is_monochrome
+ ? kMaxPlanesMonochrome
+ : kMaxPlanes;
+ const bool do_restoration = PostFilter::DoRestoration(
+ frame_header.loop_restoration, settings_.post_filter_mask, num_planes);
+ const bool do_superres =
+ PostFilter::DoSuperRes(frame_header, settings_.post_filter_mask);
+ // Use kBorderPixels for the left, right, and top borders. Only the bottom
+ // border may need to be bigger. Cdef border is needed only if we apply Cdef
+ // without multithreading.
+ const int bottom_border = GetBottomBorderPixels(
+ do_cdef && threading_strategy.post_filter_thread_pool() == nullptr,
+ do_restoration, do_superres, sequence_header.color_config.subsampling_y);
+ current_frame->set_chroma_sample_position(
+ sequence_header.color_config.chroma_sample_position);
+ if (!current_frame->Realloc(sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ frame_header.upscaled_width, frame_header.height,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ /*left_border=*/kBorderPixels,
+ /*right_border=*/kBorderPixels,
+ /*top_border=*/kBorderPixels, bottom_border)) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate memory for the decoder buffer.");
+ return kStatusOutOfMemory;
+ }
+ if (frame_header.cdef.bits > 0) {
+ if (!frame_scratch_buffer->cdef_index.Reset(
+ DivideBy16(frame_header.rows4x4 + kMaxBlockHeight4x4),
+ DivideBy16(frame_header.columns4x4 + kMaxBlockWidth4x4),
+ /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate memory for cdef index.");
+ return kStatusOutOfMemory;
+ }
+ }
+ if (do_cdef) {
+ if (!frame_scratch_buffer->cdef_skip.Reset(
+ DivideBy2(frame_header.rows4x4 + kMaxBlockHeight4x4),
+ DivideBy16(frame_header.columns4x4 + kMaxBlockWidth4x4),
+ /*zero_initialize=*/true)) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate memory for cdef skip.");
+ return kStatusOutOfMemory;
+ }
+ }
+ if (!frame_scratch_buffer->inter_transform_sizes.Reset(
+ frame_header.rows4x4 + kMaxBlockHeight4x4,
+ frame_header.columns4x4 + kMaxBlockWidth4x4,
+ /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate memory for inter_transform_sizes.");
+ return kStatusOutOfMemory;
+ }
+ if (frame_header.use_ref_frame_mvs) {
+ if (!frame_scratch_buffer->motion_field.mv.Reset(
+ DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
+ /*zero_initialize=*/false) ||
+ !frame_scratch_buffer->motion_field.reference_offset.Reset(
+ DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
+ /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to allocate memory for temporal motion vectors.");
+ return kStatusOutOfMemory;
+ }
+
+    // For each motion vector, only mv[0] needs to be initialized to
+    // kInvalidMvValue; mv[1] does not need to be initialized and can be set
+    // to an arbitrary value. For simplicity, mv[1] is set to 0.
+    // The following initialization of contiguous memory is very fast. Do not
+    // make the initialization multi-threaded unless the memory initialized by
+    // each thread is still contiguous.
+ MotionVector invalid_mv;
+ invalid_mv.mv[0] = kInvalidMvValue;
+ invalid_mv.mv[1] = 0;
+ MotionVector* const motion_field_mv =
+ &frame_scratch_buffer->motion_field.mv[0][0];
+ std::fill(motion_field_mv,
+ motion_field_mv + frame_scratch_buffer->motion_field.mv.size(),
+ invalid_mv);
+ }
+
+ // The addition of kMaxBlockHeight4x4 and kMaxBlockWidth4x4 is necessary so
+ // that the block parameters cache can be filled in for the last row/column
+ // without having to check for boundary conditions.
+ if (!frame_scratch_buffer->block_parameters_holder.Reset(
+ frame_header.rows4x4 + kMaxBlockHeight4x4,
+ frame_header.columns4x4 + kMaxBlockWidth4x4)) {
+ return kStatusOutOfMemory;
+ }
+ const dsp::Dsp* const dsp =
+ dsp::GetDspTable(sequence_header.color_config.bitdepth);
+ if (dsp == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get the dsp table for bitdepth %d.",
+ sequence_header.color_config.bitdepth);
+ return kStatusInternalError;
+ }
+
+ const int tile_count = frame_header.tile_info.tile_count;
+ assert(tile_count >= 1);
+ Vector<std::unique_ptr<Tile>> tiles;
+ if (!tiles.reserve(tile_count)) {
+ LIBGAV1_DLOG(ERROR, "tiles.reserve(%d) failed.\n", tile_count);
+ return kStatusOutOfMemory;
+ }
+
+ if (threading_strategy.row_thread_pool(0) != nullptr || is_frame_parallel_) {
+ if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
+ frame_scratch_buffer->residual_buffer_pool.reset(
+ new (std::nothrow) ResidualBufferPool(
+ sequence_header.use_128x128_superblock,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
+ : sizeof(int32_t)));
+ if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate residual buffer.\n");
+ return kStatusOutOfMemory;
+ }
+ } else {
+ frame_scratch_buffer->residual_buffer_pool->Reset(
+ sequence_header.use_128x128_superblock,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
+ : sizeof(int32_t));
+ }
+ }
+
+ if (threading_strategy.post_filter_thread_pool() != nullptr && do_cdef) {
+ // We need to store 4 rows per 64x64 unit.
+ const int num_units =
+ MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4));
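+    // For example, a frame with rows4x4 == 100 spans ceil(100 / 16) == 7
+    // 64x64 unit rows, so num_units == 28 rows are stored.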
+    // subsampling_y is set to zero irrespective of the actual frame's
+    // subsampling since we need to store exactly |num_units| rows of the cdef
+    // border pixels.
+ if (!frame_scratch_buffer->cdef_border.Realloc(
+ sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ MultiplyBy4(frame_header.columns4x4), num_units,
+ sequence_header.color_config.subsampling_x,
+ /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+ kBorderPixels, nullptr, nullptr, nullptr)) {
+ return kStatusOutOfMemory;
+ }
+ }
+
+ if (do_restoration &&
+ (do_cdef || threading_strategy.post_filter_thread_pool() != nullptr)) {
+ // We need to store 4 rows per 64x64 unit.
+ const int num_units =
+ MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4));
+ // subsampling_y is set to zero irrespective of the actual frame's
+ // subsampling since we need to store exactly |num_units| rows of the loop
+ // restoration border pixels.
+ if (!frame_scratch_buffer->loop_restoration_border.Realloc(
+ sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ frame_header.upscaled_width, num_units,
+ sequence_header.color_config.subsampling_x,
+ /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+ kBorderPixels, nullptr, nullptr, nullptr)) {
+ return kStatusOutOfMemory;
+ }
+ }
+
+ if (do_superres) {
+ const int pixel_size = sequence_header.color_config.bitdepth == 8
+ ? sizeof(uint8_t)
+ : sizeof(uint16_t);
+ const int coefficients_size = kSuperResFilterTaps *
+ Align(frame_header.upscaled_width, 16) *
+ pixel_size;
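+    // The width is aligned to 16 pixels so that vectorized SuperRes
+    // implementations can load whole vectors; the MSAN memset below quiets
+    // reads of the otherwise uninitialized alignment padding.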
+ if (!frame_scratch_buffer->superres_coefficients[kPlaneTypeY].Resize(
+ coefficients_size)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to Resize superres_coefficients[kPlaneTypeY].");
+ return kStatusOutOfMemory;
+ }
+#if LIBGAV1_MSAN
+ // Quiet SuperRes_NEON() msan warnings.
+ memset(frame_scratch_buffer->superres_coefficients[kPlaneTypeY].get(), 0,
+ coefficients_size);
+#endif
+ const int uv_coefficients_size =
+ kSuperResFilterTaps *
+ Align(SubsampledValue(frame_header.upscaled_width, 1), 16) * pixel_size;
+ if (!sequence_header.color_config.is_monochrome &&
+ sequence_header.color_config.subsampling_x != 0 &&
+ !frame_scratch_buffer->superres_coefficients[kPlaneTypeUV].Resize(
+ uv_coefficients_size)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to Resize superres_coefficients[kPlaneTypeUV].");
+ return kStatusOutOfMemory;
+ }
+#if LIBGAV1_MSAN
+ if (!sequence_header.color_config.is_monochrome &&
+ sequence_header.color_config.subsampling_x != 0) {
+ // Quiet SuperRes_NEON() msan warnings.
+ memset(frame_scratch_buffer->superres_coefficients[kPlaneTypeUV].get(), 0,
+ uv_coefficients_size);
+ }
+#endif
+ }
+
+ if (do_superres && threading_strategy.post_filter_thread_pool() != nullptr) {
+ const int num_threads =
+ threading_strategy.post_filter_thread_pool()->num_threads() + 1;
+ // subsampling_y is set to zero irrespective of the actual frame's
+ // subsampling since we need to store exactly |num_threads| rows of the
+ // down-scaled pixels.
+ // Left and right borders are for line extension. They are doubled for the Y
+ // plane to make sure the U and V planes have enough space after possible
+ // subsampling.
+ if (!frame_scratch_buffer->superres_line_buffer.Realloc(
+ sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ MultiplyBy4(frame_header.columns4x4), num_threads,
+ sequence_header.color_config.subsampling_x,
+ /*subsampling_y=*/0, 2 * kSuperResHorizontalBorder,
+ 2 * (kSuperResHorizontalBorder + kSuperResHorizontalPadding), 0, 0,
+ nullptr, nullptr, nullptr)) {
+ LIBGAV1_DLOG(ERROR, "Failed to resize superres line buffer.\n");
+ return kStatusOutOfMemory;
+ }
+ }
+
+ if (is_frame_parallel_ && !IsIntraFrame(frame_header.frame_type)) {
+ // We can parse the current frame if all the reference frames have been
+ // parsed.
+ for (const int index : frame_header.reference_frame_index) {
+ if (!state.reference_frame[index]->WaitUntilParsed()) {
+ return kStatusUnknownError;
+ }
+ }
+ }
+
+ // If prev_segment_ids is a null pointer, it is treated as if it pointed to
+ // a segmentation map containing all 0s.
+ const SegmentationMap* prev_segment_ids = nullptr;
+ if (frame_header.primary_reference_frame == kPrimaryReferenceNone) {
+ frame_scratch_buffer->symbol_decoder_context.Initialize(
+ frame_header.quantizer.base_index);
+ } else {
+ const int index =
+ frame_header
+ .reference_frame_index[frame_header.primary_reference_frame];
+ assert(index != -1);
+ const RefCountedBuffer* prev_frame = state.reference_frame[index].get();
+ frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext();
+ if (frame_header.segmentation.enabled &&
+ prev_frame->columns4x4() == frame_header.columns4x4 &&
+ prev_frame->rows4x4() == frame_header.rows4x4) {
+ prev_segment_ids = prev_frame->segmentation_map();
+ }
+ }
+
+ // The Tile class must make use of a separate buffer to store the unfiltered
+ // pixels for the intra prediction of the next superblock row. This is done
+  // only when one of the following conditions is true:
+ // * is_frame_parallel_ is true.
+ // * settings_.threads == 1.
+ // In the non-frame-parallel multi-threaded case, we do not run the post
+ // filters in the decode loop. So this buffer need not be used.
+ const bool use_intra_prediction_buffer =
+ is_frame_parallel_ || settings_.threads == 1;
+ if (use_intra_prediction_buffer) {
+ if (!frame_scratch_buffer->intra_prediction_buffers.Resize(
+ frame_header.tile_info.tile_rows)) {
+ LIBGAV1_DLOG(ERROR, "Failed to Resize intra_prediction_buffers.");
+ return kStatusOutOfMemory;
+ }
+ IntraPredictionBuffer* const intra_prediction_buffers =
+ frame_scratch_buffer->intra_prediction_buffers.get();
+ for (int plane = kPlaneY; plane < num_planes; ++plane) {
+ const int subsampling =
+ (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x;
+ const size_t intra_prediction_buffer_size =
+ ((MultiplyBy4(frame_header.columns4x4) >> subsampling) *
+ (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
+ : sizeof(uint16_t)));
+ for (int tile_row = 0; tile_row < frame_header.tile_info.tile_rows;
+ ++tile_row) {
+ if (!intra_prediction_buffers[tile_row][plane].Resize(
+ intra_prediction_buffer_size)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to allocate intra prediction buffer for tile "
+ "row %d plane %d.\n",
+ tile_row, plane);
+ return kStatusOutOfMemory;
+ }
+ }
+ }
+ }
+
+ PostFilter post_filter(frame_header, sequence_header, frame_scratch_buffer,
+ current_frame->buffer(), dsp,
+ settings_.post_filter_mask);
+ SymbolDecoderContext saved_symbol_decoder_context;
+ BlockingCounterWithStatus pending_tiles(tile_count);
+ for (int tile_number = 0; tile_number < tile_count; ++tile_number) {
+ std::unique_ptr<Tile> tile = Tile::Create(
+ tile_number, tile_buffers[tile_number].data,
+ tile_buffers[tile_number].size, sequence_header, frame_header,
+ current_frame, state, frame_scratch_buffer, wedge_masks_,
+ quantizer_matrix_, &saved_symbol_decoder_context, prev_segment_ids,
+ &post_filter, dsp, threading_strategy.row_thread_pool(tile_number),
+ &pending_tiles, is_frame_parallel_, use_intra_prediction_buffer);
+ if (tile == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to create tile.");
+ return kStatusOutOfMemory;
+ }
+ tiles.push_back_unchecked(std::move(tile));
+ }
+ assert(tiles.size() == static_cast<size_t>(tile_count));
+ if (is_frame_parallel_) {
+ if (frame_scratch_buffer->threading_strategy.thread_pool() == nullptr) {
+ return DecodeTilesFrameParallel(
+ sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+ prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
+ }
+ return DecodeTilesThreadedFrameParallel(
+ sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+ prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
+ }
+ StatusCode status;
+ if (settings_.threads == 1) {
+ status = DecodeTilesNonFrameParallel(sequence_header, frame_header, tiles,
+ frame_scratch_buffer, &post_filter);
+ } else {
+ status = DecodeTilesThreadedNonFrameParallel(tiles, frame_scratch_buffer,
+ &post_filter, &pending_tiles);
+ }
+ if (status != kStatusOk) return status;
+ if (frame_header.enable_frame_end_update_cdf) {
+ frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+ }
+ current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+ SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::ApplyFilmGrain(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const RefCountedBufferPtr& displayable_frame,
+ RefCountedBufferPtr* film_grain_frame, ThreadPool* thread_pool) {
+ if (!sequence_header.film_grain_params_present ||
+ !displayable_frame->film_grain_params().apply_grain ||
+ (settings_.post_filter_mask & 0x10) == 0) {
+ *film_grain_frame = displayable_frame;
+ return kStatusOk;
+ }
+ if (!frame_header.show_existing_frame &&
+ frame_header.refresh_frame_flags == 0) {
+ // If show_existing_frame is true, then the current frame is a previously
+ // saved reference frame. If refresh_frame_flags is nonzero, then the
+ // state_.UpdateReferenceFrames() call above has saved the current frame as
+ // a reference frame. Therefore, if both of these conditions are false, then
+ // the current frame is not saved as a reference frame. displayable_frame
+ // should hold the only reference to the current frame.
+ assert(displayable_frame.use_count() == 1);
+ // Add film grain noise in place.
+ *film_grain_frame = displayable_frame;
+ } else {
+ *film_grain_frame = buffer_pool_.GetFreeBuffer();
+ if (*film_grain_frame == nullptr) {
+ LIBGAV1_DLOG(ERROR,
+ "Could not get film_grain_frame from the buffer pool.");
+ return kStatusResourceExhausted;
+ }
+ if (!(*film_grain_frame)
+ ->Realloc(displayable_frame->buffer()->bitdepth(),
+ displayable_frame->buffer()->is_monochrome(),
+ displayable_frame->upscaled_width(),
+ displayable_frame->frame_height(),
+ displayable_frame->buffer()->subsampling_x(),
+ displayable_frame->buffer()->subsampling_y(),
+ kBorderPixelsFilmGrain, kBorderPixelsFilmGrain,
+ kBorderPixelsFilmGrain, kBorderPixelsFilmGrain)) {
+ LIBGAV1_DLOG(ERROR, "film_grain_frame->Realloc() failed.");
+ return kStatusOutOfMemory;
+ }
+ (*film_grain_frame)
+ ->set_chroma_sample_position(
+ displayable_frame->chroma_sample_position());
+ (*film_grain_frame)->set_spatial_id(displayable_frame->spatial_id());
+ (*film_grain_frame)->set_temporal_id(displayable_frame->temporal_id());
+ }
+ const bool color_matrix_is_identity =
+ sequence_header.color_config.matrix_coefficients ==
+ kMatrixCoefficientsIdentity;
+ assert(displayable_frame->buffer()->stride(kPlaneU) ==
+ displayable_frame->buffer()->stride(kPlaneV));
+ const int input_stride_uv = displayable_frame->buffer()->stride(kPlaneU);
+ assert((*film_grain_frame)->buffer()->stride(kPlaneU) ==
+ (*film_grain_frame)->buffer()->stride(kPlaneV));
+ const int output_stride_uv = (*film_grain_frame)->buffer()->stride(kPlaneU);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (displayable_frame->buffer()->bitdepth() == 10) {
+ FilmGrain<10> film_grain(displayable_frame->film_grain_params(),
+ displayable_frame->buffer()->is_monochrome(),
+ color_matrix_is_identity,
+ displayable_frame->buffer()->subsampling_x(),
+ displayable_frame->buffer()->subsampling_y(),
+ displayable_frame->upscaled_width(),
+ displayable_frame->frame_height(), thread_pool);
+ if (!film_grain.AddNoise(
+ displayable_frame->buffer()->data(kPlaneY),
+ displayable_frame->buffer()->stride(kPlaneY),
+ displayable_frame->buffer()->data(kPlaneU),
+ displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+ (*film_grain_frame)->buffer()->data(kPlaneY),
+ (*film_grain_frame)->buffer()->stride(kPlaneY),
+ (*film_grain_frame)->buffer()->data(kPlaneU),
+ (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+ LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+ }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+ if (displayable_frame->buffer()->bitdepth() == 12) {
+ FilmGrain<12> film_grain(displayable_frame->film_grain_params(),
+ displayable_frame->buffer()->is_monochrome(),
+ color_matrix_is_identity,
+ displayable_frame->buffer()->subsampling_x(),
+ displayable_frame->buffer()->subsampling_y(),
+ displayable_frame->upscaled_width(),
+ displayable_frame->frame_height(), thread_pool);
+ if (!film_grain.AddNoise(
+ displayable_frame->buffer()->data(kPlaneY),
+ displayable_frame->buffer()->stride(kPlaneY),
+ displayable_frame->buffer()->data(kPlaneU),
+ displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+ (*film_grain_frame)->buffer()->data(kPlaneY),
+ (*film_grain_frame)->buffer()->stride(kPlaneY),
+ (*film_grain_frame)->buffer()->data(kPlaneU),
+ (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+ LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+ }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+ FilmGrain<8> film_grain(displayable_frame->film_grain_params(),
+ displayable_frame->buffer()->is_monochrome(),
+ color_matrix_is_identity,
+ displayable_frame->buffer()->subsampling_x(),
+ displayable_frame->buffer()->subsampling_y(),
+ displayable_frame->upscaled_width(),
+ displayable_frame->frame_height(), thread_pool);
+ if (!film_grain.AddNoise(
+ displayable_frame->buffer()->data(kPlaneY),
+ displayable_frame->buffer()->stride(kPlaneY),
+ displayable_frame->buffer()->data(kPlaneU),
+ displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+ (*film_grain_frame)->buffer()->data(kPlaneY),
+ (*film_grain_frame)->buffer()->stride(kPlaneY),
+ (*film_grain_frame)->buffer()->data(kPlaneU),
+ (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+ LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+}
+
+bool DecoderImpl::IsNewSequenceHeader(const ObuParser& obu) {
+ if (std::find_if(obu.obu_headers().begin(), obu.obu_headers().end(),
+ [](const ObuHeader& obu_header) {
+ return obu_header.type == kObuSequenceHeader;
+ }) == obu.obu_headers().end()) {
+ return false;
+ }
+ const ObuSequenceHeader sequence_header = obu.sequence_header();
+ const bool sequence_header_changed =
+ !has_sequence_header_ ||
+ sequence_header_.color_config.bitdepth !=
+ sequence_header.color_config.bitdepth ||
+ sequence_header_.color_config.is_monochrome !=
+ sequence_header.color_config.is_monochrome ||
+ sequence_header_.color_config.subsampling_x !=
+ sequence_header.color_config.subsampling_x ||
+ sequence_header_.color_config.subsampling_y !=
+ sequence_header.color_config.subsampling_y ||
+ sequence_header_.max_frame_width != sequence_header.max_frame_width ||
+ sequence_header_.max_frame_height != sequence_header.max_frame_height;
+ sequence_header_ = sequence_header;
+ has_sequence_header_ = true;
+ return sequence_header_changed;
+}
+
+bool DecoderImpl::MaybeInitializeWedgeMasks(FrameType frame_type) {
+ if (IsIntraFrame(frame_type) || wedge_masks_initialized_) {
+ return true;
+ }
+ if (!GenerateWedgeMask(&wedge_masks_)) {
+ return false;
+ }
+ wedge_masks_initialized_ = true;
+ return true;
+}
+
+bool DecoderImpl::MaybeInitializeQuantizerMatrix(
+ const ObuFrameHeader& frame_header) {
+ if (quantizer_matrix_initialized_ || !frame_header.quantizer.use_matrix) {
+ return true;
+ }
+ if (!InitializeQuantizerMatrix(&quantizer_matrix_)) {
+ return false;
+ }
+ quantizer_matrix_initialized_ = true;
+ return true;
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_IMPL_H_
+#define LIBGAV1_SRC_DECODER_IMPL_H_
+
+#include <array>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/constants.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/decoder_settings.h"
+#include "src/gav1/status_code.h"
+#include "src/obu_parser.h"
+#include "src/quantizer.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/queue.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+struct TemporalUnit;
+
+struct EncodedFrame {
+ EncodedFrame(ObuParser* const obu, const DecoderState& state,
+ const RefCountedBufferPtr& frame, int position_in_temporal_unit)
+ : sequence_header(obu->sequence_header()),
+ frame_header(obu->frame_header()),
+ state(state),
+ temporal_unit(nullptr),
+ frame(frame),
+ position_in_temporal_unit(position_in_temporal_unit) {
+ obu->MoveTileBuffers(&tile_buffers);
+ frame->MarkFrameAsStarted();
+ }
+
+ const ObuSequenceHeader sequence_header;
+ const ObuFrameHeader frame_header;
+ Vector<TileBuffer> tile_buffers;
+ DecoderState state;
+ TemporalUnit* temporal_unit;
+ RefCountedBufferPtr frame;
+ const int position_in_temporal_unit;
+};
+
+struct TemporalUnit : public Allocable {
+ // The default constructor is invoked by the Queue<TemporalUnit>::Init()
+ // method. Queue<> does not use the default-constructed elements, so it is
+ // safe for the default constructor to not initialize the members.
+ TemporalUnit() = default;
+ TemporalUnit(const uint8_t* data, size_t size, int64_t user_private_data,
+ void* buffer_private_data)
+ : data(data),
+ size(size),
+ user_private_data(user_private_data),
+ buffer_private_data(buffer_private_data),
+ decoded(false),
+ status(kStatusOk),
+ has_displayable_frame(false),
+ output_frame_position(-1),
+ decoded_count(0),
+ output_layer_count(0),
+ released_input_buffer(false) {}
+
+ const uint8_t* data;
+ size_t size;
+ int64_t user_private_data;
+ void* buffer_private_data;
+
+ // The following members are used only in frame parallel mode.
+ bool decoded;
+ StatusCode status;
+ bool has_displayable_frame;
+ int output_frame_position;
+
+ Vector<EncodedFrame> frames;
+ size_t decoded_count;
+
+ // The struct (and the counter) is used to support output of multiple layers
+ // within a single temporal unit. The decoding process will store the output
+ // frames in |output_layers| in the order they are finished decoding. At the
+ // end of the decoding process, this array will be sorted in reverse order of
+ // |position_in_temporal_unit|. DequeueFrame() will then return the frames in
+ // reverse order (so that the entire process can run with a single counter
+ // variable).
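+  // For example, if the layers at positions 0 and 1 are both displayable,
+  // the sorted array is {position 1, position 0}, and the decrementing
+  // counter returns the frame at position 0 first, followed by the frame at
+  // position 1.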
+ struct OutputLayer {
+ // Used by std::sort to sort |output_layers| in reverse order of
+ // |position_in_temporal_unit|.
+ bool operator<(const OutputLayer& rhs) const {
+ return position_in_temporal_unit > rhs.position_in_temporal_unit;
+ }
+
+ RefCountedBufferPtr frame;
+ int position_in_temporal_unit = 0;
+ } output_layers[kMaxLayers];
+ // Number of entries in |output_layers|.
+ int output_layer_count;
+ // Flag to ensure that we release the input buffer only once if there are
+ // multiple output layers.
+ bool released_input_buffer;
+};
+
+class DecoderImpl : public Allocable {
+ public:
+ // The constructor saves a const reference to |*settings|. Therefore
+ // |*settings| must outlive the DecoderImpl object. On success, |*output|
+ // contains a pointer to the newly-created DecoderImpl object. On failure,
+ // |*output| is not modified.
+ static StatusCode Create(const DecoderSettings* settings,
+ std::unique_ptr<DecoderImpl>* output);
+ ~DecoderImpl();
+ StatusCode EnqueueFrame(const uint8_t* data, size_t size,
+ int64_t user_private_data, void* buffer_private_data);
+ StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
+ static constexpr int GetMaxBitdepth() {
+ static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10 ||
+ LIBGAV1_MAX_BITDEPTH == 12,
+ "LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12.");
+ return LIBGAV1_MAX_BITDEPTH;
+ }
+
+ private:
+ explicit DecoderImpl(const DecoderSettings* settings);
+ StatusCode Init();
+ // Called when the first frame is enqueued. It does the OBU parsing for one
+ // temporal unit to retrieve the tile configuration and sets up the frame
+ // threading if frame parallel mode is allowed. It also initializes the
+ // |temporal_units_| queue based on the number of frame threads.
+ //
+ // The following are the limitations of the current implementation:
+ // * It assumes that all frames in the video have the same tile
+ // configuration. The frame parallel threading model will not be updated
+ // based on tile configuration changes mid-stream.
+  // * The above assumption holds true even when there is a new coded video
+  //   sequence (i.e., a new sequence header).
+ StatusCode InitializeFrameThreadPoolAndTemporalUnitQueue(const uint8_t* data,
+ size_t size);
+ // Used only in frame parallel mode. Signals failure and waits until the
+ // worker threads are aborted if |status| is a failure status. If |status| is
+ // equal to kStatusOk or kStatusTryAgain, this function does not do anything.
+ // Always returns the input parameter |status| as the return value.
+ //
+ // This function is called only from the application thread (from
+ // EnqueueFrame() and DequeueFrame()).
+ StatusCode SignalFailure(StatusCode status);
+
+ void ReleaseOutputFrame();
+
+ // Decodes all the frames contained in the given temporal unit. Used only in
+ // non frame parallel mode.
+ StatusCode DecodeTemporalUnit(const TemporalUnit& temporal_unit,
+ const DecoderBuffer** out_ptr);
+ // Used only in frame parallel mode. Does the OBU parsing for |data| and
+ // schedules the individual frames for decoding in the |frame_thread_pool_|.
+ StatusCode ParseAndSchedule(const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data);
+ // Decodes the |encoded_frame| and updates the
+ // |encoded_frame->temporal_unit|'s parameters if the decoded frame is a
+ // displayable frame. Used only in frame parallel mode.
+ StatusCode DecodeFrame(EncodedFrame* encoded_frame);
+
+ // Populates |buffer_| with values from |frame|. Adds a reference to |frame|
+ // in |output_frame_|.
+ StatusCode CopyFrameToOutputBuffer(const RefCountedBufferPtr& frame);
+ StatusCode DecodeTiles(const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<TileBuffer>& tile_buffers,
+ const DecoderState& state,
+ FrameScratchBuffer* frame_scratch_buffer,
+ RefCountedBuffer* current_frame);
+  // Applies film grain synthesis to |displayable_frame| and stores the
+  // film-grain-applied frame in |film_grain_frame|. Returns kStatusOk on
+  // success.
+ StatusCode ApplyFilmGrain(const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const RefCountedBufferPtr& displayable_frame,
+ RefCountedBufferPtr* film_grain_frame,
+ ThreadPool* thread_pool);
+
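+  // Returns true if |obu| contains a sequence header OBU and it is either the
+  // first sequence header seen or its decode-relevant parameters (bitdepth,
+  // monochrome flag, subsampling, maximum frame dimensions) differ from the
+  // previous one. Also stores the new sequence header in |sequence_header_|.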
+ bool IsNewSequenceHeader(const ObuParser& obu);
+
+ bool HasFailure() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ return failure_status_ != kStatusOk;
+ }
+
+ // Initializes the |quantizer_matrix_| if necessary and sets
+ // |quantizer_matrix_initialized_| to true.
+ bool MaybeInitializeQuantizerMatrix(const ObuFrameHeader& frame_header);
+
+ // Allocates and generates the |wedge_masks_| if necessary and sets
+ // |wedge_masks_initialized_| to true.
+ bool MaybeInitializeWedgeMasks(FrameType frame_type);
+
+ // Elements in this queue cannot be moved with std::move since the
+ // |EncodedFrame.temporal_unit| stores a pointer to elements in this queue.
+ Queue<TemporalUnit> temporal_units_;
+ DecoderState state_;
+
+ DecoderBuffer buffer_ = {};
+ // |output_frame_| holds a reference to the output frame on behalf of
+ // |buffer_|.
+ RefCountedBufferPtr output_frame_;
+
+ // Queue of output frames that are to be returned in the DequeueFrame() calls.
+ // If |settings_.output_all_layers| is false, this queue will never contain
+ // more than 1 element. This queue is used only when |is_frame_parallel_| is
+ // false.
+ Queue<RefCountedBufferPtr> output_frame_queue_;
+
+ BufferPool buffer_pool_;
+ WedgeMaskArray wedge_masks_;
+ bool wedge_masks_initialized_ = false;
+ QuantizerMatrix quantizer_matrix_;
+ bool quantizer_matrix_initialized_ = false;
+ FrameScratchBufferPool frame_scratch_buffer_pool_;
+
+  // Used to synchronize accesses to |temporal_units_| in order to update the
+  // "decoded" state of a temporal unit.
+ std::mutex mutex_;
+ std::condition_variable decoded_condvar_;
+ bool is_frame_parallel_;
+ std::unique_ptr<ThreadPool> frame_thread_pool_;
+
+ // In frame parallel mode, there are two primary points of failure:
+ // 1) ParseAndSchedule()
+ // 2) DecodeTiles()
+ // Both of these functions have to respond to the other one failing by
+ // aborting whatever they are doing. This variable is used to accomplish that.
+ // If |failure_status_| is not kStatusOk, then the two functions will try to
+ // abort as early as they can.
+  StatusCode failure_status_ LIBGAV1_GUARDED_BY(mutex_) = kStatusOk;
+
+ ObuSequenceHeader sequence_header_ = {};
+  // If true, |sequence_header_| is valid.
+ bool has_sequence_header_ = false;
+
+ const DecoderSettings& settings_;
+ bool seen_first_frame_ = false;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DECODER_IMPL_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder_settings.h"
+
+extern "C" {
+
+void Libgav1DecoderSettingsInitDefault(Libgav1DecoderSettings* settings) {
+ settings->threads = 1;
+ settings->frame_parallel = 0; // false
+ settings->blocking_dequeue = 0; // false
+ settings->on_frame_buffer_size_changed = nullptr;
+ settings->get_frame_buffer = nullptr;
+ settings->release_frame_buffer = nullptr;
+ settings->release_input_buffer = nullptr;
+ settings->callback_private_data = nullptr;
+ settings->output_all_layers = 0; // false
+ settings->operating_point = 0;
+ settings->post_filter_mask = 0x1f;
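+  // 0x1f enables all five post filters. From the LSB, the mask bits control
+  // the deblocking filter, Cdef, SuperRes, loop restoration and film grain;
+  // film grain (bit 4, 0x10) is the bit checked in
+  // DecoderImpl::ApplyFilmGrain().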
+}
+
+} // extern "C"
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_STATE_H_
+#define LIBGAV1_SRC_DECODER_STATE_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/buffer_pool.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+struct DecoderState {
+ // Section 7.20. Updates frames in the reference_frame array with
+ // |current_frame|, based on the |refresh_frame_flags| bitmask.
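+  // For example, refresh_frame_flags == 0x05 (0b101) updates the entries at
+  // ref_index 0 and 2 and leaves all other reference frames untouched.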
+ void UpdateReferenceFrames(const RefCountedBufferPtr& current_frame,
+ int refresh_frame_flags) {
+ for (int ref_index = 0, mask = refresh_frame_flags; mask != 0;
+ ++ref_index, mask >>= 1) {
+ if ((mask & 1) != 0) {
+ reference_frame_id[ref_index] = current_frame_id;
+ reference_frame[ref_index] = current_frame;
+ reference_order_hint[ref_index] = order_hint;
+ }
+ }
+ }
+
+ // Clears all the reference frames.
+ void ClearReferenceFrames() {
+ reference_frame_id = {};
+ reference_order_hint = {};
+ for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) {
+ reference_frame[ref_index] = nullptr;
+ }
+ }
+
+ // reference_frame_id and current_frame_id have meaningful values and are used
+ // in checks only if sequence_header_.frame_id_numbers_present is true. If
+ // sequence_header_.frame_id_numbers_present is false, reference_frame_id and
+ // current_frame_id are assigned the default value 0 and are not used in
+ // checks.
+ std::array<uint16_t, kNumReferenceFrameTypes> reference_frame_id = {};
+ // A valid value of current_frame_id is an unsigned integer of at most 16
+ // bits. -1 indicates current_frame_id is not initialized.
+ int current_frame_id = -1;
+ // The RefOrderHint array variable in the spec.
+ std::array<uint8_t, kNumReferenceFrameTypes> reference_order_hint = {};
+ // The OrderHint variable in the spec. Its value comes from either the
+ // order_hint syntax element in the uncompressed header (if
+ // show_existing_frame is false) or RefOrderHint[ frame_to_show_map_idx ]
+ // (if show_existing_frame is true and frame_type is KEY_FRAME). See Section
+ // 5.9.2 and Section 7.4.
+ //
+ // NOTE: When show_existing_frame is false, it is often more convenient to
+ // just use the order_hint field of the frame header as OrderHint. So this
+ // field is mainly used to update the reference_order_hint array in
+ // UpdateReferenceFrames().
+ uint8_t order_hint = 0;
+ // reference_frame_sign_bias[i] (a boolean) specifies the intended direction
+ // of the motion vector in time for each reference frame.
+ // * |false| indicates that the reference frame is a forwards reference (i.e.
+ // the reference frame is expected to be output before the current frame);
+ // * |true| indicates that the reference frame is a backwards reference.
+ // Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not used.
+ std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {};
+ // The RefValid[i] variable in the spec does not need to be stored explicitly.
+ // If the RefValid[i] variable in the spec is 0, then reference_frame[i] is a
+ // null pointer. (Whenever the spec sets the RefValid[i] variable to 0, we set
+ // reference_frame[i] to a null pointer.) If the RefValid[i] variable in the
+ // spec is 1, then reference_frame[i] contains a frame buffer pointer.
+ std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DECODER_STATE_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "gtest/gtest.h"
+#include "src/decoder_test_data.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr uint8_t kFrame1[] = {OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER,
+ OBU_FRAME_1};
+
+constexpr uint8_t kFrame2[] = {OBU_TEMPORAL_DELIMITER, OBU_FRAME_2};
+
+constexpr uint8_t kFrame1WithHdrCllAndHdrMdcv[] = {
+ OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER, OBU_METADATA_HDR_CLL,
+ OBU_METADATA_HDR_MDCV, OBU_FRAME_1};
+
+constexpr uint8_t kFrame2WithItutT35[] = {OBU_TEMPORAL_DELIMITER,
+ OBU_METADATA_ITUT_T35, OBU_FRAME_2};
+
+class DecoderTest : public testing::Test {
+ public:
+ void SetUp() override;
+ void IncrementFramesInUse() { ++frames_in_use_; }
+ void DecrementFramesInUse() { --frames_in_use_; }
+ void SetBufferPrivateData(void* buffer_private_data) {
+ buffer_private_data_ = buffer_private_data;
+ }
+ void SetReleasedInputBuffer(void* released_input_buffer) {
+ released_input_buffer_ = released_input_buffer;
+ }
+
+ protected:
+ std::unique_ptr<Decoder> decoder_;
+ int frames_in_use_ = 0;
+ void* buffer_private_data_ = nullptr;
+ void* released_input_buffer_ = nullptr;
+};
+
+struct FrameBufferPrivate {
+ uint8_t* data[3];
+};
+
+extern "C" {
+
+static Libgav1StatusCode GetFrameBuffer(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+ Libgav1FrameBufferInfo info;
+ Libgav1StatusCode status = Libgav1ComputeFrameBufferInfo(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment, &info);
+ if (status != kLibgav1StatusOk) return status;
+
+ std::unique_ptr<FrameBufferPrivate> buffer_private(new (std::nothrow)
+ FrameBufferPrivate);
+ if (buffer_private == nullptr) return kLibgav1StatusOutOfMemory;
+
+ for (int i = 0; i < 3; ++i) {
+ const size_t size = (i == 0) ? info.y_buffer_size : info.uv_buffer_size;
+ buffer_private->data[i] = new (std::nothrow) uint8_t[size];
+ if (buffer_private->data[i] == nullptr) {
+ return kLibgav1StatusOutOfMemory;
+ }
+ }
+
+ uint8_t* const y_buffer = buffer_private->data[0];
+ uint8_t* const u_buffer =
+ (info.uv_buffer_size != 0) ? buffer_private->data[1] : nullptr;
+ uint8_t* const v_buffer =
+ (info.uv_buffer_size != 0) ? buffer_private->data[2] : nullptr;
+
+ status = Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer,
+ buffer_private.release(), frame_buffer);
+ if (status != kLibgav1StatusOk) return status;
+
+ auto* const decoder_test = static_cast<DecoderTest*>(callback_private_data);
+ decoder_test->IncrementFramesInUse();
+ decoder_test->SetBufferPrivateData(frame_buffer->private_data);
+ return kLibgav1StatusOk;
+}
+
+static void ReleaseFrameBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* buffer_private = static_cast<FrameBufferPrivate*>(buffer_private_data);
+ for (auto& data : buffer_private->data) {
+ delete[] data;
+ }
+ delete buffer_private;
+ auto* const decoder_test = static_cast<DecoderTest*>(callback_private_data);
+ decoder_test->DecrementFramesInUse();
+}
+
+static void ReleaseInputBuffer(void* private_data, void* input_buffer) {
+ auto* const decoder_test = static_cast<DecoderTest*>(private_data);
+ decoder_test->SetReleasedInputBuffer(input_buffer);
+}
+
+} // extern "C"
+
+void DecoderTest::SetUp() {
+ decoder_.reset(new (std::nothrow) Decoder());
+ ASSERT_NE(decoder_, nullptr);
+ DecoderSettings settings = {};
+ settings.frame_parallel = false;
+ settings.get_frame_buffer = GetFrameBuffer;
+ settings.release_frame_buffer = ReleaseFrameBuffer;
+ settings.callback_private_data = this;
+ settings.release_input_buffer = ReleaseInputBuffer;
+ ASSERT_EQ(decoder_->Init(&settings), kStatusOk);
+}
+
+TEST_F(DecoderTest, APIFlowForNonFrameParallelMode) {
+ StatusCode status;
+ const DecoderBuffer* buffer;
+
+ // Enqueue frame1 for decoding.
+ status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+ const_cast<uint8_t*>(kFrame1));
+ ASSERT_EQ(status, kStatusOk);
+
+ // In non-frame-parallel mode, decoding happens only in the DequeueFrame call.
+ // So there should be no frames in use yet.
+ EXPECT_EQ(frames_in_use_, 0);
+
+ // Dequeue the output of frame1.
+ status = decoder_->DequeueFrame(&buffer);
+ ASSERT_EQ(status, kStatusOk);
+ ASSERT_NE(buffer, nullptr);
+ EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+ // libgav1 has decoded frame1 and is holding a reference to it.
+ EXPECT_EQ(frames_in_use_, 1);
+ EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+ // Enqueue frame2 for decoding.
+ status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+ const_cast<uint8_t*>(kFrame2));
+ ASSERT_EQ(status, kStatusOk);
+
+ EXPECT_EQ(frames_in_use_, 1);
+
+ // Dequeue the output of frame2.
+ status = decoder_->DequeueFrame(&buffer);
+ ASSERT_EQ(status, kStatusOk);
+ ASSERT_NE(buffer, nullptr);
+ EXPECT_EQ(released_input_buffer_, &kFrame2);
+
+ EXPECT_EQ(frames_in_use_, 2);
+ EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+ // Signal end of stream (method 1). This should ensure that all the references
+ // are released.
+ status = decoder_->SignalEOS();
+ EXPECT_EQ(status, kStatusOk);
+
+ // libgav1 should have released all the reference frames now.
+ EXPECT_EQ(frames_in_use_, 0);
+
+ // Now, the decoder is ready to accept a new coded video sequence.
+
+ // Enqueue frame1 for decoding.
+ status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+ const_cast<uint8_t*>(kFrame1));
+ ASSERT_EQ(status, kStatusOk);
+
+ EXPECT_EQ(frames_in_use_, 0);
+
+ // Dequeue the output of frame1.
+ status = decoder_->DequeueFrame(&buffer);
+ ASSERT_EQ(status, kStatusOk);
+ ASSERT_NE(buffer, nullptr);
+ EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+ EXPECT_EQ(frames_in_use_, 1);
+ EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+ // Enqueue frame2 for decoding.
+ status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+ const_cast<uint8_t*>(kFrame2));
+ ASSERT_EQ(status, kStatusOk);
+
+ EXPECT_EQ(frames_in_use_, 1);
+
+ // Dequeue the output of frame2.
+ status = decoder_->DequeueFrame(&buffer);
+ ASSERT_EQ(status, kStatusOk);
+ ASSERT_NE(buffer, nullptr);
+ EXPECT_EQ(released_input_buffer_, &kFrame2);
+
+ EXPECT_EQ(frames_in_use_, 2);
+ EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+ // Signal end of stream (method 2). This should ensure that all the references
+ // are released.
+ decoder_ = nullptr;
+
+ // libgav1 should have released all the frames now.
+ EXPECT_EQ(frames_in_use_, 0);
+}
+
+TEST_F(DecoderTest, NonFrameParallelModeEnqueueMultipleFramesWithoutDequeuing) {
+ StatusCode status;
+ const DecoderBuffer* buffer;
+
+ // Enqueue frame1 for decoding.
+ status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+ const_cast<uint8_t*>(kFrame1));
+ ASSERT_EQ(status, kStatusOk);
+
+ // Until the output of frame1 is dequeued, no other frames can be enqueued.
+ status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+ const_cast<uint8_t*>(kFrame2));
+ ASSERT_EQ(status, kStatusTryAgain);
+
+ EXPECT_EQ(frames_in_use_, 0);
+
+ // Dequeue the output of frame1.
+ status = decoder_->DequeueFrame(&buffer);
+ ASSERT_EQ(status, kStatusOk);
+ ASSERT_NE(buffer, nullptr);
+ EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+ EXPECT_EQ(frames_in_use_, 1);
+
+ // Delete the decoder instance.
+ decoder_ = nullptr;
+
+ EXPECT_EQ(frames_in_use_, 0);
+}
+
+TEST_F(DecoderTest, NonFrameParallelModeEOSBeforeDequeuingLastFrame) {
+ StatusCode status;
+ const DecoderBuffer* buffer;
+
+ // Enqueue frame1 for decoding.
+ status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+ const_cast<uint8_t*>(kFrame1));
+ ASSERT_EQ(status, kStatusOk);
+
+ EXPECT_EQ(frames_in_use_, 0);
+
+ // Dequeue the output of frame1.
+ status = decoder_->DequeueFrame(&buffer);
+ ASSERT_EQ(status, kStatusOk);
+ ASSERT_NE(buffer, nullptr);
+ EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+ // Enqueue frame2 for decoding.
+ status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+ const_cast<uint8_t*>(kFrame2));
+ ASSERT_EQ(status, kStatusOk);
+
+ EXPECT_EQ(frames_in_use_, 1);
+
+ // Signal end of stream before dequeuing the output of frame2.
+ status = decoder_->SignalEOS();
+ ASSERT_EQ(status, kStatusOk);
+
+  // In this case, the output of the last frame that was enqueued is lost
+  // (which is intentional since end of stream was signaled without dequeuing
+  // it).
+ EXPECT_EQ(frames_in_use_, 0);
+}
+
+TEST_F(DecoderTest, NonFrameParallelModeInvalidFrameAfterEOS) {
+ StatusCode status;
+ const DecoderBuffer* buffer = nullptr;
+
+ // Enqueue frame1 for decoding.
+ status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+ const_cast<uint8_t*>(kFrame1));
+ ASSERT_EQ(status, kStatusOk);
+
+ EXPECT_EQ(frames_in_use_, 0);
+
+ // Dequeue the output of frame1.
+ status = decoder_->DequeueFrame(&buffer);
+ ASSERT_EQ(status, kStatusOk);
+ ASSERT_NE(buffer, nullptr);
+ EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+ EXPECT_EQ(frames_in_use_, 1);
+
+ // Signal end of stream.
+ status = decoder_->SignalEOS();
+ EXPECT_EQ(status, kStatusOk);
+
+ // libgav1 should have released all the reference frames now.
+ EXPECT_EQ(frames_in_use_, 0);
+
+  // Now, the decoder is ready to accept a new coded video sequence. But we
+  // try to enqueue a frame that does not have a sequence header (which is not
+  // allowed).
+
+ // Enqueue frame2 for decoding.
+ status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+ const_cast<uint8_t*>(kFrame2));
+ ASSERT_EQ(status, kStatusOk);
+
+ EXPECT_EQ(frames_in_use_, 0);
+
+ // Dequeue the output of frame2 (this will fail since no sequence header has
+ // been seen since the last EOS signal).
+ status = decoder_->DequeueFrame(&buffer);
+ ASSERT_EQ(status, kStatusBitstreamError);
+ EXPECT_EQ(released_input_buffer_, &kFrame2);
+
+ EXPECT_EQ(frames_in_use_, 0);
+}
+
+TEST_F(DecoderTest, MetadataObu) {
+ StatusCode status;
+ const DecoderBuffer* buffer;
+
+ // Enqueue frame1 for decoding.
+ status = decoder_->EnqueueFrame(
+ kFrame1WithHdrCllAndHdrMdcv, sizeof(kFrame1WithHdrCllAndHdrMdcv), 0,
+ const_cast<uint8_t*>(kFrame1WithHdrCllAndHdrMdcv));
+ ASSERT_EQ(status, kStatusOk);
+
+ // Dequeue the output of frame1.
+ status = decoder_->DequeueFrame(&buffer);
+ ASSERT_EQ(status, kStatusOk);
+ ASSERT_NE(buffer, nullptr);
+ EXPECT_EQ(buffer->has_hdr_cll, 1);
+ EXPECT_EQ(buffer->has_hdr_mdcv, 1);
+ EXPECT_EQ(buffer->has_itut_t35, 0);
+ EXPECT_EQ(released_input_buffer_, &kFrame1WithHdrCllAndHdrMdcv);
+
+ // libgav1 has decoded frame1 and is holding a reference to it.
+ EXPECT_EQ(frames_in_use_, 1);
+ EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+ // Enqueue frame2 for decoding.
+ status =
+ decoder_->EnqueueFrame(kFrame2WithItutT35, sizeof(kFrame2WithItutT35), 0,
+ const_cast<uint8_t*>(kFrame2WithItutT35));
+ ASSERT_EQ(status, kStatusOk);
+
+ EXPECT_EQ(frames_in_use_, 1);
+
+ // Dequeue the output of frame2.
+ status = decoder_->DequeueFrame(&buffer);
+ ASSERT_EQ(status, kStatusOk);
+ ASSERT_NE(buffer, nullptr);
+ EXPECT_EQ(buffer->has_hdr_cll, 0);
+ EXPECT_EQ(buffer->has_hdr_mdcv, 0);
+ EXPECT_EQ(buffer->has_itut_t35, 1);
+ EXPECT_NE(buffer->itut_t35.payload_bytes, nullptr);
+ EXPECT_GT(buffer->itut_t35.payload_size, 0);
+ EXPECT_EQ(released_input_buffer_, &kFrame2WithItutT35);
+
+ EXPECT_EQ(frames_in_use_, 2);
+ EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+ status = decoder_->SignalEOS();
+ EXPECT_EQ(status, kStatusOk);
+ EXPECT_EQ(frames_in_use_, 0);
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2022 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_TEST_DATA_H_
+#define LIBGAV1_SRC_DECODER_TEST_DATA_H_
+
+// The bytes for these two frames come from the libaom test vector
+// av1-1-b8-01-size-32x32.ivf
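+//
+// In each macro, the first byte is the OBU header: (obu_type << 3) | 0x2,
+// i.e. the 4-bit OBU type with the obu_has_size_field bit set. For example,
+// 0x12 is a temporal delimiter (type 2) and 0x2a is a metadata OBU (type 5).
+// The byte(s) following the header hold the LEB128-encoded payload size.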
+#define OBU_TEMPORAL_DELIMITER 0x12, 0x0
+#define OBU_SEQUENCE_HEADER \
+ 0xa, 0xa, 0x0, 0x0, 0x0, 0x2, 0x27, 0xfe, 0xff, 0xfc, 0xc0, 0x20
+#define OBU_FRAME_1 \
+ 0x32, 0x93, 0x2, 0x10, 0x0, 0xa8, 0x80, 0x0, 0x3, 0x0, 0x10, 0x10, 0x30, \
+ 0x0, 0xd3, 0xc6, 0xc6, 0x82, 0xaa, 0x5e, 0xbf, 0x82, 0xf2, 0xa4, 0xa4, \
+ 0x29, 0xab, 0xda, 0xd7, 0x1, 0x5, 0x0, 0xb3, 0xde, 0xa8, 0x6f, 0x8d, \
+ 0xbf, 0x1b, 0xa8, 0x25, 0xc3, 0x84, 0x7c, 0x1a, 0x2b, 0x8b, 0x0, 0xff, \
+ 0x19, 0x1f, 0x45, 0x7e, 0xe0, 0xbe, 0xe1, 0x3a, 0x63, 0xc2, 0xc6, 0x6e, \
+ 0xf4, 0xc8, 0xce, 0x11, 0xe1, 0x9f, 0x48, 0x64, 0x72, 0xeb, 0xbb, 0x4f, \
+ 0xf3, 0x94, 0xb4, 0xb6, 0x9d, 0x4f, 0x4, 0x18, 0x5e, 0x5e, 0x1b, 0x65, \
+ 0x49, 0x74, 0x90, 0x13, 0x50, 0xef, 0x8c, 0xb8, 0xe8, 0xd9, 0x8e, 0x9c, \
+ 0xc9, 0x4d, 0xda, 0x60, 0x6a, 0xa, 0xf9, 0x75, 0xd0, 0x62, 0x69, 0xd, \
+ 0xf5, 0xdc, 0xa9, 0xb9, 0x4c, 0x8, 0x9e, 0x33, 0x15, 0xa3, 0xe1, 0x42, \
+ 0x0, 0xe2, 0xb0, 0x46, 0xd0, 0xf7, 0xad, 0x55, 0xbc, 0x75, 0xe9, 0xe3, \
+ 0x1f, 0xa3, 0x41, 0x11, 0xba, 0xaa, 0x81, 0xf3, 0xcb, 0x82, 0x87, 0x71, \
+ 0x0, 0xe6, 0xb9, 0x8c, 0xe1, 0xe9, 0xd3, 0x21, 0xcc, 0xcd, 0xe7, 0x12, \
+ 0xb9, 0xe, 0x43, 0x6a, 0xa3, 0x76, 0x5c, 0x35, 0x90, 0x45, 0x36, 0x52, \
+ 0xb4, 0x2d, 0xa3, 0x55, 0xde, 0x20, 0xf8, 0x80, 0xe1, 0x26, 0x46, 0x1b, \
+ 0x3f, 0x59, 0xc7, 0x2e, 0x5b, 0x4a, 0x73, 0xf8, 0xb3, 0xf4, 0x62, 0xf4, \
+ 0xf5, 0xa4, 0xc2, 0xae, 0x9e, 0xa6, 0x9c, 0x10, 0xbb, 0xe1, 0xd6, 0x88, \
+ 0x75, 0xb9, 0x85, 0x48, 0xe5, 0x7, 0x12, 0xf3, 0x11, 0x85, 0x8e, 0xa2, \
+ 0x95, 0x9d, 0xed, 0x50, 0xfb, 0x6, 0x5a, 0x1, 0x37, 0xc4, 0x8e, 0x9e, \
+ 0x73, 0x9b, 0x96, 0x64, 0xbd, 0x42, 0xb, 0x80, 0xde, 0x57, 0x86, 0xcb, \
+ 0x7d, 0xab, 0x12, 0xb2, 0xcc, 0xe6, 0xea, 0xb5, 0x89, 0xeb, 0x91, 0xb3, \
+ 0x93, 0xb2, 0x4f, 0x2f, 0x5b, 0xf3, 0x72, 0x12, 0x51, 0x56, 0x75, 0xb3, \
+ 0xdd, 0x49, 0xb6, 0x5b, 0x77, 0xbe, 0xc5, 0xd7, 0xd4, 0xaf, 0xd6, 0x6b, \
+ 0x38
+#define OBU_FRAME_2 \
+ 0x32, 0x33, 0x30, 0x3, 0xc3, 0x0, 0xa7, 0x2e, 0x46, 0xa8, 0x80, 0x0, 0x3, \
+ 0x0, 0x10, 0x1, 0x0, 0xa0, 0x0, 0xed, 0xb1, 0x51, 0x15, 0x58, 0xc7, \
+ 0x69, 0x3, 0x26, 0x35, 0xeb, 0x5a, 0x2d, 0x7a, 0x53, 0x24, 0x26, 0x20, \
+ 0xa6, 0x11, 0x7, 0x49, 0x76, 0xa3, 0xc7, 0x62, 0xf8, 0x3, 0x32, 0xb0, \
+ 0x98, 0x17, 0x3d, 0x80
+#define OBU_METADATA_HDR_CLL 0x2a, 0x06, 0x01, 0x27, 0x10, 0x0d, 0xdf, 0x80
+#define OBU_METADATA_HDR_MDCV \
+ 0x2a, 0x1a, 0x02, 0xae, 0x14, 0x51, 0xec, 0x43, 0xd7, 0xb0, 0xa4, 0x26, \
+ 0x66, 0x0f, 0x5c, 0x50, 0x0d, 0x54, 0x39, 0x00, 0x0f, 0xa0, 0x00, 0x00, \
+ 0x00, 0x00, 0x52, 0x80
+#define OBU_METADATA_ITUT_T35 \
+ 0x2a, 0xf, 0x04, 0xa6, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, \
+ 0x00, 0x80, 0x00, 0x00
+
+#endif // LIBGAV1_SRC_DECODER_TEST_DATA_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kInterPostRoundBit =
+ kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
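+// Blends eight pixels from each compound prediction. The average (>> 1) and
+// the kInterPostRoundBit descaling are folded into one saturating rounded
+// narrowing shift:
+//   dst = Clip255((pred_0 + pred_1 + rounding) >> (kInterPostRoundBit + 1))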
+inline uint8x8_t AverageBlend8Row(
+ const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1) {
+ const int16x8_t pred0 = vld1q_s16(prediction_0);
+ const int16x8_t pred1 = vld1q_s16(prediction_1);
+ const int16x8_t res = vaddq_s16(pred0, pred1);
+ return vqrshrun_n_s16(res, kInterPostRoundBit + 1);
+}
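+
+// A scalar sketch of what AverageBlend8Row() computes per output pixel (the
+// vector code above is authoritative):
+//   out = Clip3(RightShiftWithRounding(pred_0[x] + pred_1[x],
+//                                      kInterPostRoundBit + 1),
+//               0, 255);
+// vqrshrun_n_s16() performs the rounding right shift and the saturation to
+// [0, 255] in a single instruction.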
+
+inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ const int width,
+ uint8_t* LIBGAV1_RESTRICT dest) {
+ int x = width;
+ do {
+ const int16x8_t pred_00 = vld1q_s16(prediction_0);
+ const int16x8_t pred_01 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
+ const int16x8_t res0 = vaddq_s16(pred_00, pred_01);
+ const uint8x8_t res_out0 = vqrshrun_n_s16(res0, kInterPostRoundBit + 1);
+ const int16x8_t pred_10 = vld1q_s16(prediction_0);
+ const int16x8_t pred_11 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
+ const int16x8_t res1 = vaddq_s16(pred_10, pred_11);
+ const uint8x8_t res_out1 = vqrshrun_n_s16(res1, kInterPostRoundBit + 1);
+ vst1q_u8(dest, vcombine_u8(res_out0, res_out1));
+ dest += 16;
+ x -= 16;
+ } while (x != 0);
+}
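+
+// Note: the loop above assumes |width| is a multiple of 16. This holds for
+// the block widths that reach this path (16, 32, 64, 128), since the 4 and
+// 8 cases are handled separately in AverageBlend_NEON().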
+
+void AverageBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const int width, const int height,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = height;
+
+ if (width == 4) {
+ do {
+ const uint8x8_t result = AverageBlend8Row(pred_0, pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+
+ StoreLo4(dst, result);
+ dst += dest_stride;
+ StoreHi4(dst, result);
+ dst += dest_stride;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ if (width == 8) {
+ do {
+ vst1_u8(dst, AverageBlend8Row(pred_0, pred_1));
+ dst += dest_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ vst1_u8(dst, AverageBlend8Row(pred_0, pred_1));
+ dst += dest_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ do {
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->average_blend = AverageBlend_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x8_t AverageBlend8Row(
+ const uint16_t* LIBGAV1_RESTRICT prediction_0,
+ const uint16_t* LIBGAV1_RESTRICT prediction_1,
+ const int32x4_t compound_offset, const uint16x8_t v_bitdepth) {
+ const uint16x8_t pred0 = vld1q_u16(prediction_0);
+ const uint16x8_t pred1 = vld1q_u16(prediction_1);
+ const uint32x4_t pred_lo =
+ vaddl_u16(vget_low_u16(pred0), vget_low_u16(pred1));
+ const uint32x4_t pred_hi =
+ vaddl_u16(vget_high_u16(pred0), vget_high_u16(pred1));
+ const int32x4_t offset_lo =
+ vsubq_s32(vreinterpretq_s32_u32(pred_lo), compound_offset);
+ const int32x4_t offset_hi =
+ vsubq_s32(vreinterpretq_s32_u32(pred_hi), compound_offset);
+ const uint16x4_t res_lo = vqrshrun_n_s32(offset_lo, kInterPostRoundBit + 1);
+ const uint16x4_t res_hi = vqrshrun_n_s32(offset_hi, kInterPostRoundBit + 1);
+ return vminq_u16(vcombine_u16(res_lo, res_hi), v_bitdepth);
+}
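+
+// A scalar sketch of the 10-bit blend above (the vector code is
+// authoritative). Each compound prediction carries a bias of
+// kCompoundOffset, so the sum carries twice that bias, which is removed
+// before the rounding shift:
+//   out = Clip3(RightShiftWithRounding(
+//                   pred_0[x] + pred_1[x] - 2 * kCompoundOffset,
+//                   kInterPostRoundBit + 1),
+//               0, (1 << kBitdepth10) - 1);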
+
+inline void AverageBlendLargeRow(const uint16_t* LIBGAV1_RESTRICT prediction_0,
+ const uint16_t* LIBGAV1_RESTRICT prediction_1,
+ const int width,
+ uint16_t* LIBGAV1_RESTRICT dest,
+ const int32x4_t compound_offset,
+ const uint16x8_t v_bitdepth) {
+ int x = width;
+ do {
+ vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+ compound_offset, v_bitdepth));
+ prediction_0 += 8;
+ prediction_1 += 8;
+ dest += 8;
+
+ vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+ compound_offset, v_bitdepth));
+ prediction_0 += 8;
+ prediction_1 += 8;
+ dest += 8;
+
+ x -= 16;
+ } while (x != 0);
+}
+
+void AverageBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const int width, const int height,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y = height;
+
+ const ptrdiff_t dst_stride = dest_stride >> 1;
+ const int32x4_t compound_offset =
+ vdupq_n_s32(static_cast<int32_t>(kCompoundOffset + kCompoundOffset));
+ const uint16x8_t v_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+ if (width == 4) {
+ do {
+ const uint16x8_t result =
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth);
+ pred_0 += 8;
+ pred_1 += 8;
+
+ vst1_u16(dst, vget_low_u16(result));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(result));
+ dst += dst_stride;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ if (width == 8) {
+ do {
+ vst1q_u16(dst,
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+ dst += dst_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ vst1q_u16(dst,
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+ dst += dst_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ do {
+ AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+ v_bitdepth);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+ v_bitdepth);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->average_blend = AverageBlend_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void AverageBlendInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void AverageBlendInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_AverageBlend LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+// for (int j = 0; j < 8; ++j) {
+// const int x = 1;
+// partial[0][i + j] += x;
+// partial[1][i + j / 2] += x;
+// partial[2][i] += x;
+// partial[3][3 + i - j / 2] += x;
+// partial[4][7 + i - j] += x;
+// partial[5][3 - i / 2 + j] += x;
+// partial[6][j] += x;
+// partial[7][i / 2 + j] += x;
+// }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(uint8x8_t* v_src,
+ uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ const uint8x8_t v_zero = vdup_n_u8(0);
+ // 00 01 02 03 04 05 06 07
+ // 00 10 11 12 13 14 15 16
+ *partial_lo = vaddl_u8(v_src[0], vext_u8(v_zero, v_src[1], 7));
+
+ // 00 00 20 21 22 23 24 25
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[2], 6));
+ // 17 00 00 00 00 00 00 00
+ // 26 27 00 00 00 00 00 00
+ *partial_hi =
+ vaddl_u8(vext_u8(v_src[1], v_zero, 7), vext_u8(v_src[2], v_zero, 6));
+
+ // 00 00 00 30 31 32 33 34
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[3], 5));
+ // 35 36 37 00 00 00 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[3], v_zero, 5));
+
+ // 00 00 00 00 40 41 42 43
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[4], 4));
+ // 44 45 46 47 00 00 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[4], v_zero, 4));
+
+ // 00 00 00 00 00 50 51 52
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[5], 3));
+ // 53 54 55 56 57 00 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[5], v_zero, 3));
+
+ // 00 00 00 00 00 00 60 61
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[6], 2));
+ // 62 63 64 65 66 67 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[6], v_zero, 2));
+
+ // 00 00 00 00 00 00 00 70
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[7], 1));
+ // 71 72 73 74 75 76 77 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[7], v_zero, 1));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(uint8x8_t* v_src,
+ uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ uint8x16_t v_d1_temp[8];
+ const uint8x8_t v_zero = vdup_n_u8(0);
+ const uint8x16_t v_zero_16 = vdupq_n_u8(0);
+
+ for (int i = 0; i < 8; ++i) {
+ v_d1_temp[i] = vcombine_u8(v_src[i], v_zero);
+ }
+
+ *partial_lo = *partial_hi = vdupq_n_u16(0);
+ // A0 A1 A2 A3 00 00 00 00
+ *partial_lo = vpadalq_u8(*partial_lo, v_d1_temp[0]);
+
+ // 00 B0 B1 B2 B3 00 00 00
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[1], 14));
+
+ // 00 00 C0 C1 C2 C3 00 00
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[2], 12));
+ // 00 00 00 D0 D1 D2 D3 00
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[3], 10));
+ // 00 00 00 00 E0 E1 E2 E3
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[4], 8));
+
+ // 00 00 00 00 00 F0 F1 F2
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[5], 6));
+ // F3 00 00 00 00 00 00 00
+ *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[5], v_zero_16, 6));
+
+ // 00 00 00 00 00 00 G0 G1
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[6], 4));
+ // G2 G3 00 00 00 00 00 00
+ *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[6], v_zero_16, 4));
+
+ // 00 00 00 00 00 00 00 H0
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[7], 2));
+ // H1 H2 H3 00 00 00 00 00
+ *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[7], v_zero_16, 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(uint8x8_t* v_src,
+ uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ const uint16x8_t v_zero = vdupq_n_u16(0);
+ uint16x8_t v_pair_add[4];
+ // Add vertical source pairs.
+ v_pair_add[0] = vaddl_u8(v_src[0], v_src[1]);
+ v_pair_add[1] = vaddl_u8(v_src[2], v_src[3]);
+ v_pair_add[2] = vaddl_u8(v_src[4], v_src[5]);
+ v_pair_add[3] = vaddl_u8(v_src[6], v_src[7]);
+
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ *partial_lo = v_pair_add[0];
+ // 00 00 00 00 00 00 00 00
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = vdupq_n_u16(0);
+
+ // 00 20 21 22 23 24 25 26
+ // 00 30 31 32 33 34 35 36
+ *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[1], 7));
+ // 27 00 00 00 00 00 00 00
+ // 37 00 00 00 00 00 00 00
+ *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[1], v_zero, 7));
+
+ // 00 00 40 41 42 43 44 45
+ // 00 00 50 51 52 53 54 55
+ *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[2], 6));
+ // 46 47 00 00 00 00 00 00
+ // 56 57 00 00 00 00 00 00
+ *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[2], v_zero, 6));
+
+ // 00 00 00 60 61 62 63 64
+ // 00 00 00 70 71 72 73 74
+ *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[3], 5));
+ // 65 66 67 00 00 00 00 00
+ // 75 76 77 00 00 00 00 00
+ *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[3], v_zero, 5));
+}
+
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void AddPartial(const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride, uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ const auto* src = static_cast<const uint8_t*>(source);
+
+ // 8x8 input
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ uint8x8_t v_src[8];
+ if (bitdepth == kBitdepth8) {
+ for (auto& v : v_src) {
+ v = vld1_u8(src);
+ src += stride;
+ }
+ } else {
+ // src_shift = bitdepth - 8.
+ constexpr int src_shift = (bitdepth == kBitdepth10) ? 2 : 4;
+ for (auto& v : v_src) {
+ v = vshrn_n_u16(vld1q_u16(reinterpret_cast<const uint16_t*>(src)),
+ src_shift);
+ src += stride;
+ }
+ }
+ // partial for direction 2
+ // --------------------------------------------------------------------------
+ // partial[2][i] += x;
+ // 00 10 20 30 40 50 60 70 00 00 00 00 00 00 00 00
+ // 01 11 21 31 41 51 61 71 00 00 00 00 00 00 00 00
+ // 02 12 22 32 42 52 62 72 00 00 00 00 00 00 00 00
+ // 03 13 23 33 43 53 63 73 00 00 00 00 00 00 00 00
+ // 04 14 24 34 44 54 64 74 00 00 00 00 00 00 00 00
+ // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
+ // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
+ // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), vdupq_n_u16(0), 0);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[4]), partial_lo[2], 4);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[5]), partial_lo[2], 5);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[6]), partial_lo[2], 6);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[7]), partial_lo[2], 7);
+
+ // partial for direction 6
+ // --------------------------------------------------------------------------
+ // partial[6][j] += x;
+ // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 00
+ // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 00
+ // 20 21 22 23 24 25 26 27 00 00 00 00 00 00 00 00
+ // 30 31 32 33 34 35 36 37 00 00 00 00 00 00 00 00
+ // 40 41 42 43 44 45 46 47 00 00 00 00 00 00 00 00
+ // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
+ // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
+ // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
+ partial_lo[6] = vaddl_u8(v_src[0], v_src[1]);
+ for (int i = 2; i < 8; ++i) {
+ partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]);
+ }
+
+ // partial for direction 0
+ AddPartial_D0_D4(v_src, &partial_lo[0], &partial_hi[0]);
+
+ // partial for direction 1
+ AddPartial_D1_D3(v_src, &partial_lo[1], &partial_hi[1]);
+
+ // partial for direction 7
+ AddPartial_D5_D7(v_src, &partial_lo[7], &partial_hi[7]);
+
+ uint8x8_t v_src_reverse[8];
+ for (int i = 0; i < 8; ++i) {
+ v_src_reverse[i] = vrev64_u8(v_src[i]);
+ }
+
+ // partial for direction 4
+ AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
+
+ // partial for direction 3
+ AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
+
+ // partial for direction 5
+ AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
+}
+
+uint32x4_t Square(uint16x4_t a) { return vmull_u16(a, a); }
+
+uint32x4_t SquareAccumulate(uint32x4_t a, uint16x4_t b) {
+ return vmlal_u16(a, b, b);
+}
+
+// |cost[0]| and |cost[4]| square the input and sum each element with the
+// corresponding element from the other end of the vector, weighted by the
+// matching |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+// kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
+// Because everything is summed into a single value, the distributive
+// property allows us to mirror the division table and accumulate once.
+uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b,
+ const uint32x4_t division_table[4]) {
+ uint32x4_t c = vmulq_u32(Square(vget_low_u16(a)), division_table[0]);
+ c = vmlaq_u32(c, Square(vget_high_u16(a)), division_table[1]);
+ c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[2]);
+ c = vmlaq_u32(c, Square(vget_high_u16(b)), division_table[3]);
+ return SumVector(c);
+}
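+
+// A sketch of the mirrored-table layout assumed above: since the weight
+// kCdefDivisionTable[i + 1] applies to both element i and element 14 - i,
+// the 16-entry SIMD table is laid out forward for |a| (partial elements
+// 0-7) and mirrored for |b| (elements 8-15). The 16th lane of |b| is always
+// zero, so its weight is irrelevant.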
+
+// |cost[2]| and |cost[6]| square the input and accumulate:
+// cost[2] += Square(partial[2][i])
+uint32_t SquareAccumulate(const uint16x8_t a) {
+ uint32x4_t c = Square(vget_low_u16(a));
+ c = SquareAccumulate(c, vget_high_u16(a));
+ c = vmulq_n_u32(c, kCdefDivisionTable[7]);
+ return SumVector(c);
+}
+
+uint32_t CostOdd(const uint16x8_t a, const uint16x8_t b, const uint32x4_t mask,
+ const uint32x4_t division_table[2]) {
+ // Remove elements 0-2.
+ uint32x4_t c = vandq_u32(mask, Square(vget_low_u16(a)));
+ c = vaddq_u32(c, Square(vget_high_u16(a)));
+ c = vmulq_n_u32(c, kCdefDivisionTable[7]);
+
+ c = vmlaq_u32(c, Square(vget_low_u16(a)), division_table[0]);
+ c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[1]);
+ return SumVector(c);
+}
+
+template <int bitdepth>
+void CdefDirection_NEON(const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride,
+ uint8_t* LIBGAV1_RESTRICT const direction,
+ int* LIBGAV1_RESTRICT const variance) {
+ assert(direction != nullptr);
+ assert(variance != nullptr);
+ const auto* src = static_cast<const uint8_t*>(source);
+
+ uint32_t cost[8];
+ uint16x8_t partial_lo[8], partial_hi[8];
+
+ AddPartial<bitdepth>(src, stride, partial_lo, partial_hi);
+
+ cost[2] = SquareAccumulate(partial_lo[2]);
+ cost[6] = SquareAccumulate(partial_lo[6]);
+
+ const uint32x4_t division_table[4] = {
+ vld1q_u32(kCdefDivisionTable), vld1q_u32(kCdefDivisionTable + 4),
+ vld1q_u32(kCdefDivisionTable + 8), vld1q_u32(kCdefDivisionTable + 12)};
+
+ cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
+ cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
+
+ const uint32x4_t division_table_odd[2] = {
+ vld1q_u32(kCdefDivisionTableOdd), vld1q_u32(kCdefDivisionTableOdd + 4)};
+
+ const uint32x4_t element_3_mask = {0, 0, 0, static_cast<uint32_t>(-1)};
+
+ cost[1] =
+ CostOdd(partial_lo[1], partial_hi[1], element_3_mask, division_table_odd);
+ cost[3] =
+ CostOdd(partial_lo[3], partial_hi[3], element_3_mask, division_table_odd);
+ cost[5] =
+ CostOdd(partial_lo[5], partial_hi[5], element_3_mask, division_table_odd);
+ cost[7] =
+ CostOdd(partial_lo[7], partial_hi[7], element_3_mask, division_table_odd);
+
+ uint32_t best_cost = 0;
+ *direction = 0;
+ for (int i = 0; i < 8; ++i) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ *direction = i;
+ }
+ }
+ *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
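+
+// |variance| is the cost contrast between the winning direction and the one
+// orthogonal to it ((*direction + 4) & 7), scaled down by 2^10; the caller
+// can use it to attenuate the primary filter strength on low-contrast
+// blocks.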
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+ const ptrdiff_t stride, uint16x8_t* output,
+ const int direction) {
+ // Each |direction| describes a different set of source values. Expand this
+ // set by negating each offset. For |direction| == 0 this gives a diagonal
+ // line from top right to bottom left. The first value is y, the second x.
+ // Negative y values move up.
+ //    a       b         c       d
+ // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+ //         c
+ //       a
+ //     0
+ //   b
+ // d
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = vld1q_u16(src + y_0 * stride + x_0);
+ output[1] = vld1q_u16(src - y_0 * stride - x_0);
+ output[2] = vld1q_u16(src + y_1 * stride + x_1);
+ output[3] = vld1q_u16(src - y_1 * stride - x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+ const ptrdiff_t stride, uint16x8_t* output,
+ const int direction) {
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = vcombine_u16(vld1_u16(src + y_0 * stride + x_0),
+ vld1_u16(src + y_0 * stride + stride + x_0));
+ output[1] = vcombine_u16(vld1_u16(src - y_0 * stride - x_0),
+ vld1_u16(src - y_0 * stride + stride - x_0));
+ output[2] = vcombine_u16(vld1_u16(src + y_1 * stride + x_1),
+ vld1_u16(src + y_1 * stride + stride + x_1));
+ output[3] = vcombine_u16(vld1_u16(src - y_1 * stride - x_1),
+ vld1_u16(src - y_1 * stride + stride - x_1));
+}
+
+int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
+ const uint16x8_t threshold, const int16x8_t damping) {
+ // If reference > pixel, the difference will be negative, so convert to 0 or
+ // -1.
+ const uint16x8_t sign = vcgtq_u16(reference, pixel);
+ const uint16x8_t abs_diff = vabdq_u16(pixel, reference);
+ const uint16x8_t shifted_diff = vshlq_u16(abs_diff, damping);
+ // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+ // [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always be
+ // larger than threshold, so subtracting with saturation returns 0 when
+ // pixel == kCdefLargeValue.
+ static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+ const uint16x8_t thresh_minus_shifted_diff =
+ vqsubq_u16(threshold, shifted_diff);
+ const uint16x8_t clamp_abs_diff =
+ vminq_u16(thresh_minus_shifted_diff, abs_diff);
+ // Restore the sign.
+ return vreinterpretq_s16_u16(
+ vsubq_u16(veorq_u16(clamp_abs_diff, sign), sign));
+}
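+
+// Scalar sketch of Constrain() (the vector code above is authoritative):
+//   diff = pixel - reference;
+//   return sign(diff) *
+//          Clip3(threshold - (abs(diff) >> shift), 0, abs(diff));
+// where |damping| holds -shift, because vshlq_u16() with a negative shift
+// count shifts right.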
+
+template <typename Pixel>
+uint16x8_t GetMaxPrimary(uint16x8_t* primary_val, uint16x8_t max,
+ uint16x8_t cdef_large_value_mask) {
+ if (sizeof(Pixel) == 1) {
+ // The source is 16 bits; however, we only really care about the lower
+ // 8 bits. The upper 8 bits contain the "large" flag. After the final
+ // primary max has been calculated, zero out the upper 8 bits. Use this
+ // to find the "16 bit" max.
+ const uint8x16_t max_p01 = vmaxq_u8(vreinterpretq_u8_u16(primary_val[0]),
+ vreinterpretq_u8_u16(primary_val[1]));
+ const uint8x16_t max_p23 = vmaxq_u8(vreinterpretq_u8_u16(primary_val[2]),
+ vreinterpretq_u8_u16(primary_val[3]));
+ const uint16x8_t max_p = vreinterpretq_u16_u8(vmaxq_u8(max_p01, max_p23));
+ max = vmaxq_u16(max, vandq_u16(max_p, cdef_large_value_mask));
+ } else {
+ // Convert kCdefLargeValue to 0 before calculating max.
+ max = vmaxq_u16(max, vandq_u16(primary_val[0], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(primary_val[1], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(primary_val[2], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(primary_val[3], cdef_large_value_mask));
+ }
+ return max;
+}
+
+template <typename Pixel>
+uint16x8_t GetMaxSecondary(uint16x8_t* secondary_val, uint16x8_t max,
+ uint16x8_t cdef_large_value_mask) {
+ if (sizeof(Pixel) == 1) {
+ const uint8x16_t max_s01 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[0]),
+ vreinterpretq_u8_u16(secondary_val[1]));
+ const uint8x16_t max_s23 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[2]),
+ vreinterpretq_u8_u16(secondary_val[3]));
+ const uint8x16_t max_s45 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[4]),
+ vreinterpretq_u8_u16(secondary_val[5]));
+ const uint8x16_t max_s67 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[6]),
+ vreinterpretq_u8_u16(secondary_val[7]));
+ const uint16x8_t max_s = vreinterpretq_u16_u8(
+ vmaxq_u8(vmaxq_u8(max_s01, max_s23), vmaxq_u8(max_s45, max_s67)));
+ max = vmaxq_u16(max, vandq_u16(max_s, cdef_large_value_mask));
+ } else {
+ max = vmaxq_u16(max, vandq_u16(secondary_val[0], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(secondary_val[1], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(secondary_val[2], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(secondary_val[3], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(secondary_val[4], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(secondary_val[5], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(secondary_val[6], cdef_large_value_mask));
+ max = vmaxq_u16(max, vandq_u16(secondary_val[7], cdef_large_value_mask));
+ }
+ return max;
+}
+
+template <typename Pixel, int width>
+void StorePixels(void* dest, ptrdiff_t dst_stride, int16x8_t result) {
+ auto* const dst8 = static_cast<uint8_t*>(dest);
+ if (sizeof(Pixel) == 1) {
+ const uint8x8_t dst_pixel = vqmovun_s16(result);
+ if (width == 8) {
+ vst1_u8(dst8, dst_pixel);
+ } else {
+ StoreLo4(dst8, dst_pixel);
+ StoreHi4(dst8 + dst_stride, dst_pixel);
+ }
+ } else {
+ const uint16x8_t dst_pixel = vreinterpretq_u16_s16(result);
+ auto* const dst16 = reinterpret_cast<uint16_t*>(dst8);
+ if (width == 8) {
+ vst1q_u16(dst16, dst_pixel);
+ } else {
+ auto* const dst16_next_row =
+ reinterpret_cast<uint16_t*>(dst8 + dst_stride);
+ vst1_u16(dst16, vget_low_u16(dst_pixel));
+ vst1_u16(dst16_next_row, vget_high_u16(dst_pixel));
+ }
+ }
+}
+
+template <int width, typename Pixel, bool enable_primary = true,
+ bool enable_secondary = true>
+void CdefFilter_NEON(const uint16_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride, const int height,
+ const int primary_strength, const int secondary_strength,
+ const int damping, const int direction,
+ void* LIBGAV1_RESTRICT dest, const ptrdiff_t dst_stride) {
+ static_assert(width == 8 || width == 4, "");
+ static_assert(enable_primary || enable_secondary, "");
+ constexpr bool clipping_required = enable_primary && enable_secondary;
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint16x8_t cdef_large_value_mask =
+ vdupq_n_u16(static_cast<uint16_t>(~kCdefLargeValue));
+ const uint16x8_t primary_threshold = vdupq_n_u16(primary_strength);
+ const uint16x8_t secondary_threshold = vdupq_n_u16(secondary_strength);
+
+ int16x8_t primary_damping_shift, secondary_damping_shift;
+
+ // FloorLog2() requires input to be > 0.
+ // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+ // 10-bit damping range: Y: [3, 6 + 2], UV: [2, 5 + 2].
+ if (enable_primary) {
+ // 8-bit primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is
+ // necessary for UV filtering.
+ // 10-bit primary_strength: [0, 15 << 2].
+ primary_damping_shift =
+ vdupq_n_s16(-std::max(0, damping - FloorLog2(primary_strength)));
+ }
+
+ if (enable_secondary) {
+ if (sizeof(Pixel) == 1) {
+ // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+ // necessary.
+ assert(damping - FloorLog2(secondary_strength) >= 0);
+ secondary_damping_shift =
+ vdupq_n_s16(-(damping - FloorLog2(secondary_strength)));
+ } else {
+ // secondary_strength: [0, 4 << 2]
+ secondary_damping_shift =
+ vdupq_n_s16(-std::max(0, damping - FloorLog2(secondary_strength)));
+ }
+ }
+
+ constexpr int coeff_shift = (sizeof(Pixel) == 1) ? 0 : kBitdepth10 - 8;
+ const int primary_tap_0 =
+ kCdefPrimaryTaps[(primary_strength >> coeff_shift) & 1][0];
+ const int primary_tap_1 =
+ kCdefPrimaryTaps[(primary_strength >> coeff_shift) & 1][1];
+
+ int y = height;
+ do {
+ uint16x8_t pixel;
+ if (width == 8) {
+ pixel = vld1q_u16(src);
+ } else {
+ pixel = vcombine_u16(vld1_u16(src), vld1_u16(src + src_stride));
+ }
+
+ uint16x8_t min = pixel;
+ uint16x8_t max = pixel;
+ int16x8_t sum;
+
+ if (enable_primary) {
+ // Primary |direction|.
+ uint16x8_t primary_val[4];
+ if (width == 8) {
+ LoadDirection(src, src_stride, primary_val, direction);
+ } else {
+ LoadDirection4(src, src_stride, primary_val, direction);
+ }
+
+ if (clipping_required) {
+ min = vminq_u16(min, primary_val[0]);
+ min = vminq_u16(min, primary_val[1]);
+ min = vminq_u16(min, primary_val[2]);
+ min = vminq_u16(min, primary_val[3]);
+
+ max = GetMaxPrimary<Pixel>(primary_val, max, cdef_large_value_mask);
+ }
+
+ sum = Constrain(primary_val[0], pixel, primary_threshold,
+ primary_damping_shift);
+ sum = vmulq_n_s16(sum, primary_tap_0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[1], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[2], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[3], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_1);
+ } else {
+ sum = vdupq_n_s16(0);
+ }
+
+ if (enable_secondary) {
+ // Secondary |direction| values (+/- 2). Clamp |direction|.
+ uint16x8_t secondary_val[8];
+ if (width == 8) {
+ LoadDirection(src, src_stride, secondary_val, direction + 2);
+ LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
+ } else {
+ LoadDirection4(src, src_stride, secondary_val, direction + 2);
+ LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
+ }
+
+ if (clipping_required) {
+ min = vminq_u16(min, secondary_val[0]);
+ min = vminq_u16(min, secondary_val[1]);
+ min = vminq_u16(min, secondary_val[2]);
+ min = vminq_u16(min, secondary_val[3]);
+ min = vminq_u16(min, secondary_val[4]);
+ min = vminq_u16(min, secondary_val[5]);
+ min = vminq_u16(min, secondary_val[6]);
+ min = vminq_u16(min, secondary_val[7]);
+
+ max = GetMaxSecondary<Pixel>(secondary_val, max, cdef_large_value_mask);
+ }
+
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[0], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[1], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[2], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[3], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[4], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[5], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[6], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[7], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ }
+ // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)
+ const int16x8_t sum_lt_0 = vshrq_n_s16(sum, 15);
+ sum = vaddq_s16(sum, sum_lt_0);
+ int16x8_t result = vrsraq_n_s16(vreinterpretq_s16_u16(pixel), sum, 4);
+ if (clipping_required) {
+ result = vminq_s16(result, vreinterpretq_s16_u16(max));
+ result = vmaxq_s16(result, vreinterpretq_s16_u16(min));
+ }
+
+ StorePixels<Pixel, width>(dst, dst_stride, result);
+
+ src += (width == 8) ? src_stride : src_stride << 1;
+ dst += (width == 8) ? dst_stride : dst_stride << 1;
+ y -= (width == 8) ? 1 : 2;
+ } while (y != 0);
+}
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->cdef_direction = CdefDirection_NEON<kBitdepth8>;
+ dsp->cdef_filters[0][0] = CdefFilter_NEON<4, uint8_t>;
+ dsp->cdef_filters[0][1] = CdefFilter_NEON<4, uint8_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_NEON<4, uint8_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_NEON<8, uint8_t>;
+ dsp->cdef_filters[1][1] = CdefFilter_NEON<8, uint8_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_NEON<8, uint8_t, /*enable_primary=*/false>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->cdef_direction = CdefDirection_NEON<kBitdepth10>;
+ dsp->cdef_filters[0][0] = CdefFilter_NEON<4, uint16_t>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_NEON<4, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_NEON<4, uint16_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_NEON<8, uint16_t>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_NEON<8, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_NEON<8, uint16_t, /*enable_primary=*/false>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void CdefInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_CdefDirection LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_CdefFilters LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
+
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/utils/compiler_attributes.h"
+
+#if 0
+#include <cstdio>
+#include <string>
+
+constexpr bool kEnablePrintRegs = true;
+
+union DebugRegister {
+ int8_t i8[8];
+ int16_t i16[4];
+ int32_t i32[2];
+ uint8_t u8[8];
+ uint16_t u16[4];
+ uint32_t u32[2];
+};
+
+union DebugRegisterQ {
+ int8_t i8[16];
+ int16_t i16[8];
+ int32_t i32[4];
+ uint8_t u8[16];
+ uint16_t u16[8];
+ uint32_t u32[4];
+};
+
+// Quite useful helper for debugging. Left here for convenience.
+inline void PrintVect(const DebugRegister r, const char* const name, int size) {
+ int n;
+ if (kEnablePrintRegs) {
+ fprintf(stderr, "%s\t: ", name);
+ if (size == 8) {
+ for (n = 0; n < 8; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
+ } else if (size == 16) {
+ for (n = 0; n < 4; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
+ } else if (size == 32) {
+ for (n = 0; n < 2; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
+ }
+ fprintf(stderr, "\n");
+ }
+}
+
+// Debugging helper for 128-bit types.
+inline void PrintVectQ(const DebugRegisterQ r, const char* const name,
+ int size) {
+ int n;
+ if (kEnablePrintRegs) {
+ fprintf(stderr, "%s\t: ", name);
+ if (size == 8) {
+ for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
+ } else if (size == 16) {
+ for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
+ } else if (size == 32) {
+ for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
+ }
+ fprintf(stderr, "\n");
+ }
+}
+
+inline void PrintReg(const int32x4x2_t val, const std::string& name) {
+ DebugRegisterQ r;
+ vst1q_s32(r.i32, val.val[0]);
+ const std::string name0 = name + std::string(".val[0]");
+ PrintVectQ(r, name0.c_str(), 32);
+ vst1q_s32(r.i32, val.val[1]);
+ const std::string name1 = name + std::string(".val[1]");
+ PrintVectQ(r, name1.c_str(), 32);
+}
+
+inline void PrintReg(const uint32x4_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_u32(r.u32, val);
+ PrintVectQ(r, name, 32);
+}
+
+inline void PrintReg(const uint32x2_t val, const char* name) {
+ DebugRegister r;
+ vst1_u32(r.u32, val);
+ PrintVect(r, name, 32);
+}
+
+inline void PrintReg(const uint16x8_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_u16(r.u16, val);
+ PrintVectQ(r, name, 16);
+}
+
+inline void PrintReg(const uint16x4_t val, const char* name) {
+ DebugRegister r;
+ vst1_u16(r.u16, val);
+ PrintVect(r, name, 16);
+}
+
+inline void PrintReg(const uint8x16_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_u8(r.u8, val);
+ PrintVectQ(r, name, 8);
+}
+
+inline void PrintReg(const uint8x8_t val, const char* name) {
+ DebugRegister r;
+ vst1_u8(r.u8, val);
+ PrintVect(r, name, 8);
+}
+
+inline void PrintReg(const int32x4_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_s32(r.i32, val);
+ PrintVectQ(r, name, 32);
+}
+
+inline void PrintReg(const int32x2_t val, const char* name) {
+ DebugRegister r;
+ vst1_s32(r.i32, val);
+ PrintVect(r, name, 32);
+}
+
+inline void PrintReg(const int16x8_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_s16(r.i16, val);
+ PrintVectQ(r, name, 16);
+}
+
+inline void PrintReg(const int16x4_t val, const char* name) {
+ DebugRegister r;
+ vst1_s16(r.i16, val);
+ PrintVect(r, name, 16);
+}
+
+inline void PrintReg(const int8x16_t val, const char* name) {
+ DebugRegisterQ r;
+ vst1q_s8(r.i8, val);
+ PrintVectQ(r, name, 8);
+}
+
+inline void PrintReg(const int8x8_t val, const char* name) {
+ DebugRegister r;
+ vst1_s8(r.i8, val);
+ PrintVect(r, name, 8);
+}
+
+// Print an individual (non-vector) value in decimal format.
+inline void PrintReg(const int x, const char* name) {
+ if (kEnablePrintRegs) {
+ fprintf(stderr, "%s: %d\n", name, x);
+ }
+}
+
+// Print an individual (non-vector) value in hexadecimal format.
+inline void PrintHex(const int x, const char* name) {
+ if (kEnablePrintRegs) {
+ fprintf(stderr, "%s: %x\n", name, x);
+ }
+}
+
+#define PR(x) PrintReg(x, #x)
+#define PD(x) PrintReg(x, #x)
+#define PX(x) PrintHex(x, #x)
+
+#if LIBGAV1_MSAN
+#include <sanitizer/msan_interface.h>
+
+inline void PrintShadow(const void* r, const char* const name,
+ const size_t size) {
+ if (kEnablePrintRegs) {
+ fprintf(stderr, "Shadow for %s:\n", name);
+ __msan_print_shadow(r, size);
+ }
+}
+#define PS(var, N) PrintShadow(var, #var, N)
+
+#endif // LIBGAV1_MSAN
+
+#endif // 0
+
+namespace libgav1 {
+namespace dsp {
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+// Load 2 uint8_t values into lanes 0 and 1. Zeros the register before loading
+// the values. Use caution when using this in loops because it will re-zero the
+// register before loading on every iteration.
+inline uint8x8_t Load2(const void* const buf) {
+ const uint16x4_t zero = vdup_n_u16(0);
+ uint16_t temp;
+ memcpy(&temp, buf, 2);
+ return vreinterpret_u8_u16(vld1_lane_u16(&temp, zero, 0));
+}
+
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline uint8x8_t Load2(const void* const buf, uint8x8_t val) {
+ uint16_t temp;
+ memcpy(&temp, buf, 2);
+ return vreinterpret_u8_u16(
+ vld1_lane_u16(&temp, vreinterpret_u16_u8(val), lane));
+}
+
+// Load 2 uint16_t values into |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline uint16x4_t Load2(const void* const buf, uint16x4_t val) {
+ uint32_t temp;
+ memcpy(&temp, buf, 4);
+ return vreinterpret_u16_u32(
+ vld1_lane_u32(&temp, vreinterpret_u32_u16(val), lane));
+}
+
+// Load 4 uint8_t values into the low half of a uint8x8_t register. Zeros the
+// register before loading the values. Use caution when using this in loops
+// because it will re-zero the register before loading on every iteration.
+inline uint8x8_t Load4(const void* const buf) {
+ const uint32x2_t zero = vdup_n_u32(0);
+ uint32_t temp;
+ memcpy(&temp, buf, 4);
+ return vreinterpret_u8_u32(vld1_lane_u32(&temp, zero, 0));
+}
+
+// Load 4 uint8_t values into 4 lanes starting with |lane| * 4.
+template <int lane>
+inline uint8x8_t Load4(const void* const buf, uint8x8_t val) {
+ uint32_t temp;
+ memcpy(&temp, buf, 4);
+ return vreinterpret_u8_u32(
+ vld1_lane_u32(&temp, vreinterpret_u32_u8(val), lane));
+}
+
+// Convenience functions for 16-bit loads from a uint8_t* source.
+inline uint16x4_t Load4U16(const void* const buf) {
+ return vld1_u16(static_cast<const uint16_t*>(buf));
+}
+
+inline uint16x8_t Load8U16(const void* const buf) {
+ return vld1q_u16(static_cast<const uint16_t*>(buf));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline uint8x8_t MaskOverreads(const uint8x8_t source,
+ const ptrdiff_t over_read_in_bytes) {
+ uint8x8_t dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes > 0) {
+ uint8x8_t mask = vdup_n_u8(0);
+ uint8x8_t valid_element_mask = vdup_n_u8(-1);
+ const int valid_bytes =
+ std::min(8, 8 - static_cast<int>(over_read_in_bytes));
+ for (int i = 0; i < valid_bytes; ++i) {
+ // Feed ff bytes into |mask| one at a time.
+ mask = vext_u8(valid_element_mask, mask, 7);
+ }
+ dst = vand_u8(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
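+
+// Example (a sketch): loading a 5-byte row with vld1_u8() over-reads 3
+// bytes; MaskOverreads(v, 3) zeroes lanes 5-7 so MemorySanitizer sees only
+// initialized bytes in any later store or comparison.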
+
+inline uint8x16_t MaskOverreadsQ(const uint8x16_t source,
+ const ptrdiff_t over_read_in_bytes) {
+ uint8x16_t dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes > 0) {
+ uint8x16_t mask = vdupq_n_u8(0);
+ uint8x16_t valid_element_mask = vdupq_n_u8(-1);
+ const int valid_bytes =
+ std::min(16, 16 - static_cast<int>(over_read_in_bytes));
+ for (int i = 0; i < valid_bytes; ++i) {
+ // Feed ff bytes into |mask| one at a time.
+ mask = vextq_u8(valid_element_mask, mask, 15);
+ }
+ dst = vandq_u8(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
+
+inline uint16x8_t MaskOverreadsQ(const uint16x8_t source,
+ const ptrdiff_t over_read_in_bytes) {
+ return vreinterpretq_u16_u8(
+ MaskOverreadsQ(vreinterpretq_u8_u16(source), over_read_in_bytes));
+}
+
+inline uint8x8_t Load1MsanU8(const uint8_t* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(vld1_u8(source), over_read_in_bytes);
+}
+
+inline uint8x16_t Load1QMsanU8(const uint8_t* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreadsQ(vld1q_u8(source), over_read_in_bytes);
+}
+
+inline uint16x8_t Load1QMsanU16(const uint16_t* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return vreinterpretq_u16_u8(MaskOverreadsQ(
+ vreinterpretq_u8_u16(vld1q_u16(source)), over_read_in_bytes));
+}
+
+inline uint32x4_t Load1QMsanU32(const uint32_t* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return vreinterpretq_u32_u8(MaskOverreadsQ(
+ vreinterpretq_u8_u32(vld1q_u32(source)), over_read_in_bytes));
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+// Propagate type information to the compiler. Without this the compiler may
+// assume the required alignment of the type (4 bytes in the case of uint32_t)
+// and add alignment hints to the memory access.
+template <typename T>
+inline void ValueToMem(void* const buf, T val) {
+ memcpy(buf, &val, sizeof(val));
+}
+
+// Store 4 int8_t values from the low half of an int8x8_t register.
+inline void StoreLo4(void* const buf, const int8x8_t val) {
+ ValueToMem<int32_t>(buf, vget_lane_s32(vreinterpret_s32_s8(val), 0));
+}
+
+// Store 4 uint8_t values from the low half of a uint8x8_t register.
+inline void StoreLo4(void* const buf, const uint8x8_t val) {
+ ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 0));
+}
+
+// Store 4 uint8_t values from the high half of a uint8x8_t register.
+inline void StoreHi4(void* const buf, const uint8x8_t val) {
+ ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 1));
+}
+
+// Store 2 uint8_t values from |lane| * 2 and |lane| * 2 + 1 of a uint8x8_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint8x8_t val) {
+ ValueToMem<uint16_t>(buf, vget_lane_u16(vreinterpret_u16_u8(val), lane));
+}
+
+// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x8_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint16x8_t val) {
+ ValueToMem<uint32_t>(buf, vgetq_lane_u32(vreinterpretq_u32_u16(val), lane));
+}
+
+// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x4_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint16x4_t val) {
+ ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
+}
+
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store4(void* const buf, const uint16x4_t val) {
+ vst1_u16(static_cast<uint16_t*>(buf), val);
+}
+
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store8(void* const buf, const uint16x8_t val) {
+ vst1q_u16(static_cast<uint16_t*>(buf), val);
+}
+
+inline void Store4QMsanS16(void* const buf, const int16x8x4_t src) {
+#if LIBGAV1_MSAN
+ // The memory shadow is incorrect for vst4q_s16, only marking the first 16
+ // bytes of the destination as initialized. To avoid missing truly
+ // uninitialized memory, check the input vectors first, before marking the
+ // whole 64 bytes initialized. If any input vector contains unused values, it
+ // should pass through MaskOverreadsQ first.
+ __msan_check_mem_is_initialized(&src.val[0], sizeof(src.val[0]));
+ __msan_check_mem_is_initialized(&src.val[1], sizeof(src.val[1]));
+ __msan_check_mem_is_initialized(&src.val[2], sizeof(src.val[2]));
+ __msan_check_mem_is_initialized(&src.val[3], sizeof(src.val[3]));
+ vst4q_s16(static_cast<int16_t*>(buf), src);
+ __msan_unpoison(buf, sizeof(int16x8x4_t));
+#else
+ vst4q_s16(static_cast<int16_t*>(buf), src);
+#endif // LIBGAV1_MSAN
+}
+
+//------------------------------------------------------------------------------
+// Pointer helpers.
+
+// This function adds |stride|, given as a number of bytes, to a pointer to a
+// larger type, using native pointer arithmetic.
+template <typename T>
+inline T* AddByteStride(T* ptr, const ptrdiff_t stride) {
+ return reinterpret_cast<T*>(
+ const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(ptr) + stride));
+}
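+
+// Usage sketch: some strides are expressed in bytes even when the buffer
+// holds 16-bit pixels, so
+//   uint16_t* next_row = AddByteStride(row, stride_in_bytes);
+// advances one row without the caller converting the stride to an element
+// count. (|stride_in_bytes| is an illustrative name, not a libgav1 symbol.)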
+
+//------------------------------------------------------------------------------
+// Multiply.
+
+// Shim vmull_high_u16 for armv7.
+inline uint32x4_t VMullHighU16(const uint16x8_t a, const uint16x8_t b) {
+#if defined(__aarch64__)
+ return vmull_high_u16(a, b);
+#else
+ return vmull_u16(vget_high_u16(a), vget_high_u16(b));
+#endif
+}
+
+// Shim vmull_high_s16 for armv7.
+inline int32x4_t VMullHighS16(const int16x8_t a, const int16x8_t b) {
+#if defined(__aarch64__)
+ return vmull_high_s16(a, b);
+#else
+ return vmull_s16(vget_high_s16(a), vget_high_s16(b));
+#endif
+}
+
+// Shim vmlal_high_u16 for armv7.
+inline uint32x4_t VMlalHighU16(const uint32x4_t a, const uint16x8_t b,
+ const uint16x8_t c) {
+#if defined(__aarch64__)
+ return vmlal_high_u16(a, b, c);
+#else
+ return vmlal_u16(a, vget_high_u16(b), vget_high_u16(c));
+#endif
+}
+
+// Shim vmlal_high_s16 for armv7.
+inline int32x4_t VMlalHighS16(const int32x4_t a, const int16x8_t b,
+ const int16x8_t c) {
+#if defined(__aarch64__)
+ return vmlal_high_s16(a, b, c);
+#else
+ return vmlal_s16(a, vget_high_s16(b), vget_high_s16(c));
+#endif
+}
+
+// Shim vmul_laneq_u16 for armv7.
+template <int lane>
+inline uint16x4_t VMulLaneQU16(const uint16x4_t a, const uint16x8_t b) {
+#if defined(__aarch64__)
+ return vmul_laneq_u16(a, b, lane);
+#else
+ if (lane < 4) return vmul_lane_u16(a, vget_low_u16(b), lane & 0x3);
+ return vmul_lane_u16(a, vget_high_u16(b), (lane - 4) & 0x3);
+#endif
+}
+
+// Shim vmulq_laneq_u16 for armv7.
+template <int lane>
+inline uint16x8_t VMulQLaneQU16(const uint16x8_t a, const uint16x8_t b) {
+#if defined(__aarch64__)
+ return vmulq_laneq_u16(a, b, lane);
+#else
+ if (lane < 4) return vmulq_lane_u16(a, vget_low_u16(b), lane & 0x3);
+ return vmulq_lane_u16(a, vget_high_u16(b), (lane - 4) & 0x3);
+#endif
+}
+
+// Shim vmla_laneq_u16 for armv7.
+template <int lane>
+inline uint16x4_t VMlaLaneQU16(const uint16x4_t a, const uint16x4_t b,
+ const uint16x8_t c) {
+#if defined(__aarch64__)
+ return vmla_laneq_u16(a, b, c, lane);
+#else
+ if (lane < 4) return vmla_lane_u16(a, b, vget_low_u16(c), lane & 0x3);
+ return vmla_lane_u16(a, b, vget_high_u16(c), (lane - 4) & 0x3);
+#endif
+}
+
+// Shim vmlaq_laneq_u16 for armv7.
+template <int lane>
+inline uint16x8_t VMlaQLaneQU16(const uint16x8_t a, const uint16x8_t b,
+ const uint16x8_t c) {
+#if defined(__aarch64__)
+ return vmlaq_laneq_u16(a, b, c, lane);
+#else
+ if (lane < 4) return vmlaq_lane_u16(a, b, vget_low_u16(c), lane & 0x3);
+ return vmlaq_lane_u16(a, b, vget_high_u16(c), (lane - 4) & 0x3);
+#endif
+}
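+
+// In the armv7 fallbacks above, |lane| is a template parameter, so the
+// `if (lane < 4)` branch folds away at compile time and each shim reduces
+// to a single multiply(-accumulate) against the low or high half. For
+// example, VMulQLaneQU16<6>(a, b) compiles to
+//   vmulq_lane_u16(a, vget_high_u16(b), 2);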
+
+//------------------------------------------------------------------------------
+// Bit manipulation.
+
+// vshXX_n_XX() requires an immediate.
+template <int shift>
+inline uint8x8_t LeftShiftVector(const uint8x8_t vector) {
+ return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
+}
+
+template <int shift>
+inline uint8x8_t RightShiftVector(const uint8x8_t vector) {
+ return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
+}
+
+template <int shift>
+inline int8x8_t RightShiftVector(const int8x8_t vector) {
+ return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift));
+}
+
+// Shim vqtbl1_u8 for armv7.
+inline uint8x8_t VQTbl1U8(const uint8x16_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+ return vqtbl1_u8(a, index);
+#else
+ const uint8x8x2_t b = {vget_low_u8(a), vget_high_u8(a)};
+ return vtbl2_u8(b, index);
+#endif
+}
+
+// Shim vqtbl2_u8 for armv7.
+inline uint8x8_t VQTbl2U8(const uint8x16x2_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+ return vqtbl2_u8(a, index);
+#else
+ const uint8x8x4_t b = {vget_low_u8(a.val[0]), vget_high_u8(a.val[0]),
+ vget_low_u8(a.val[1]), vget_high_u8(a.val[1])};
+ return vtbl4_u8(b, index);
+#endif
+}
+
+// Shim vqtbl2q_u8 for armv7.
+inline uint8x16_t VQTbl2QU8(const uint8x16x2_t a, const uint8x16_t index) {
+#if defined(__aarch64__)
+ return vqtbl2q_u8(a, index);
+#else
+ return vcombine_u8(VQTbl2U8(a, vget_low_u8(index)),
+ VQTbl2U8(a, vget_high_u8(index)));
+#endif
+}
+
+// Shim vqtbl3_u8 for armv7.
+inline uint8x8_t VQTbl3U8(const uint8x16x3_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+ return vqtbl3_u8(a, index);
+#else
+ const uint8x8x4_t b = {vget_low_u8(a.val[0]), vget_high_u8(a.val[0]),
+ vget_low_u8(a.val[1]), vget_high_u8(a.val[1])};
+ const uint8x8x2_t c = {vget_low_u8(a.val[2]), vget_high_u8(a.val[2])};
+ const uint8x8_t index_ext = vsub_u8(index, vdup_n_u8(32));
+ const uint8x8_t partial_lookup = vtbl4_u8(b, index);
+ return vtbx2_u8(partial_lookup, c, index_ext);
+#endif
+}
+
+// Shim vqtbl3q_u8 for armv7.
+inline uint8x16_t VQTbl3QU8(const uint8x16x3_t a, const uint8x16_t index) {
+#if defined(__aarch64__)
+ return vqtbl3q_u8(a, index);
+#else
+ return vcombine_u8(VQTbl3U8(a, vget_low_u8(index)),
+ VQTbl3U8(a, vget_high_u8(index)));
+#endif
+}
+
+// Shim vqtbl1_s8 for armv7.
+inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+ return vqtbl1_s8(a, index);
+#else
+ const int8x8x2_t b = {vget_low_s8(a), vget_high_s8(a)};
+ return vtbl2_s8(b, vreinterpret_s8_u8(index));
+#endif
+}
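+
+// Note: vqtbl*_u8() returns 0 for out-of-range indices, and the armv7
+// fallbacks above are composed so that out-of-range indices also yield 0,
+// preserving the aarch64 semantics.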
+
+//------------------------------------------------------------------------------
+// Saturation helpers.
+
+inline int16x4_t Clip3S16(const int16x4_t val, const int16x4_t low,
+ const int16x4_t high) {
+ return vmin_s16(vmax_s16(val, low), high);
+}
+
+inline int16x8_t Clip3S16(const int16x8_t val, const int16x8_t low,
+ const int16x8_t high) {
+ return vminq_s16(vmaxq_s16(val, low), high);
+}
+
+inline uint16x8_t ConvertToUnsignedPixelU16(const int16x8_t val, int bitdepth) {
+ const int16x8_t low = vdupq_n_s16(0);
+ const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);
+
+ return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(val, low)), high);
+}
+
+//------------------------------------------------------------------------------
+// Interleave.
+
+// vzipN is exclusive to A64.
+inline uint8x8_t InterleaveLow8(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+ return vzip1_u8(a, b);
+#else
+ // Discard |.val[1]|
+ return vzip_u8(a, b).val[0];
+#endif
+}
+
+inline uint8x8_t InterleaveLow32(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+ return vreinterpret_u8_u32(
+ vzip1_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
+#else
+ // Discard |.val[1]|
+ return vreinterpret_u8_u32(
+ vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[0]);
+#endif
+}
+
+inline int8x8_t InterleaveLow32(const int8x8_t a, const int8x8_t b) {
+#if defined(__aarch64__)
+ return vreinterpret_s8_u32(
+ vzip1_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
+#else
+ // Discard |.val[1]|
+ return vreinterpret_s8_u32(
+ vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[0]);
+#endif
+}
+
+inline uint8x8_t InterleaveHigh32(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+ return vreinterpret_u8_u32(
+ vzip2_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
+#else
+ // Discard |.val[0]|
+ return vreinterpret_u8_u32(
+ vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[1]);
+#endif
+}
+
+inline int8x8_t InterleaveHigh32(const int8x8_t a, const int8x8_t b) {
+#if defined(__aarch64__)
+ return vreinterpret_s8_u32(
+ vzip2_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
+#else
+ // Discard |.val[0]|
+ return vreinterpret_s8_u32(
+ vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[1]);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Sum.
+
+inline uint16_t SumVector(const uint8x8_t a) {
+#if defined(__aarch64__)
+ return vaddlv_u8(a);
+#else
+ const uint16x4_t c = vpaddl_u8(a);
+ const uint32x2_t d = vpaddl_u16(c);
+ const uint64x1_t e = vpaddl_u32(d);
+ return static_cast<uint16_t>(vget_lane_u64(e, 0));
+#endif // defined(__aarch64__)
+}
+
+inline uint32_t SumVector(const uint32x2_t a) {
+#if defined(__aarch64__)
+ return vaddv_u32(a);
+#else
+ const uint64x1_t b = vpaddl_u32(a);
+ return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif // defined(__aarch64__)
+}
+
+inline uint32_t SumVector(const uint32x4_t a) {
+#if defined(__aarch64__)
+ return vaddvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
+ return static_cast<uint32_t>(vget_lane_u64(c, 0));
+#endif
+}
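+
+// E.g. SumVector(vdup_n_u8(255)) == 2040, which is why the uint8x8_t overload
+// returns uint16_t rather than uint8_t.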
+
+//------------------------------------------------------------------------------
+// Transpose.
+
+// Transpose 32 bit elements such that:
+// a: 00 01
+// b: 02 03
+// returns
+// val[0]: 00 02
+// val[1]: 01 03
+inline uint8x8x2_t Interleave32(const uint8x8_t a, const uint8x8_t b) {
+ const uint32x2_t a_32 = vreinterpret_u32_u8(a);
+ const uint32x2_t b_32 = vreinterpret_u32_u8(b);
+ const uint32x2x2_t c = vtrn_u32(a_32, b_32);
+ const uint8x8x2_t d = {vreinterpret_u8_u32(c.val[0]),
+ vreinterpret_u8_u32(c.val[1])};
+ return d;
+}
+
+// Swap high and low 32 bit elements.
+inline uint8x8_t Transpose32(const uint8x8_t a) {
+ const uint32x2_t b = vrev64_u32(vreinterpret_u32_u8(a));
+ return vreinterpret_u8_u32(b);
+}
+
+// Swap high and low halves.
+inline uint16x8_t Transpose64(const uint16x8_t a) { return vextq_u16(a, a, 4); }
+
+// Implement vtrnq_s64().
+// Input:
+// a0: 00 01 02 03 04 05 06 07
+// a1: 16 17 18 19 20 21 22 23
+// Output:
+// b0.val[0]: 00 01 02 03 16 17 18 19
+// b0.val[1]: 04 05 06 07 20 21 22 23
+inline int16x8x2_t VtrnqS64(const int32x4_t a0, const int32x4_t a1) {
+ int16x8x2_t b0;
+ b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+ vreinterpret_s16_s32(vget_low_s32(a1)));
+ b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+ vreinterpret_s16_s32(vget_high_s32(a1)));
+ return b0;
+}
+
+inline uint16x8x2_t VtrnqU64(const uint32x4_t a0, const uint32x4_t a1) {
+ uint16x8x2_t b0;
+ b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
+ vreinterpret_u16_u32(vget_low_u32(a1)));
+ b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
+ vreinterpret_u16_u32(vget_high_u32(a1)));
+ return b0;
+}
+
+// Input:
+// 00 01 02 03
+// 10 11 12 13
+// 20 21 22 23
+// 30 31 32 33
+// Output:
+// 00 10 20 30
+// 01 11 21 31
+// 02 12 22 32
+// 03 13 23 33
+inline void Transpose4x4(uint16x4_t a[4]) {
+ // b:
+ // 00 10 02 12
+ // 01 11 03 13
+ const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
+ // c:
+ // 20 30 22 32
+ // 21 31 23 33
+ const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
+ // d:
+ // 00 10 20 30
+ // 02 12 22 32
+ const uint32x2x2_t d =
+ vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
+ // e:
+ // 01 11 21 31
+ // 03 13 23 33
+ const uint32x2x2_t e =
+ vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
+ a[0] = vreinterpret_u16_u32(d.val[0]);
+ a[1] = vreinterpret_u16_u32(e.val[0]);
+ a[2] = vreinterpret_u16_u32(d.val[1]);
+ a[3] = vreinterpret_u16_u32(e.val[1]);
+}
+
+// Input:
+// a: 00 01 02 03 10 11 12 13
+// b: 20 21 22 23 30 31 32 33
+// Output:
+// Note that columns [1] and [2] are transposed.
+// a: 00 10 20 30 02 12 22 32
+// b: 01 11 21 31 03 13 23 33
+inline void Transpose4x4(uint8x8_t* a, uint8x8_t* b) {
+ const uint16x4x2_t c =
+ vtrn_u16(vreinterpret_u16_u8(*a), vreinterpret_u16_u8(*b));
+ const uint32x2x2_t d =
+ vtrn_u32(vreinterpret_u32_u16(c.val[0]), vreinterpret_u32_u16(c.val[1]));
+ const uint8x8x2_t e =
+ vtrn_u8(vreinterpret_u8_u32(d.val[0]), vreinterpret_u8_u32(d.val[1]));
+ *a = e.val[0];
+ *b = e.val[1];
+}
+
+// 4x8 Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// 8x4 Output:
+// a[0]: 00 10 20 30 04 14 24 34
+// a[1]: 01 11 21 31 05 15 25 35
+// a[2]: 02 12 22 32 06 16 26 36
+// a[3]: 03 13 23 33 07 17 27 37
+inline void Transpose4x8(uint16x8_t a[4]) {
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+ const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+
+ a[0] = vreinterpretq_u16_u32(c0.val[0]);
+ a[1] = vreinterpretq_u16_u32(c1.val[0]);
+ a[2] = vreinterpretq_u16_u32(c0.val[1]);
+ a[3] = vreinterpretq_u16_u32(c1.val[1]);
+}
+
+// Special transpose for loop filter.
+// 4x8 Input:
+// p_q: p3 p2 p1 p0 q0 q1 q2 q3
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// 8x4 Output:
+// a[0]: 03 13 23 33 04 14 24 34 p0q0
+// a[1]: 02 12 22 32 05 15 25 35 p1q1
+// a[2]: 01 11 21 31 06 16 26 36 p2q2
+// a[3]: 00 10 20 30 07 17 27 37 p3q3
+// Direct reapplication of the function will reset the high halves, but
+// reverse the low halves:
+// p_q: p0 p1 p2 p3 q0 q1 q2 q3
+// a[0]: 33 32 31 30 04 05 06 07
+// a[1]: 23 22 21 20 14 15 16 17
+// a[2]: 13 12 11 10 24 25 26 27
+// a[3]: 03 02 01 00 34 35 36 37
+// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
+// reverse the high halves.
+// The standard Transpose4x8 will produce the same reversals, but with the
+// order of the low halves also restored relative to the high halves. This is
+// preferable because it puts all values from the same source row back together,
+// but some post-processing is inevitable.
+inline void LoopFilterTranspose4x8(uint16x8_t a[4]) {
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+ const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+ // Reverse odd vectors to bring the appropriate items to the front of zips.
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // r0 : 03 13 01 11 07 17 05 15
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // r1 : 23 33 21 31 27 37 25 35
+ const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
+ const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));
+
+ // Zip to complete the halves.
+ // c0.val[0]: 00 10 20 30 02 12 22 32 p3p1
+ // c0.val[1]: 04 14 24 34 06 16 26 36 q0q2
+ // c1.val[0]: 03 13 23 33 01 11 21 31 p0p2
+ // c1.val[1]: 07 17 27 37 05 15 25 35 q3q1
+ const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vzipq_u32(r0, r1);
+
+ // d0.val[0]: 00 10 20 30 07 17 27 37 p3q3
+ // d0.val[1]: 02 12 22 32 05 15 25 35 p1q1
+ // d1.val[0]: 03 13 23 33 04 14 24 34 p0q0
+ // d1.val[1]: 01 11 21 31 06 16 26 36 p2q2
+ const uint16x8x2_t d0 = VtrnqU64(c0.val[0], c1.val[1]);
+ // The third row of c comes first here to swap p2 with q0.
+ const uint16x8x2_t d1 = VtrnqU64(c1.val[0], c0.val[1]);
+
+ // 8x4 Output:
+ // a[0]: 03 13 23 33 04 14 24 34 p0q0
+ // a[1]: 02 12 22 32 05 15 25 35 p1q1
+ // a[2]: 01 11 21 31 06 16 26 36 p2q2
+ // a[3]: 00 10 20 30 07 17 27 37 p3q3
+ a[0] = d1.val[0]; // p0q0
+ a[1] = d0.val[1]; // p1q1
+ a[2] = d1.val[1]; // p2q2
+ a[3] = d0.val[0]; // p3q3
+}
+
+// Reversible if the x4 values are packed next to each other.
+// x4 input / x8 output:
+// a0: 00 01 02 03 40 41 42 43
+// a1: 10 11 12 13 50 51 52 53
+// a2: 20 21 22 23 60 61 62 63
+// a3: 30 31 32 33 70 71 72 73
+// x8 input / x4 output:
+// a0: 00 10 20 30 40 50 60 70
+// a1: 01 11 21 31 41 51 61 71
+// a2: 02 12 22 32 42 52 62 72
+// a3: 03 13 23 33 43 53 63 73
+inline void Transpose8x4(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2,
+ uint8x8_t* a3) {
+ const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
+ const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
+
+ const uint16x4x2_t c0 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
+ const uint16x4x2_t c1 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
+
+ *a0 = vreinterpret_u8_u16(c0.val[0]);
+ *a1 = vreinterpret_u8_u16(c1.val[0]);
+ *a2 = vreinterpret_u8_u16(c0.val[1]);
+ *a3 = vreinterpret_u8_u16(c1.val[1]);
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// a[4]: 40 41 42 43 44 45 46 47
+// a[5]: 50 51 52 53 54 55 56 57
+// a[6]: 60 61 62 63 64 65 66 67
+// a[7]: 70 71 72 73 74 75 76 77
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70
+// a[1]: 01 11 21 31 41 51 61 71
+// a[2]: 02 12 22 32 42 52 62 72
+// a[3]: 03 13 23 33 43 53 63 73
+// a[4]: 04 14 24 34 44 54 64 74
+// a[5]: 05 15 25 35 45 55 65 75
+// a[6]: 06 16 26 36 46 56 66 76
+// a[7]: 07 17 27 37 47 57 67 77
+inline void Transpose8x8(int8x8_t a[8]) {
+ // Swap 8 bit elements. Goes from:
+ // a[0]: 00 01 02 03 04 05 06 07
+ // a[1]: 10 11 12 13 14 15 16 17
+ // a[2]: 20 21 22 23 24 25 26 27
+ // a[3]: 30 31 32 33 34 35 36 37
+ // a[4]: 40 41 42 43 44 45 46 47
+ // a[5]: 50 51 52 53 54 55 56 57
+ // a[6]: 60 61 62 63 64 65 66 67
+ // a[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
+ // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
+ // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
+ // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
+ const int8x16x2_t b0 =
+ vtrnq_s8(vcombine_s8(a[0], a[4]), vcombine_s8(a[1], a[5]));
+ const int8x16x2_t b1 =
+ vtrnq_s8(vcombine_s8(a[2], a[6]), vcombine_s8(a[3], a[7]));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
+ // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
+ // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
+ // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
+ const int16x8x2_t c0 = vtrnq_s16(vreinterpretq_s16_s8(b0.val[0]),
+ vreinterpretq_s16_s8(b1.val[0]));
+ const int16x8x2_t c1 = vtrnq_s16(vreinterpretq_s16_s8(b0.val[1]),
+ vreinterpretq_s16_s8(b1.val[1]));
+
+ // Unzip 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const int32x4x2_t d0 = vuzpq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vuzpq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+
+ a[0] = vreinterpret_s8_s32(vget_low_s32(d0.val[0]));
+ a[1] = vreinterpret_s8_s32(vget_high_s32(d0.val[0]));
+ a[2] = vreinterpret_s8_s32(vget_low_s32(d1.val[0]));
+ a[3] = vreinterpret_s8_s32(vget_high_s32(d1.val[0]));
+ a[4] = vreinterpret_s8_s32(vget_low_s32(d0.val[1]));
+ a[5] = vreinterpret_s8_s32(vget_high_s32(d0.val[1]));
+ a[6] = vreinterpret_s8_s32(vget_low_s32(d1.val[1]));
+ a[7] = vreinterpret_s8_s32(vget_high_s32(d1.val[1]));
+}
+
+// Unsigned.
+inline void Transpose8x8(uint8x8_t a[8]) {
+ const uint8x16x2_t b0 =
+ vtrnq_u8(vcombine_u8(a[0], a[4]), vcombine_u8(a[1], a[5]));
+ const uint8x16x2_t b1 =
+ vtrnq_u8(vcombine_u8(a[2], a[6]), vcombine_u8(a[3], a[7]));
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ a[0] = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+ a[1] = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+ a[2] = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+ a[3] = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+ a[4] = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+ a[5] = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ a[6] = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+ a[7] = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
+
+inline void Transpose8x8(uint8x8_t in[8], uint8x16_t out[4]) {
+ const uint8x16x2_t a0 =
+ vtrnq_u8(vcombine_u8(in[0], in[4]), vcombine_u8(in[1], in[5]));
+ const uint8x16x2_t a1 =
+ vtrnq_u8(vcombine_u8(in[2], in[6]), vcombine_u8(in[3], in[7]));
+
+ const uint16x8x2_t b0 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[0]),
+ vreinterpretq_u16_u8(a1.val[0]));
+ const uint16x8x2_t b1 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[1]),
+ vreinterpretq_u16_u8(a1.val[1]));
+
+ const uint32x4x2_t c0 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+
+ out[0] = vreinterpretq_u8_u32(c0.val[0]);
+ out[1] = vreinterpretq_u8_u32(c1.val[0]);
+ out[2] = vreinterpretq_u8_u32(c0.val[1]);
+ out[3] = vreinterpretq_u8_u32(c1.val[1]);
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// a[4]: 40 41 42 43 44 45 46 47
+// a[5]: 50 51 52 53 54 55 56 57
+// a[6]: 60 61 62 63 64 65 66 67
+// a[7]: 70 71 72 73 74 75 76 77
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70
+// a[1]: 01 11 21 31 41 51 61 71
+// a[2]: 02 12 22 32 42 52 62 72
+// a[3]: 03 13 23 33 43 53 63 73
+// a[4]: 04 14 24 34 44 54 64 74
+// a[5]: 05 15 25 35 45 55 65 75
+// a[6]: 06 16 26 36 46 56 66 76
+// a[7]: 07 17 27 37 47 57 67 77
+inline void Transpose8x8(int16x8_t a[8]) {
+ const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ const int16x8x2_t d0 = VtrnqS64(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = VtrnqS64(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = VtrnqS64(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = VtrnqS64(c1.val[1], c3.val[1]);
+
+ a[0] = d0.val[0];
+ a[1] = d1.val[0];
+ a[2] = d2.val[0];
+ a[3] = d3.val[0];
+ a[4] = d0.val[1];
+ a[5] = d1.val[1];
+ a[6] = d2.val[1];
+ a[7] = d3.val[1];
+}
+
+// Unsigned.
+inline void Transpose8x8(uint16x8_t a[8]) {
+ const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+ const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+ const uint16x8x2_t b2 = vtrnq_u16(a[4], a[5]);
+ const uint16x8x2_t b3 = vtrnq_u16(a[6], a[7]);
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+ const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
+ vreinterpretq_u32_u16(b3.val[0]));
+ const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
+ vreinterpretq_u32_u16(b3.val[1]));
+
+ const uint16x8x2_t d0 = VtrnqU64(c0.val[0], c2.val[0]);
+ const uint16x8x2_t d1 = VtrnqU64(c1.val[0], c3.val[0]);
+ const uint16x8x2_t d2 = VtrnqU64(c0.val[1], c2.val[1]);
+ const uint16x8x2_t d3 = VtrnqU64(c1.val[1], c3.val[1]);
+
+ a[0] = d0.val[0];
+ a[1] = d1.val[0];
+ a[2] = d2.val[0];
+ a[3] = d3.val[0];
+ a[4] = d0.val[1];
+ a[5] = d1.val[1];
+ a[6] = d2.val[1];
+ a[7] = d3.val[1];
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87
+// a[1]: 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97
+// a[2]: 20 21 22 23 24 25 26 27 a0 a1 a2 a3 a4 a5 a6 a7
+// a[3]: 30 31 32 33 34 35 36 37 b0 b1 b2 b3 b4 b5 b6 b7
+// a[4]: 40 41 42 43 44 45 46 47 c0 c1 c2 c3 c4 c5 c6 c7
+// a[5]: 50 51 52 53 54 55 56 57 d0 d1 d2 d3 d4 d5 d6 d7
+// a[6]: 60 61 62 63 64 65 66 67 e0 e1 e2 e3 e4 e5 e6 e7
+// a[7]: 70 71 72 73 74 75 76 77 f0 f1 f2 f3 f4 f5 f6 f7
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0
+// a[1]: 01 11 21 31 41 51 61 71 81 91 a1 b1 c1 d1 e1 f1
+// a[2]: 02 12 22 32 42 52 62 72 82 92 a2 b2 c2 d2 e2 f2
+// a[3]: 03 13 23 33 43 53 63 73 83 93 a3 b3 c3 d3 e3 f3
+// a[4]: 04 14 24 34 44 54 64 74 84 94 a4 b4 c4 d4 e4 f4
+// a[5]: 05 15 25 35 45 55 65 75 85 95 a5 b5 c5 d5 e5 f5
+// a[6]: 06 16 26 36 46 56 66 76 86 96 a6 b6 c6 d6 e6 f6
+// a[7]: 07 17 27 37 47 57 67 77 87 97 a7 b7 c7 d7 e7 f7
+inline void Transpose8x16(uint8x16_t a[8]) {
+ // b0.val[0]: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
+ // b0.val[1]: 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
+ // b1.val[0]: 20 30 22 32 24 34 26 36 a0 b0 a2 b2 a4 b4 a6 b6
+ // b1.val[1]: 21 31 23 33 25 35 27 37 a1 b1 a3 b3 a5 b5 a7 b7
+ // b2.val[0]: 40 50 42 52 44 54 46 56 c0 d0 c2 d2 c4 d4 c6 d6
+ // b2.val[1]: 41 51 43 53 45 55 47 57 c1 d1 c3 d3 c5 d5 c7 d7
+ // b3.val[0]: 60 70 62 72 64 74 66 76 e0 f0 e2 f2 e4 f4 e6 f6
+ // b3.val[1]: 61 71 63 73 65 75 67 77 e1 f1 e3 f3 e5 f5 e7 f7
+ const uint8x16x2_t b0 = vtrnq_u8(a[0], a[1]);
+ const uint8x16x2_t b1 = vtrnq_u8(a[2], a[3]);
+ const uint8x16x2_t b2 = vtrnq_u8(a[4], a[5]);
+ const uint8x16x2_t b3 = vtrnq_u8(a[6], a[7]);
+
+ // c0.val[0]: 00 10 20 30 04 14 24 34 80 90 a0 b0 84 94 a4 b4
+ // c0.val[1]: 02 12 22 32 06 16 26 36 82 92 a2 b2 86 96 a6 b6
+ // c1.val[0]: 01 11 21 31 05 15 25 35 81 91 a1 b1 85 95 a5 b5
+ // c1.val[1]: 03 13 23 33 07 17 27 37 83 93 a3 b3 87 97 a7 b7
+ // c2.val[0]: 40 50 60 70 44 54 64 74 c0 d0 e0 f0 c4 d4 e4 f4
+ // c2.val[1]: 42 52 62 72 46 56 66 76 c2 d2 e2 f2 c6 d6 e6 f6
+ // c3.val[0]: 41 51 61 71 45 55 65 75 c1 d1 e1 f1 c5 d5 e5 f5
+ // c3.val[1]: 43 53 63 73 47 57 67 77 c3 d3 e3 f3 c7 d7 e7 f7
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+ const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
+ vreinterpretq_u16_u8(b3.val[0]));
+ const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
+ vreinterpretq_u16_u8(b3.val[1]));
+
+ // d0.val[0]: 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0
+ // d0.val[1]: 04 14 24 34 44 54 64 74 84 94 a4 b4 c4 d4 e4 f4
+ // d1.val[0]: 01 11 21 31 41 51 61 71 81 91 a1 b1 c1 d1 e1 f1
+ // d1.val[1]: 05 15 25 35 45 55 65 75 85 95 a5 b5 c5 d5 e5 f5
+ // d2.val[0]: 02 12 22 32 42 52 62 72 82 92 a2 b2 c2 d2 e2 f2
+ // d2.val[1]: 06 16 26 36 46 56 66 76 86 96 a6 b6 c6 d6 e6 f6
+ // d3.val[0]: 03 13 23 33 43 53 63 73 83 93 a3 b3 c3 d3 e3 f3
+ // d3.val[1]: 07 17 27 37 47 57 67 77 87 97 a7 b7 c7 d7 e7 f7
+ const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c2.val[0]));
+ const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
+ vreinterpretq_u32_u16(c3.val[0]));
+ const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c2.val[1]));
+ const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
+ vreinterpretq_u32_u16(c3.val[1]));
+
+ a[0] = vreinterpretq_u8_u32(d0.val[0]);
+ a[1] = vreinterpretq_u8_u32(d1.val[0]);
+ a[2] = vreinterpretq_u8_u32(d2.val[0]);
+ a[3] = vreinterpretq_u8_u32(d3.val[0]);
+ a[4] = vreinterpretq_u8_u32(d0.val[1]);
+ a[5] = vreinterpretq_u8_u32(d1.val[1]);
+ a[6] = vreinterpretq_u8_u32(d2.val[1]);
+ a[7] = vreinterpretq_u8_u32(d3.val[1]);
+}
+
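+// The reinterpret in ZeroExtend() is safe: the widened values are at most
+// 255, so the sign bit of each int16_t lane is never set.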
+inline int16x8_t ZeroExtend(const uint8x8_t in) {
+ return vreinterpretq_s16_u16(vmovl_u8(in));
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_ENABLE_NEON
+#endif // LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/arm/common_neon.h"
+
+#include "gtest/gtest.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+#include <cstdint>
+
+#include "tests/block_utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockWidth = 16;
+constexpr int kMaxBlockHeight = 16;
+
+template <typename Pixel>
+class TransposeTest : public testing::Test {
+ public:
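+  // Fill the source with y * 16 + x so each value reads as the hex digits
+  // (row, column); the expected transpose simply swaps those digits.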
+ TransposeTest() {
+ for (int y = 0; y < kMaxBlockHeight; ++y) {
+ for (int x = 0; x < kMaxBlockWidth; ++x) {
+ src_block_[y][x] = y * 16 + x;
+ expected_transpose_[y][x] = x * 16 + y;
+ }
+ }
+ }
+
+ TransposeTest(const TransposeTest&) = delete;
+ TransposeTest& operator=(const TransposeTest&) = delete;
+ ~TransposeTest() override = default;
+
+ protected:
+ Pixel src_block_[kMaxBlockHeight][kMaxBlockWidth];
+ Pixel expected_transpose_[kMaxBlockHeight][kMaxBlockWidth];
+};
+
+using TransposeTestLowBitdepth = TransposeTest<uint8_t>;
+
+TEST_F(TransposeTestLowBitdepth, Transpose4x4Test) {
+ uint8x8_t a = Load4<1>(src_block_[1], Load4(src_block_[0]));
+ uint8x8_t b = Load4<1>(src_block_[3], Load4(src_block_[2]));
+ Transpose4x4(&a, &b);
+ uint8_t output_4x4[4][4];
+ StoreLo4(output_4x4[0], a);
+ StoreLo4(output_4x4[1], b);
+ StoreHi4(output_4x4[2], a);
+ StoreHi4(output_4x4[3], b);
+ EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_4x4[0],
+ 4, 4, kMaxBlockWidth, 4, false));
+}
+
+TEST_F(TransposeTestLowBitdepth, Transpose8x4Test) {
+ uint8x8_t a0 = Load4<1>(src_block_[4], Load4(src_block_[0]));
+ uint8x8_t a1 = Load4<1>(src_block_[5], Load4(src_block_[1]));
+ uint8x8_t a2 = Load4<1>(src_block_[6], Load4(src_block_[2]));
+ uint8x8_t a3 = Load4<1>(src_block_[7], Load4(src_block_[3]));
+ Transpose8x4(&a0, &a1, &a2, &a3);
+ uint8_t output_8x4[4][8];
+ vst1_u8(output_8x4[0], a0);
+ vst1_u8(output_8x4[1], a1);
+ vst1_u8(output_8x4[2], a2);
+ vst1_u8(output_8x4[3], a3);
+ EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_8x4[0],
+ 8, 4, kMaxBlockWidth, 8, false));
+}
+
+TEST_F(TransposeTestLowBitdepth, Transpose8x8Test) {
+ uint8x8_t input_8x8[8];
+ for (int i = 0; i < 8; ++i) {
+ input_8x8[i] = vld1_u8(src_block_[i]);
+ }
+ Transpose8x8(input_8x8);
+ uint8_t output_8x8[8][8];
+ for (int i = 0; i < 8; ++i) {
+ vst1_u8(output_8x8[i], input_8x8[i]);
+ }
+ EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_8x8[0],
+ 8, 8, kMaxBlockWidth, 8, false));
+}
+
+TEST_F(TransposeTestLowBitdepth, Transpose8x16Test) {
+ uint8x16_t input_8x16[8];
+ for (int i = 0; i < 8; ++i) {
+ input_8x16[i] =
+ vcombine_u8(vld1_u8(src_block_[i]), vld1_u8(src_block_[i + 8]));
+ }
+ Transpose8x16(input_8x16);
+ uint8_t output_16x8[8][16];
+ for (int i = 0; i < 8; ++i) {
+ vst1q_u8(output_16x8[i], input_8x16[i]);
+ }
+ EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_16x8[0],
+ 16, 8, kMaxBlockWidth, 16, false));
+}
+
+using TransposeTestHighBitdepth = TransposeTest<uint16_t>;
+
+TEST_F(TransposeTestHighBitdepth, Transpose4x4Test) {
+ uint16x4_t input_4x4[4];
+ input_4x4[0] = vld1_u16(src_block_[0]);
+ input_4x4[1] = vld1_u16(src_block_[1]);
+ input_4x4[2] = vld1_u16(src_block_[2]);
+ input_4x4[3] = vld1_u16(src_block_[3]);
+ Transpose4x4(input_4x4);
+ uint16_t output_4x4[4][4];
+ for (int i = 0; i < 4; ++i) {
+ vst1_u16(output_4x4[i], input_4x4[i]);
+ }
+ EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_4x4[0],
+ 4, 4, kMaxBlockWidth, 4, false));
+}
+
+TEST_F(TransposeTestHighBitdepth, Transpose4x8Test) {
+ uint16x8_t input_4x8[4];
+ for (int i = 0; i < 4; ++i) {
+ input_4x8[i] = vld1q_u16(src_block_[i]);
+ }
+ Transpose4x8(input_4x8);
+ uint16_t output_4x8[4][8];
+ for (int i = 0; i < 4; ++i) {
+ vst1q_u16(output_4x8[i], input_4x8[i]);
+ memcpy(&expected_transpose_[i][4], &expected_transpose_[i + 4][0],
+ 4 * sizeof(expected_transpose_[0][0]));
+ }
+ EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_4x8[0],
+ 8, 4, kMaxBlockWidth, 8, false));
+}
+
+TEST_F(TransposeTestHighBitdepth, LoopFilterTranspose4x8Test) {
+ uint16x8_t input_4x8[4];
+ for (int i = 0; i < 4; ++i) {
+ input_4x8[i] = vld1q_u16(src_block_[i]);
+ }
+ LoopFilterTranspose4x8(input_4x8);
+ uint16_t output_4x8[4][8];
+ for (int i = 0; i < 4; ++i) {
+ vst1q_u16(output_4x8[i], input_4x8[i]);
+ }
+ // a[0]: 03 13 23 33 04 14 24 34 p0q0
+ // a[1]: 02 12 22 32 05 15 25 35 p1q1
+ // a[2]: 01 11 21 31 06 16 26 36 p2q2
+ // a[3]: 00 10 20 30 07 17 27 37 p3q3
+ static constexpr uint16_t expected_output[4][8] = {
+ {0x03, 0x13, 0x23, 0x33, 0x04, 0x14, 0x24, 0x34},
+ {0x02, 0x12, 0x22, 0x32, 0x05, 0x15, 0x25, 0x35},
+ {0x01, 0x11, 0x21, 0x31, 0x06, 0x16, 0x26, 0x36},
+ {0x00, 0x10, 0x20, 0x30, 0x07, 0x17, 0x27, 0x37},
+ };
+ EXPECT_TRUE(test_utils::CompareBlocks(expected_output[0], output_4x8[0], 8, 4,
+ 8, 8, false));
+}
+
+TEST_F(TransposeTestHighBitdepth, Transpose8x8Test) {
+ uint16x8_t input_8x8[8];
+ for (int i = 0; i < 8; ++i) {
+ input_8x8[i] = vld1q_u16(src_block_[i]);
+ }
+ Transpose8x8(input_8x8);
+ uint16_t output_8x8[8][8];
+ for (int i = 0; i < 8; ++i) {
+ vst1q_u16(output_8x8[i], input_8x8[i]);
+ }
+ EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_8x8[0],
+ 8, 8, kMaxBlockWidth, 8, false));
+}
+
+TEST_F(TransposeTestHighBitdepth, Transpose8x8SignedTest) {
+ int16x8_t input_8x8[8];
+ for (int i = 0; i < 8; ++i) {
+ input_8x8[i] = vreinterpretq_s16_u16(vld1q_u16(src_block_[i]));
+ }
+ Transpose8x8(input_8x8);
+ uint16_t output_8x8[8][8];
+ for (int i = 0; i < 8; ++i) {
+ vst1q_u16(output_8x8[i], vreinterpretq_u16_s16(input_8x8[i]));
+ }
+ EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_8x8[0],
+ 8, 8, kMaxBlockWidth, 8, false));
+}
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+TEST(CommonDspTest, NEON) {
+ GTEST_SKIP()
+ << "Build this module for Arm with NEON enabled to enable the tests.";
+}
+
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/convolve.inc"
+
+// Output of ConvolveTest.ShowRange below.
+// Bitdepth: 10 Input range: [ 0, 1023]
+// Horizontal base upscaled range: [ -28644, 94116]
+// Horizontal halved upscaled range: [ -14322, 47085]
+// Horizontal downscaled range: [ -7161, 23529]
+// Vertical upscaled range: [-1317624, 2365176]
+// Pixel output range: [ 0, 1023]
+// Compound output range: [ 3988, 61532]
+
+template <int num_taps>
+int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
+ const int16x4_t* const taps) {
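+  // Reinterpreting the unsigned source as signed is safe: the inputs here are
+  // 10-bit pixels, which fit comfortably in int16_t.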
+ const auto* ssrc = reinterpret_cast<const int16x8_t*>(src);
+ int32x4x2_t sum;
+ if (num_taps == 6) {
+ // 6 taps.
+ sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[4]), taps[4]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[5]), taps[5]);
+
+ sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]);
+ } else if (num_taps == 8) {
+ // 8 taps.
+ sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[4]), taps[4]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[5]), taps[5]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[6]), taps[6]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[7]), taps[7]);
+
+ sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[6]), taps[6]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[7]), taps[7]);
+ } else if (num_taps == 2) {
+ // 2 taps.
+ sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
+
+ sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
+ } else {
+ // 4 taps.
+ sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]);
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]);
+
+ sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]);
+ sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
+ }
+ return sum;
+}
+
+template <int num_taps>
+int32x4_t SumOnePassTaps(const uint16x4_t* const src,
+ const int16x4_t* const taps) {
+ const auto* ssrc = reinterpret_cast<const int16x4_t*>(src);
+ int32x4_t sum;
+ if (num_taps == 6) {
+ // 6 taps.
+ sum = vmull_s16(ssrc[0], taps[0]);
+ sum = vmlal_s16(sum, ssrc[1], taps[1]);
+ sum = vmlal_s16(sum, ssrc[2], taps[2]);
+ sum = vmlal_s16(sum, ssrc[3], taps[3]);
+ sum = vmlal_s16(sum, ssrc[4], taps[4]);
+ sum = vmlal_s16(sum, ssrc[5], taps[5]);
+ } else if (num_taps == 8) {
+ // 8 taps.
+ sum = vmull_s16(ssrc[0], taps[0]);
+ sum = vmlal_s16(sum, ssrc[1], taps[1]);
+ sum = vmlal_s16(sum, ssrc[2], taps[2]);
+ sum = vmlal_s16(sum, ssrc[3], taps[3]);
+ sum = vmlal_s16(sum, ssrc[4], taps[4]);
+ sum = vmlal_s16(sum, ssrc[5], taps[5]);
+ sum = vmlal_s16(sum, ssrc[6], taps[6]);
+ sum = vmlal_s16(sum, ssrc[7], taps[7]);
+ } else if (num_taps == 2) {
+ // 2 taps.
+ sum = vmull_s16(ssrc[0], taps[0]);
+ sum = vmlal_s16(sum, ssrc[1], taps[1]);
+ } else {
+ // 4 taps.
+ sum = vmull_s16(ssrc[0], taps[0]);
+ sum = vmlal_s16(sum, ssrc[1], taps[1]);
+ sum = vmlal_s16(sum, ssrc[2], taps[2]);
+ sum = vmlal_s16(sum, ssrc[3], taps[3]);
+ }
+ return sum;
+}
+
+template <int num_taps, bool is_compound, bool is_2d>
+void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int width,
+ const int height,
+ const int16x4_t* const v_tap) {
+ auto* dest16 = static_cast<uint16_t*>(dest);
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+ if (is_2d) {
+ int x = 0;
+ do {
+ const uint16_t* s = src + x;
+ int y = height;
+      do {  // Iterating |x| in the increasing outer loop is faster here.
+ const uint16x8_t src_long = vld1q_u16(s);
+ const uint16x8_t src_long_hi = vld1q_u16(s + 8);
+ uint16x8_t v_src[8];
+ int32x4x2_t v_sum;
+ if (num_taps == 6) {
+ v_src[0] = src_long;
+ v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+ v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+ v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+ v_src[4] = vextq_u16(src_long, src_long_hi, 4);
+ v_src[5] = vextq_u16(src_long, src_long_hi, 5);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1);
+ } else if (num_taps == 8) {
+ v_src[0] = src_long;
+ v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+ v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+ v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+ v_src[4] = vextq_u16(src_long, src_long_hi, 4);
+ v_src[5] = vextq_u16(src_long, src_long_hi, 5);
+ v_src[6] = vextq_u16(src_long, src_long_hi, 6);
+ v_src[7] = vextq_u16(src_long, src_long_hi, 7);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap);
+ } else if (num_taps == 2) {
+ v_src[0] = src_long;
+ v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
+ } else { // 4 taps
+ v_src[0] = src_long;
+ v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+ v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+ v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
+ }
+
+ const int16x4_t d0 =
+ vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1);
+ const int16x4_t d1 =
+ vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1);
+ vst1_u16(&dest16[0], vreinterpret_u16_s16(d0));
+ vst1_u16(&dest16[4], vreinterpret_u16_s16(d1));
+ s += src_stride;
+ dest16 += 8;
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
+ return;
+ }
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const uint16x8_t src_long = vld1q_u16(src + x);
+ const uint16x8_t src_long_hi = vld1q_u16(src + x + 8);
+ uint16x8_t v_src[8];
+ int32x4x2_t v_sum;
+ if (num_taps == 6) {
+ v_src[0] = src_long;
+ v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+ v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+ v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+ v_src[4] = vextq_u16(src_long, src_long_hi, 4);
+ v_src[5] = vextq_u16(src_long, src_long_hi, 5);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1);
+ } else if (num_taps == 8) {
+ v_src[0] = src_long;
+ v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+ v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+ v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+ v_src[4] = vextq_u16(src_long, src_long_hi, 4);
+ v_src[5] = vextq_u16(src_long, src_long_hi, 5);
+ v_src[6] = vextq_u16(src_long, src_long_hi, 6);
+ v_src[7] = vextq_u16(src_long, src_long_hi, 7);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap);
+ } else if (num_taps == 2) {
+ v_src[0] = src_long;
+ v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
+ } else { // 4 taps
+ v_src[0] = src_long;
+ v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+ v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+ v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
+ }
+ if (is_compound) {
+ const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
+ const int16x4_t d0 =
+ vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1);
+ const int16x4_t d1 =
+ vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1);
+ vst1_u16(&dest16[x],
+ vreinterpret_u16_s16(vadd_s16(d0, v_compound_offset)));
+ vst1_u16(&dest16[x + 4],
+ vreinterpret_u16_s16(vadd_s16(d1, v_compound_offset)));
+ } else {
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+ // Combining them requires adding the rounding offset from the skipped
+ // shift.
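+        // A worked sketch, assuming kInterRoundBitsHorizontal == 3 and
+        // kFilterBits == 7 (the -1s account for the halved filters): the two
+        // passes compute ((x + 2) >> 2 + 8) >> 4 ~= (x + 2 + 32) >> 6, while
+        // a single vqrshrun by 6 only adds 32. The missing offset is
+        // 1 << (kInterRoundBitsHorizontal - 2) == 2.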
+ const int32x4_t v_first_shift_rounding_bit =
+ vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
+ v_sum.val[0] = vaddq_s32(v_sum.val[0], v_first_shift_rounding_bit);
+ v_sum.val[1] = vaddq_s32(v_sum.val[1], v_first_shift_rounding_bit);
+ const uint16x4_t d0 = vmin_u16(
+ vqrshrun_n_s32(v_sum.val[0], kFilterBits - 1), v_max_bitdepth);
+ const uint16x4_t d1 = vmin_u16(
+ vqrshrun_n_s32(v_sum.val[1], kFilterBits - 1), v_max_bitdepth);
+ vst1_u16(&dest16[x], d0);
+ vst1_u16(&dest16[x + 4], d1);
+ }
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+}
+
+template <int num_taps, bool is_compound, bool is_2d>
+void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int height,
+ const int16x4_t* const v_tap) {
+ auto* dest16 = static_cast<uint16_t*>(dest);
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+ int y = height;
+ do {
+ const uint16x8_t v_zero = vdupq_n_u16(0);
+ uint16x4_t v_src[4];
+ int32x4_t v_sum;
+ const uint16x8_t src_long = vld1q_u16(src);
+ v_src[0] = vget_low_u16(src_long);
+ if (num_taps == 2) {
+ v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
+ } else {
+ v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
+ v_src[2] = vget_low_u16(vextq_u16(src_long, v_zero, 2));
+ v_src[3] = vget_low_u16(vextq_u16(src_long, v_zero, 3));
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
+ }
+ if (is_compound || is_2d) {
+ const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
+ if (is_compound && !is_2d) {
+ vst1_u16(&dest16[0], vreinterpret_u16_s16(
+ vadd_s16(d0, vdup_n_s16(kCompoundOffset))));
+ } else {
+ vst1_u16(&dest16[0], vreinterpret_u16_s16(d0));
+ }
+ } else {
+ const int32x4_t v_first_shift_rounding_bit =
+ vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
+ v_sum = vaddq_s32(v_sum, v_first_shift_rounding_bit);
+ const uint16x4_t d0 =
+ vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
+ vst1_u16(&dest16[0], d0);
+ }
+ src += src_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+}
+
+template <int num_taps, bool is_2d>
+void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int height,
+ const int16x4_t* const v_tap) {
+ auto* dest16 = static_cast<uint16_t*>(dest);
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+ int y = height >> 1;
+ do {
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ const int16x8_t input0 = vreinterpretq_s16_u16(vld1q_u16(src));
+ const int16x8_t input1 = vreinterpretq_s16_u16(vld1q_u16(src + src_stride));
+ const int16x8x2_t input = vzipq_s16(input0, input1);
+ int32x4_t v_sum;
+ if (num_taps == 2) {
+ v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[3]);
+ v_sum = vmlal_s16(v_sum,
+ vget_low_s16(vextq_s16(input.val[0], input.val[1], 2)),
+ v_tap[4]);
+ } else {
+ v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[2]);
+ v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], v_zero, 2)),
+ v_tap[3]);
+ v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], v_zero, 4)),
+ v_tap[4]);
+ v_sum = vmlal_s16(v_sum,
+ vget_low_s16(vextq_s16(input.val[0], input.val[1], 6)),
+ v_tap[5]);
+ }
+ if (is_2d) {
+ const uint16x4_t d0 = vreinterpret_u16_s16(
+ vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1));
+ dest16[0] = vget_lane_u16(d0, 0);
+ dest16[1] = vget_lane_u16(d0, 2);
+ dest16 += pred_stride;
+ dest16[0] = vget_lane_u16(d0, 1);
+ dest16[1] = vget_lane_u16(d0, 3);
+ dest16 += pred_stride;
+ } else {
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+ // Combining them requires adding the rounding offset from the skipped
+ // shift.
+ const int32x4_t v_first_shift_rounding_bit =
+ vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
+ v_sum = vaddq_s32(v_sum, v_first_shift_rounding_bit);
+ const uint16x4_t d0 =
+ vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
+ dest16[0] = vget_lane_u16(d0, 0);
+ dest16[1] = vget_lane_u16(d0, 2);
+ dest16 += pred_stride;
+ dest16[0] = vget_lane_u16(d0, 1);
+ dest16[1] = vget_lane_u16(d0, 3);
+ dest16 += pred_stride;
+ }
+ src += src_stride << 1;
+ } while (--y != 0);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
+ if (is_2d) {
+ assert(height % 2 == 1);
+ const int16x8_t input = vreinterpretq_s16_u16(vld1q_u16(src));
+ int32x4_t v_sum;
+ if (num_taps == 2) {
+ v_sum = vmull_s16(vget_low_s16(input), v_tap[3]);
+ v_sum =
+ vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[4]);
+ } else {
+ v_sum = vmull_s16(vget_low_s16(input), v_tap[2]);
+ v_sum =
+ vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[3]);
+ v_sum =
+ vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 2)), v_tap[4]);
+ v_sum =
+ vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 3)), v_tap[5]);
+ }
+ const uint16x4_t d0 = vreinterpret_u16_s16(
+ vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1));
+ Store2<0>(dest16, d0);
+ }
+}
+
+template <int num_taps, bool is_compound, bool is_2d>
+void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int width,
+ const int height, const int16x4_t* const v_tap) {
+  // Horizontal passes only need to account for tap counts of 2 and 4 when
+  // |width| <= 4.
+ assert(width <= 4);
+ assert(num_taps == 2 || num_taps == 4);
+ if (num_taps == 2 || num_taps == 4) {
+ if (width == 2 && !is_compound) {
+ FilterHorizontalWidth2<num_taps, is_2d>(src, src_stride, dest,
+ pred_stride, height, v_tap);
+ return;
+ }
+ assert(width == 4);
+ FilterHorizontalWidth4<num_taps, is_compound, is_2d>(
+ src, src_stride, dest, pred_stride, height, v_tap);
+ } else {
+ assert(false);
+ }
+}
+
+template <bool is_compound = false, bool is_2d = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+ const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+ const int width, const int height, const int filter_id,
+ const int filter_index) {
+  // Duplicate each (signed) tap across a vector. The signed multiplies
+  // (vmull_s16/vmlal_s16) handle negative taps directly.
+ int16x4_t v_tap[kSubPixelTaps];
+ assert(filter_id != 0);
+
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ v_tap[k] = vdup_n_s16(kHalfSubPixelFilters[filter_index][filter_id][k]);
+ }
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [3, 5].
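+  // E.g. GetFilterIndex() maps the 8-tap filters (indices 0 and 2) to 4 and
+  // the 8-tap smooth filter (index 1) to 5 when width <= 4; bilinear (3)
+  // passes through unchanged.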
+ if (width >= 8) {
+ if (filter_index == 2) { // 8 tap.
+ FilterHorizontalWidth8AndUp<8, is_compound, is_2d>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index < 2) { // 6 tap.
+ FilterHorizontalWidth8AndUp<6, is_compound, is_2d>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ assert(filter_index == 3);
+ FilterHorizontalWidth8AndUp<2, is_compound, is_2d>(
+ src + 3, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+ } else {
+ if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
+ FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst,
+ dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ assert(filter_index == 3);
+ FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst,
+ dst_stride, width, height, v_tap);
+ }
+ }
+}
+
+void ConvolveHorizontal_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
+ const auto* const src =
+ static_cast<const uint16_t*>(reference) - kHorizontalOffset;
+ auto* const dest = static_cast<uint16_t*>(prediction);
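+  // The strides are given in bytes; convert them to uint16_t element strides.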
+ const ptrdiff_t src_stride = reference_stride >> 1;
+ const ptrdiff_t dst_stride = pred_stride >> 1;
+
+ DoHorizontalPass(src, src_stride, dest, dst_stride, width, height,
+ horizontal_filter_id, filter_index);
+}
+
+void ConvolveCompoundHorizontal_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const auto* const src =
+ static_cast<const uint16_t*>(reference) - kHorizontalOffset;
+ auto* const dest = static_cast<uint16_t*>(prediction);
+ const ptrdiff_t src_stride = reference_stride >> 1;
+
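+  // Compound prediction output is tightly packed, so |width| is used as the
+  // destination stride; the byte-oriented |pred_stride| parameter is unused.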
+ DoHorizontalPass</*is_compound=*/true>(src, src_stride, dest, width, width,
+ height, horizontal_filter_id,
+ filter_index);
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const int16x4_t* const taps) {
+ const int next_row = num_taps - 1;
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+ auto* const dst16 = static_cast<uint16_t*>(dst);
+ assert(width >= 8);
+
+ int x = 0;
+ do {
+ const uint16_t* src_x = src + x;
+ uint16x8_t srcs[8];
+ srcs[0] = vld1q_u16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = vld1q_u16(src_x);
+ src_x += src_stride;
+ srcs[2] = vld1q_u16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = vld1q_u16(src_x);
+ src_x += src_stride;
+ srcs[4] = vld1q_u16(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = vld1q_u16(src_x);
+ src_x += src_stride;
+ srcs[6] = vld1q_u16(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ // Decreasing the y loop counter produces worse code with clang.
+    // Don't unroll this loop since it generates too much code, which makes
+    // the decoder even slower.
+ int y = 0;
+ do {
+ srcs[next_row] = vld1q_u16(src_x);
+ src_x += src_stride;
+
+ const int32x4x2_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
+ if (is_compound) {
+ const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
+ const int16x4_t d0 =
+ vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1);
+ const int16x4_t d1 =
+ vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1);
+ vst1_u16(dst16 + x + y * dst_stride,
+ vreinterpret_u16_s16(vadd_s16(d0, v_compound_offset)));
+ vst1_u16(dst16 + x + 4 + y * dst_stride,
+ vreinterpret_u16_s16(vadd_s16(d1, v_compound_offset)));
+ } else {
+ const uint16x4_t d0 = vmin_u16(
+ vqrshrun_n_s32(v_sum.val[0], kFilterBits - 1), v_max_bitdepth);
+ const uint16x4_t d1 = vmin_u16(
+ vqrshrun_n_s32(v_sum.val[1], kFilterBits - 1), v_max_bitdepth);
+ vst1_u16(dst16 + x + y * dst_stride, d0);
+ vst1_u16(dst16 + x + 4 + y * dst_stride, d1);
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x4_t* const taps) {
+ const int next_row = num_taps - 1;
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ uint16x4_t srcs[9];
+ srcs[0] = vld1_u16(src);
+ src += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = vld1_u16(src);
+ src += src_stride;
+ srcs[2] = vld1_u16(src);
+ src += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = vld1_u16(src);
+ src += src_stride;
+ srcs[4] = vld1_u16(src);
+ src += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = vld1_u16(src);
+ src += src_stride;
+ srcs[6] = vld1_u16(src);
+ src += src_stride;
+ }
+ }
+ }
+
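+  // Two rows are filtered per iteration; every 4xH block size has an even
+  // |height|.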
+ int y = height;
+ do {
+ srcs[next_row] = vld1_u16(src);
+ src += src_stride;
+ srcs[num_taps] = vld1_u16(src);
+ src += src_stride;
+
+ const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
+ const int32x4_t v_sum_1 = SumOnePassTaps<num_taps>(srcs + 1, taps);
+ if (is_compound) {
+ const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
+ const int16x4_t d1 =
+ vqrshrn_n_s32(v_sum_1, kInterRoundBitsHorizontal - 1);
+ vst1_u16(dst16,
+ vreinterpret_u16_s16(vadd_s16(d0, vdup_n_s16(kCompoundOffset))));
+ dst16 += dst_stride;
+ vst1_u16(dst16,
+ vreinterpret_u16_s16(vadd_s16(d1, vdup_n_s16(kCompoundOffset))));
+ dst16 += dst_stride;
+ } else {
+ const uint16x4_t d0 =
+ vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
+ const uint16x4_t d1 =
+ vmin_u16(vqrshrun_n_s32(v_sum_1, kFilterBits - 1), v_max_bitdepth);
+ vst1_u16(dst16, d0);
+ dst16 += dst_stride;
+ vst1_u16(dst16, d1);
+ dst16 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int num_taps>
+void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x4_t* const taps) {
+ const int next_row = num_taps - 1;
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+ const uint16x4_t v_zero = vdup_n_u16(0);
+
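+  // Each uint16x4_t packs two consecutive 2-pixel rows: row n in lanes 0-1
+  // and row n + 1 in lanes 2-3. The odd-numbered |srcs| are assembled with
+  // vext_u16 so each tap's vector starts at its own row.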
+ uint16x4_t srcs[9];
+ srcs[0] = Load2<0>(src, v_zero);
+ src += src_stride;
+ if (num_taps >= 4) {
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load2<0>(src, v_zero);
+ src += src_stride;
+ srcs[1] = vext_u16(srcs[0], srcs[2], 2);
+ if (num_taps >= 6) {
+ srcs[2] = Load2<1>(src, srcs[2]);
+ src += src_stride;
+ srcs[4] = Load2<0>(src, v_zero);
+ src += src_stride;
+ srcs[3] = vext_u16(srcs[2], srcs[4], 2);
+ if (num_taps == 8) {
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[6] = Load2<0>(src, v_zero);
+ src += src_stride;
+ srcs[5] = vext_u16(srcs[4], srcs[6], 2);
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[next_row - 1] = Load2<1>(src, srcs[next_row - 1]);
+ src += src_stride;
+ srcs[num_taps] = Load2<0>(src, v_zero);
+ src += src_stride;
+ srcs[next_row] = vext_u16(srcs[next_row - 1], srcs[num_taps], 2);
+
+ const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
+ const uint16x4_t d0 =
+ vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
+ Store2<0>(dst16, d0);
+ dst16 += dst_stride;
+ Store2<1>(dst16, d0);
+ dst16 += dst_stride;
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+}
+
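+// Sum |num_taps| rows of 2D intermediate data against the vertical taps,
+// which are packed into a single int16x8_t and applied with lane-indexed
+// multiplies to keep register pressure low.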
+template <int num_taps, bool is_compound>
+int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
+ const int16x8_t taps) {
+ const int16x4_t taps_lo = vget_low_s16(taps);
+ const int16x4_t taps_hi = vget_high_s16(taps);
+ int32x4_t sum_lo, sum_hi;
+ if (num_taps == 8) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
+ } else if (num_taps == 6) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
+ } else if (num_taps == 4) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
+ } else if (num_taps == 2) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
+ }
+
+ if (is_compound) {
+ // Output is compound, so leave signed and do not saturate. Offset will
+ // accurately bring the value back into positive range.
+ return vcombine_s16(
+ vrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ vrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
+ }
+
+ // Output is pixel, so saturate to clip at 0.
+ return vreinterpretq_s16_u16(
+ vcombine_u16(vqrshrun_n_s32(sum_lo, kInterRoundBitsVertical - 1),
+ vqrshrun_n_s32(sum_hi, kInterRoundBitsVertical - 1)));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVerticalWidth8AndUp(const int16_t* LIBGAV1_RESTRICT src,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const int16x8_t taps) {
+ assert(width >= 8);
+ constexpr int next_row = num_taps - 1;
+ const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+ auto* const dst16 = static_cast<uint16_t*>(dst);
+
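+  // Each load advances |src| by 8 with no per-row stride, so the intermediate
+  // buffer is consumed as contiguous 8-wide column strips; only the output
+  // pointer |d16| steps by |dst_stride|.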
+ int x = 0;
+ do {
+ int16x8_t srcs[9];
+ srcs[0] = vld1q_s16(src);
+ src += 8;
+ if (num_taps >= 4) {
+ srcs[1] = vld1q_s16(src);
+ src += 8;
+ srcs[2] = vld1q_s16(src);
+ src += 8;
+ if (num_taps >= 6) {
+ srcs[3] = vld1q_s16(src);
+ src += 8;
+ srcs[4] = vld1q_s16(src);
+ src += 8;
+ if (num_taps == 8) {
+ srcs[5] = vld1q_s16(src);
+ src += 8;
+ srcs[6] = vld1q_s16(src);
+ src += 8;
+ }
+ }
+ }
+
+ uint16_t* d16 = dst16 + x;
+ int y = height;
+ do {
+ srcs[next_row] = vld1q_s16(src);
+ src += 8;
+ srcs[next_row + 1] = vld1q_s16(src);
+ src += 8;
+ const int16x8_t sum0 =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps);
+ const int16x8_t sum1 =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps);
+ if (is_compound) {
+ const int16x8_t v_compound_offset = vdupq_n_s16(kCompoundOffset);
+ vst1q_u16(d16,
+ vreinterpretq_u16_s16(vaddq_s16(sum0, v_compound_offset)));
+ d16 += dst_stride;
+ vst1q_u16(d16,
+ vreinterpretq_u16_s16(vaddq_s16(sum1, v_compound_offset)));
+ d16 += dst_stride;
+ } else {
+ vst1q_u16(d16, vminq_u16(vreinterpretq_u16_s16(sum0), v_max_bitdepth));
+ d16 += dst_stride;
+ vst1q_u16(d16, vminq_u16(vreinterpretq_u16_s16(sum1), v_max_bitdepth));
+ d16 += dst_stride;
+ }
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+ x += 8;
+ } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVerticalWidth4(const int16_t* LIBGAV1_RESTRICT src,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
+ const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
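+  // Each 128-bit register holds two consecutive 4-pixel rows. Even-indexed
+  // entries are loaded directly; odd-indexed entries are built by combining
+  // the halves of their even neighbors.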
+ int16x8_t srcs[9];
+ srcs[0] = vld1q_s16(src);
+ src += 8;
+ if (num_taps >= 4) {
+ srcs[2] = vld1q_s16(src);
+ src += 8;
+ srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
+ if (num_taps >= 6) {
+ srcs[4] = vld1q_s16(src);
+ src += 8;
+ srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
+ if (num_taps == 8) {
+ srcs[6] = vld1q_s16(src);
+ src += 8;
+ srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[num_taps] = vld1q_s16(src);
+ src += 8;
+ srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
+ vget_low_s16(srcs[num_taps]));
+
+ const int16x8_t sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ const int16x8_t v_compound_offset = vdupq_n_s16(kCompoundOffset);
+ vst1q_u16(dst16,
+ vreinterpretq_u16_s16(vaddq_s16(sum, v_compound_offset)));
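+      // Advance past the two 4-wide rows just stored; compound output rows
+      // are packed contiguously.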
+ dst16 += 4 << 1;
+ } else {
+ const uint16x8_t d0 =
+ vminq_u16(vreinterpretq_u16_s16(sum), v_max_bitdepth);
+ vst1_u16(dst16, vget_low_u16(d0));
+ dst16 += dst_stride;
+ vst1_u16(dst16, vget_high_u16(d0));
+ dst16 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+}
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVerticalWidth2(const int16_t* LIBGAV1_RESTRICT src,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
+ constexpr int next_row = (num_taps < 6) ? 4 : 8;
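+  // Each 128-bit register packs four 2-pixel rows, so the incoming load lands
+  // in slot 4 for the 2/4-tap filters, or in slot 8 for the 6/8-tap filters
+  // whose window spans two registers.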
+ const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int16x8_t srcs[9];
+ srcs[0] = vld1q_s16(src);
+ src += 8;
+ if (num_taps >= 6) {
+ srcs[4] = vld1q_s16(src);
+ src += 8;
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ if (num_taps == 8) {
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[next_row] = vld1q_s16(src);
+ src += 8;
+ if (num_taps == 2) {
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ } else if (num_taps == 4) {
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ } else if (num_taps == 6) {
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+ } else if (num_taps == 8) {
+ srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+ srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
+ srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
+ }
+ const int16x8_t sum =
+ SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+ const uint16x8_t d0 = vminq_u16(vreinterpretq_u16_s16(sum), v_max_bitdepth);
+ Store2<0>(dst16, d0);
+ dst16 += dst_stride;
+ Store2<1>(dst16, d0);
+    // Blocks with |height| <= 4 only use the 2- and 4-tap filter variants, so
+    // this early-return check is unnecessary for the larger tap counts.
+ if (num_taps <= 4 && height == 2) return;
+ dst16 += dst_stride;
+ Store2<2>(dst16, d0);
+ dst16 += dst_stride;
+ Store2<3>(dst16, d0);
+ dst16 += dst_stride;
+
+ srcs[0] = srcs[4];
+ if (num_taps == 6) {
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ } else if (num_taps == 8) {
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ }
+
+ y -= 4;
+ } while (y != 0);
+}
+
+template <int vertical_taps>
+void Filter2DVertical(const int16_t* LIBGAV1_RESTRICT const intermediate_result,
+ const int width, const int height, const int16x8_t taps,
+ void* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t pred_stride) {
+ auto* const dest = static_cast<uint16_t*>(prediction);
+ if (width >= 8) {
+ Filter2DVerticalWidth8AndUp<vertical_taps>(
+ intermediate_result, dest, pred_stride, width, height, taps);
+ } else if (width == 4) {
+ Filter2DVerticalWidth4<vertical_taps>(intermediate_result, dest,
+ pred_stride, height, taps);
+ } else {
+ assert(width == 2);
+ Filter2DVerticalWidth2<vertical_taps>(intermediate_result, dest,
+ pred_stride, height, taps);
+ }
+}
+
+void Convolve2D_NEON(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+ memset(intermediate_result, 0x43, sizeof(intermediate_result));
+#endif
+ const int intermediate_height = height + vertical_taps - 1;
+ const ptrdiff_t src_stride = reference_stride >> 1;
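+  // Back |src| up to the top-left of the filter support region:
+  // (vertical_taps / 2 - 1) rows above and kHorizontalOffset columns to the
+  // left of the reference pixel.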
+ const auto* const src = static_cast<const uint16_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
+ const ptrdiff_t dest_stride = pred_stride >> 1;
+
+ DoHorizontalPass</*is_compound=*/false, /*is_2d=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ assert(vertical_filter_id != 0);
+ const int16x8_t taps = vmovl_s8(
+ vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+ if (vertical_taps == 8) {
+ Filter2DVertical<8>(intermediate_result, width, height, taps, prediction,
+ dest_stride);
+ } else if (vertical_taps == 6) {
+ Filter2DVertical<6>(intermediate_result, width, height, taps, prediction,
+ dest_stride);
+ } else if (vertical_taps == 4) {
+ Filter2DVertical<4>(intermediate_result, width, height, taps, prediction,
+ dest_stride);
+ } else { // |vertical_taps| == 2
+ Filter2DVertical<2>(intermediate_result, width, height, taps, prediction,
+ dest_stride);
+ }
+}
+
+template <int vertical_taps>
+void Compound2DVertical(
+ const int16_t* LIBGAV1_RESTRICT const intermediate_result, const int width,
+ const int height, const int16x8_t taps,
+ void* LIBGAV1_RESTRICT const prediction) {
+ auto* const dest = static_cast<uint16_t*>(prediction);
+ if (width == 4) {
+ Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
+ intermediate_result, dest, width, height, taps);
+ } else {
+ Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>(
+ intermediate_result, dest, width, width, height, taps);
+ }
+}
+
+void ConvolveCompound2D_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int vertical_filter_index, const int horizontal_filter_id,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t
+ intermediate_result[(kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1))];
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height = height + vertical_taps - 1;
+ const ptrdiff_t src_stride = reference_stride >> 1;
+ const auto* const src = static_cast<const uint16_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
+
+  DoHorizontalPass</*is_compound=*/true, /*is_2d=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ assert(vertical_filter_id != 0);
+ const int16x8_t taps = vmovl_s8(
+ vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+ if (vertical_taps == 8) {
+ Compound2DVertical<8>(intermediate_result, width, height, taps, prediction);
+ } else if (vertical_taps == 6) {
+ Compound2DVertical<6>(intermediate_result, width, height, taps, prediction);
+ } else if (vertical_taps == 4) {
+ Compound2DVertical<4>(intermediate_result, width, height, taps, prediction);
+ } else { // |vertical_taps| == 2
+ Compound2DVertical<2>(intermediate_result, width, height, taps, prediction);
+ }
+}
+
+void ConvolveVertical_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int vertical_filter_index, const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride >> 1;
+ const auto* src = static_cast<const uint16_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* const dest = static_cast<uint16_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride >> 1;
+ assert(vertical_filter_id != 0);
+
+ int16x4_t taps[8];
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ taps[k] =
+ vdup_n_s16(kHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+ }
+
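+  // |taps| always holds 8 entries; the shorter kernels sit centered within
+  // it, so the dispatch below passes taps + 1 for 6 taps, taps + 2 for 4 taps
+  // and taps + 3 for 2 taps.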
+ if (filter_index == 0) { // 6 tap.
+ if (width == 2) {
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else if (width == 4) {
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else {
+ FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
+ taps + 1);
+ }
+ } else if ((static_cast<int>(filter_index == 1) &
+ (static_cast<int>(vertical_filter_id == 1) |
+ static_cast<int>(vertical_filter_id == 7) |
+ static_cast<int>(vertical_filter_id == 8) |
+ static_cast<int>(vertical_filter_id == 9) |
+ static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap.
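+    // Combining the comparison results with integer & and | avoids the
+    // short-circuit branches of && and || when selecting the 6-tap ids of
+    // filter index 1.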
+ if (width == 2) {
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else if (width == 4) {
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else {
+ FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
+ taps + 1);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ if (width == 2) {
+ FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<8>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ if (width == 2) {
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height,
+ taps + 3);
+ } else if (width == 4) {
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height,
+ taps + 3);
+ } else {
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ taps + 3);
+ }
+ } else {
+ // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+ // below map to 4 tap filters.
+ assert(filter_index == 5 || filter_index == 4 ||
+ (filter_index == 1 &&
+ (vertical_filter_id == 0 || vertical_filter_id == 2 ||
+ vertical_filter_id == 3 || vertical_filter_id == 4 ||
+ vertical_filter_id == 5 || vertical_filter_id == 6 ||
+ vertical_filter_id == 10 || vertical_filter_id == 11 ||
+ vertical_filter_id == 12 || vertical_filter_id == 13 ||
+ vertical_filter_id == 14)));
+ // According to GetNumTapsInFilter() this has 6 taps but here we are
+ // treating it as though it has 4.
+ if (filter_index == 1) src += src_stride;
+ if (width == 2) {
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else if (width == 4) {
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else {
+ FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+ taps + 2);
+ }
+ }
+}
+
+void ConvolveCompoundVertical_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int vertical_filter_index, const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride >> 1;
+ const auto* src = static_cast<const uint16_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* const dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
+
+ int16x4_t taps[8];
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ taps[k] =
+ vdup_n_s16(kHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+ }
+
+ if (filter_index == 0) { // 6 tap.
+ if (width == 4) {
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 1);
+ } else {
+ FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 1);
+ }
+ } else if ((static_cast<int>(filter_index == 1) &
+ (static_cast<int>(vertical_filter_id == 1) |
+ static_cast<int>(vertical_filter_id == 7) |
+ static_cast<int>(vertical_filter_id == 8) |
+ static_cast<int>(vertical_filter_id == 9) |
+ static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap.
+ if (width == 4) {
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 1);
+ } else {
+ FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 1);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ if (width == 4) {
+ FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ if (width == 4) {
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 3);
+ } else {
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 3);
+ }
+ } else {
+    // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+    // below map to 4 tap filters.
+ assert(filter_index == 5 || filter_index == 4 ||
+ (filter_index == 1 &&
+ (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+ vertical_filter_id == 4 || vertical_filter_id == 5 ||
+ vertical_filter_id == 6 || vertical_filter_id == 10 ||
+ vertical_filter_id == 11 || vertical_filter_id == 12 ||
+ vertical_filter_id == 13 || vertical_filter_id == 14)));
+ // According to GetNumTapsInFilter() this has 6 taps but here we are
+ // treating it as though it has 4.
+ if (filter_index == 1) src += src_stride;
+ if (width == 4) {
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 2);
+ } else {
+ FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 2);
+ }
+ }
+}
+
+void ConvolveCompoundCopy_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const auto* src = static_cast<const uint16_t*>(reference);
+ const ptrdiff_t src_stride = reference_stride >> 1;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ constexpr int final_shift =
+ kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+ const uint16x8_t offset =
+ vdupq_n_u16((1 << kBitdepth10) + (1 << (kBitdepth10 - 1)));
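+  // Assuming the usual libgav1 values kInterRoundBitsVertical == 11 and
+  // kInterRoundBitsCompoundVertical == 7, |final_shift| is 4 and |offset| is
+  // 1536 for 10-bit: each pixel is offset, then scaled into the compound
+  // prediction range.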
+
+ if (width >= 16) {
+ int y = height;
+ do {
+ int x = 0;
+ int w = width;
+ do {
+ const uint16x8_t v_src_lo = vld1q_u16(&src[x]);
+ const uint16x8_t v_src_hi = vld1q_u16(&src[x + 8]);
+ const uint16x8_t v_sum_lo = vaddq_u16(v_src_lo, offset);
+ const uint16x8_t v_sum_hi = vaddq_u16(v_src_hi, offset);
+ const uint16x8_t v_dest_lo = vshlq_n_u16(v_sum_lo, final_shift);
+ const uint16x8_t v_dest_hi = vshlq_n_u16(v_sum_hi, final_shift);
+ vst1q_u16(&dest[x], v_dest_lo);
+ vst1q_u16(&dest[x + 8], v_dest_hi);
+ x += 16;
+ w -= 16;
+ } while (w != 0);
+ src += src_stride;
+ dest += width;
+ } while (--y != 0);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const uint16x8_t v_src_lo = vld1q_u16(&src[0]);
+ const uint16x8_t v_src_hi = vld1q_u16(&src[src_stride]);
+ const uint16x8_t v_sum_lo = vaddq_u16(v_src_lo, offset);
+ const uint16x8_t v_sum_hi = vaddq_u16(v_src_hi, offset);
+ const uint16x8_t v_dest_lo = vshlq_n_u16(v_sum_lo, final_shift);
+ const uint16x8_t v_dest_hi = vshlq_n_u16(v_sum_hi, final_shift);
+ vst1q_u16(&dest[0], v_dest_lo);
+ vst1q_u16(&dest[8], v_dest_hi);
+ src += src_stride << 1;
+ dest += 16;
+ y -= 2;
+ } while (y != 0);
+ } else { // width == 4
+ int y = height;
+ do {
+ const uint16x4_t v_src_lo = vld1_u16(&src[0]);
+ const uint16x4_t v_src_hi = vld1_u16(&src[src_stride]);
+ const uint16x4_t v_sum_lo = vadd_u16(v_src_lo, vget_low_u16(offset));
+ const uint16x4_t v_sum_hi = vadd_u16(v_src_hi, vget_low_u16(offset));
+ const uint16x4_t v_dest_lo = vshl_n_u16(v_sum_lo, final_shift);
+ const uint16x4_t v_dest_hi = vshl_n_u16(v_sum_hi, final_shift);
+ vst1_u16(&dest[0], v_dest_lo);
+ vst1_u16(&dest[4], v_dest_hi);
+ src += src_stride << 1;
+ dest += 8;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+inline void HalfAddHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
+ uint16_t* LIBGAV1_RESTRICT const dst) {
+ const uint16x8_t left = vld1q_u16(src);
+ const uint16x8_t right = vld1q_u16(src + 1);
+ vst1q_u16(dst, vrhaddq_u16(left, right));
+}
+
+inline void HalfAddHorizontal16(const uint16_t* LIBGAV1_RESTRICT const src,
+ uint16_t* LIBGAV1_RESTRICT const dst) {
+ HalfAddHorizontal(src, dst);
+ HalfAddHorizontal(src + 8, dst + 8);
+}
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint16_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ const int height,
+ uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+
+ int y = height;
+ do {
+ HalfAddHorizontal16(src, dst);
+ if (width >= 32) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal16(src, dst);
+ if (width >= 64) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal16(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal16(src, dst);
+ if (width == 128) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal16(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal16(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal16(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal16(src, dst);
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyHorizontal_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*subpixel_x*/,
+ const int /*subpixel_y*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+ assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+ assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+ const auto* src = static_cast<const uint16_t*>(reference);
+ auto* dest = static_cast<uint16_t*>(prediction);
+ const ptrdiff_t src_stride = reference_stride >> 1;
+ const ptrdiff_t dst_stride = pred_stride >> 1;
+
+ if (width == 128) {
+ IntraBlockCopyHorizontal<128>(src, src_stride, height, dest, dst_stride);
+ } else if (width == 64) {
+ IntraBlockCopyHorizontal<64>(src, src_stride, height, dest, dst_stride);
+ } else if (width == 32) {
+ IntraBlockCopyHorizontal<32>(src, src_stride, height, dest, dst_stride);
+ } else if (width == 16) {
+ IntraBlockCopyHorizontal<16>(src, src_stride, height, dest, dst_stride);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ HalfAddHorizontal(src, dest);
+ src += src_stride;
+ dest += dst_stride;
+ } while (--y != 0);
+ } else { // width == 4
+ int y = height;
+ do {
+ uint16x4x2_t left;
+ uint16x4x2_t right;
+ left.val[0] = vld1_u16(src);
+ right.val[0] = vld1_u16(src + 1);
+ src += src_stride;
+ left.val[1] = vld1_u16(src);
+ right.val[1] = vld1_u16(src + 1);
+ src += src_stride;
+
+ vst1_u16(dest, vrhadd_u16(left.val[0], right.val[0]));
+ dest += dst_stride;
+ vst1_u16(dest, vrhadd_u16(left.val[1], right.val[1]));
+ dest += dst_stride;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint16_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride, const int height,
+ uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
+ uint16x8_t row[8], below[8];
+
+ row[0] = vld1q_u16(src);
+ if (width >= 16) {
+ src += 8;
+ row[1] = vld1q_u16(src);
+ if (width >= 32) {
+ src += 8;
+ row[2] = vld1q_u16(src);
+ src += 8;
+ row[3] = vld1q_u16(src);
+ if (width == 64) {
+ src += 8;
+ row[4] = vld1q_u16(src);
+ src += 8;
+ row[5] = vld1q_u16(src);
+ src += 8;
+ row[6] = vld1q_u16(src);
+ src += 8;
+ row[7] = vld1q_u16(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = height;
+ do {
+ below[0] = vld1q_u16(src);
+ if (width >= 16) {
+ src += 8;
+ below[1] = vld1q_u16(src);
+ if (width >= 32) {
+ src += 8;
+ below[2] = vld1q_u16(src);
+ src += 8;
+ below[3] = vld1q_u16(src);
+ if (width == 64) {
+ src += 8;
+ below[4] = vld1q_u16(src);
+ src += 8;
+ below[5] = vld1q_u16(src);
+ src += 8;
+ below[6] = vld1q_u16(src);
+ src += 8;
+ below[7] = vld1q_u16(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ vst1q_u16(dst, vrhaddq_u16(row[0], below[0]));
+ row[0] = below[0];
+ if (width >= 16) {
+ dst += 8;
+ vst1q_u16(dst, vrhaddq_u16(row[1], below[1]));
+ row[1] = below[1];
+ if (width >= 32) {
+ dst += 8;
+ vst1q_u16(dst, vrhaddq_u16(row[2], below[2]));
+ row[2] = below[2];
+ dst += 8;
+ vst1q_u16(dst, vrhaddq_u16(row[3], below[3]));
+ row[3] = below[3];
+          if (width == 64) {
+ dst += 8;
+ vst1q_u16(dst, vrhaddq_u16(row[4], below[4]));
+ row[4] = below[4];
+ dst += 8;
+ vst1q_u16(dst, vrhaddq_u16(row[5], below[5]));
+ row[5] = below[5];
+ dst += 8;
+ vst1q_u16(dst, vrhaddq_u16(row[6], below[6]));
+ row[6] = below[6];
+ dst += 8;
+ vst1q_u16(dst, vrhaddq_u16(row[7], below[7]));
+ row[7] = below[7];
+ }
+ }
+ }
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyVertical_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+ assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+ assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+ const auto* src = static_cast<const uint16_t*>(reference);
+ auto* dest = static_cast<uint16_t*>(prediction);
+ const ptrdiff_t src_stride = reference_stride >> 1;
+ const ptrdiff_t dst_stride = pred_stride >> 1;
+
+ if (width == 128) {
+ // Due to register pressure, process two 64xH.
+ for (int i = 0; i < 2; ++i) {
+ IntraBlockCopyVertical<64>(src, src_stride, height, dest, dst_stride);
+ src += 64;
+ dest += 64;
+ }
+ } else if (width == 64) {
+ IntraBlockCopyVertical<64>(src, src_stride, height, dest, dst_stride);
+ } else if (width == 32) {
+ IntraBlockCopyVertical<32>(src, src_stride, height, dest, dst_stride);
+ } else if (width == 16) {
+ IntraBlockCopyVertical<16>(src, src_stride, height, dest, dst_stride);
+ } else if (width == 8) {
+ IntraBlockCopyVertical<8>(src, src_stride, height, dest, dst_stride);
+ } else { // width == 4
+ uint16x4_t row = vld1_u16(src);
+ src += src_stride;
+ int y = height;
+ do {
+ const uint16x4_t below = vld1_u16(src);
+ src += src_stride;
+ vst1_u16(dest, vrhadd_u16(row, below));
+ dest += dst_stride;
+ row = below;
+ } while (--y != 0);
+ }
+}
+
+template <int width>
+inline void IntraBlockCopy2D(const uint16_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride, const int height,
+ uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
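+  // Cache the horizontal (left + right) sums of the current row so that each
+  // source row is summed only once and is reused as the top row of the next
+  // output row.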
+ uint16x8_t row[16];
+ row[0] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ if (width >= 16) {
+ src += 8;
+ row[1] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ if (width >= 32) {
+ src += 8;
+ row[2] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ src += 8;
+ row[3] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ if (width >= 64) {
+ src += 8;
+ row[4] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ src += 8;
+ row[5] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ src += 8;
+ row[6] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ src += 8;
+ row[7] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ if (width == 128) {
+ src += 8;
+ row[8] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ src += 8;
+ row[9] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ src += 8;
+ row[10] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ src += 8;
+ row[11] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ src += 8;
+ row[12] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ src += 8;
+ row[13] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ src += 8;
+ row[14] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ src += 8;
+ row[15] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = height;
+ do {
+ const uint16x8_t below_0 = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[0], below_0), 2));
+ row[0] = below_0;
+ if (width >= 16) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_1 = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[1], below_1), 2));
+ row[1] = below_1;
+ if (width >= 32) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_2 =
+ vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[2], below_2), 2));
+ row[2] = below_2;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_3 =
+ vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[3], below_3), 2));
+ row[3] = below_3;
+ if (width >= 64) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_4 =
+ vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[4], below_4), 2));
+ row[4] = below_4;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_5 =
+ vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[5], below_5), 2));
+ row[5] = below_5;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_6 =
+ vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[6], below_6), 2));
+ row[6] = below_6;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_7 =
+ vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[7], below_7), 2));
+ row[7] = below_7;
+ if (width == 128) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_8 =
+ vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[8], below_8), 2));
+ row[8] = below_8;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_9 =
+ vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[9], below_9), 2));
+ row[9] = below_9;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_10 =
+ vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[10], below_10), 2));
+ row[10] = below_10;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_11 =
+ vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[11], below_11), 2));
+ row[11] = below_11;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_12 =
+ vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[12], below_12), 2));
+ row[12] = below_12;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_13 =
+ vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[13], below_13), 2));
+ row[13] = below_13;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_14 =
+ vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[14], below_14), 2));
+ row[14] = below_14;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_15 =
+ vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1));
+ vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[15], below_15), 2));
+ row[15] = below_15;
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopy2D_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+ assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+ assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+ const auto* src = static_cast<const uint16_t*>(reference);
+ auto* dest = static_cast<uint16_t*>(prediction);
+ const ptrdiff_t src_stride = reference_stride >> 1;
+ const ptrdiff_t dst_stride = pred_stride >> 1;
+
+  // Note: vertical access of height + 1 rows is allowed. Because this
+  // function is only used for the u/v planes of intra block copy, such access
+  // is guaranteed to be within the prediction block.
+
+ if (width == 128) {
+ IntraBlockCopy2D<128>(src, src_stride, height, dest, dst_stride);
+ } else if (width == 64) {
+ IntraBlockCopy2D<64>(src, src_stride, height, dest, dst_stride);
+ } else if (width == 32) {
+ IntraBlockCopy2D<32>(src, src_stride, height, dest, dst_stride);
+ } else if (width == 16) {
+ IntraBlockCopy2D<16>(src, src_stride, height, dest, dst_stride);
+ } else if (width == 8) {
+ IntraBlockCopy2D<8>(src, src_stride, height, dest, dst_stride);
+ } else { // width == 4
+ uint16x4_t row0 = vadd_u16(vld1_u16(src), vld1_u16(src + 1));
+ src += src_stride;
+
+ int y = height;
+ do {
+ const uint16x4_t row1 = vadd_u16(vld1_u16(src), vld1_u16(src + 1));
+ src += src_stride;
+ const uint16x4_t row2 = vadd_u16(vld1_u16(src), vld1_u16(src + 1));
+ src += src_stride;
+ const uint16x4_t result_01 = vrshr_n_u16(vadd_u16(row0, row1), 2);
+ const uint16x4_t result_12 = vrshr_n_u16(vadd_u16(row1, row2), 2);
+ vst1_u16(dest, result_01);
+ dest += dst_stride;
+ vst1_u16(dest, result_12);
+ dest += dst_stride;
+ row0 = row2;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Scaled Convolve
+
+// There are many opportunities for overreading in scaled convolve, because the
+// range of starting points for filter windows is anywhere from 0 to 16 for 8
+// destination pixels, and the window sizes range from 2 to 8. To accommodate
+// this range concisely, we use |grade_x| to mean the maximum number of whole
+// steps in src that can be traversed in a single |step_x| increment, i.e. 1 or
+// 2. When grade_x is 2, we are guaranteed to exceed 8 whole steps in src for
+// every 8 |step_x| increments. The first load covers the initial elements of
+// src_x, while the final load covers the taps.
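+// For example, assuming kScaleSubPixelBits == 10, a 2x downscale uses step_x
+// == 2048, so eight increments advance (8 * 2048) >> 10 = 16 whole source
+// pixels and require the grade_x == 2 (three-vector) load path.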
+template <int grade_x>
+inline uint8x16x3_t LoadSrcVals(const uint16_t* const src_x) {
+ uint8x16x3_t ret;
+ // When fractional step size is less than or equal to 1, the rightmost
+ // starting value for a filter may be at position 7. For an 8-tap filter, the
+ // rightmost value for the final tap may be at position 14. Therefore we load
+ // 2 vectors of eight 16-bit values.
+ ret.val[0] = vreinterpretq_u8_u16(vld1q_u16(src_x));
+ ret.val[1] = vreinterpretq_u8_u16(vld1q_u16(src_x + 8));
+#if LIBGAV1_MSAN
+ // Initialize to quiet msan warnings when grade_x <= 1.
+ ret.val[2] = vdupq_n_u8(0);
+#endif
+ if (grade_x > 1) {
+ // When fractional step size is greater than 1 (up to 2), the rightmost
+ // starting value for a filter may be at position 15. For an 8-tap filter,
+ // the rightmost value for the final tap may be at position 22. Therefore we
+ // load 3 vectors of eight 16-bit values.
+ ret.val[2] = vreinterpretq_u8_u16(vld1q_u16(src_x + 16));
+ }
+ return ret;
+}
+
+// Assemble 4 values corresponding to one tap position across multiple filters.
+// This is a simple case because the maximum offset is 8 and only the smaller
+// filters operate on 4xH blocks.
+inline uint16x4_t PermuteSrcVals(const uint8x16x3_t src_bytes,
+ const uint8x8_t indices) {
+ const uint8x16x2_t src_bytes2 = {src_bytes.val[0], src_bytes.val[1]};
+ return vreinterpret_u16_u8(VQTbl2U8(src_bytes2, indices));
+}
+
+// Assemble 8 values corresponding to one tap position across multiple filters.
+// This requires a number of workarounds on A32 architectures, so it may be
+// worth using an entirely different algorithm there.
+template <int grade_x>
+inline uint16x8_t PermuteSrcVals(const uint8x16x3_t src_bytes,
+ const uint8x16_t indices) {
+ if (grade_x == 1) {
+ const uint8x16x2_t src_bytes2 = {src_bytes.val[0], src_bytes.val[1]};
+ return vreinterpretq_u16_u8(VQTbl2QU8(src_bytes2, indices));
+ }
+ return vreinterpretq_u16_u8(VQTbl3QU8(src_bytes, indices));
+}
+
+// Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3].
+// Although the taps need to be converted to 16-bit values, they must be
+// arranged by table lookup, which is more expensive for larger types than
+// lengthening in-loop. |tap_index| refers to the index within a kernel applied
+// to a single value.
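+// For example, lanes with filter_id 2 gather 56 from column 0 and 8 from
+// column 1, i.e. the two (halved) taps of that kernel.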
+inline int8x16_t GetPositive2TapFilter(const int tap_index) {
+ assert(tap_index < 2);
+ alignas(
+ 16) static constexpr int8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
+ {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+ {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+
+ return vld1q_s8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
+}
+
+template <int grade_x>
+inline void ConvolveKernelHorizontal2Tap(
+ const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ const int width, const int subpixel_x, const int step_x,
+ const int intermediate_height, int16_t* LIBGAV1_RESTRICT intermediate) {
+ // Account for the 0-taps that precede the 2 nonzero taps in the spec.
+ const int kernel_offset = 3;
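+  // The two nonzero taps occupy positions 3 and 4 of the 8-tap window, so the
+  // first sample actually read lies 3 columns into the window.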
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ const int8x16_t filter_taps0 = GetPositive2TapFilter(0);
+ const int8x16_t filter_taps1 = GetPositive2TapFilter(1);
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+
+ int p = subpixel_x;
+ if (width <= 4) {
+ const uint16_t* src_y = src;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // Each lane of taps[k] corresponds to one output value along the
+ // row, containing kSubPixelFilters[filter_index][filter_id][k], where
+ // filter_id depends on x.
+ const int16x4_t taps[2] = {
+ vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps0, filter_indices))),
+ vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps1, filter_indices)))};
+ // Lower byte of Nth value is at position 2*N.
+ // Narrowing shift is not available here because the maximum shift
+ // parameter is 8.
+ const uint8x8_t src_indices0 = vshl_n_u8(
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+ // Upper byte of Nth value is at position 2*N+1.
+ const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+ // Only 4 values needed.
+ const uint8x8_t src_indices = InterleaveLow8(src_indices0, src_indices1);
+ const uint8x8_t src_lookup[2] = {src_indices,
+ vadd_u8(src_indices, vdup_n_u8(2))};
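+    // Adding 2 to every byte index shifts the gather right by one 16-bit
+    // sample, yielding the source values for the second tap.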
+
+ int y = intermediate_height;
+ do {
+ const uint16_t* src_x =
+ src_y + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x16x3_t src_bytes = LoadSrcVals<1>(src_x);
+ // Each lane corresponds to a different filter kernel.
+ const uint16x4_t src[2] = {PermuteSrcVals(src_bytes, src_lookup[0]),
+ PermuteSrcVals(src_bytes, src_lookup[1])};
+
+ vst1_s16(intermediate,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_y = AddByteStride(src_y, src_stride);
+ intermediate += kIntermediateStride;
+ } while (--y != 0);
+ return;
+ }
+
+ // |width| >= 8
+ int16_t* intermediate_x = intermediate;
+ int x = 0;
+ do {
+ const uint16_t* src_x =
+ src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+    // Each lane of taps[k] corresponds to one output value along the
+ // row, containing kSubPixelFilters[filter_index][filter_id][k], where
+ // filter_id depends on x.
+ const int16x8_t taps[2] = {
+ vmovl_s8(VQTbl1S8(filter_taps0, filter_indices)),
+ vmovl_s8(VQTbl1S8(filter_taps1, filter_indices))};
+ const int16x4_t taps_low[2] = {vget_low_s16(taps[0]),
+ vget_low_s16(taps[1])};
+ const int16x4_t taps_high[2] = {vget_high_s16(taps[0]),
+ vget_high_s16(taps[1])};
+ // Lower byte of Nth value is at position 2*N.
+ const uint8x8_t src_indices0 = vshl_n_u8(
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+ // Upper byte of Nth value is at position 2*N+1.
+ const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+ const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1);
+ const uint8x16_t src_indices =
+ vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]);
+ const uint8x16_t src_lookup[2] = {src_indices,
+ vaddq_u8(src_indices, vdupq_n_u8(2))};
+
+ int y = intermediate_height;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x);
+ // Each lane corresponds to a different filter kernel.
+ const uint16x8_t src[2] = {
+ PermuteSrcVals<grade_x>(src_bytes, src_lookup[0]),
+ PermuteSrcVals<grade_x>(src_bytes, src_lookup[1])};
+ const uint16x4_t src_low[2] = {vget_low_u16(src[0]),
+ vget_low_u16(src[1])};
+ const uint16x4_t src_high[2] = {vget_high_u16(src[0]),
+ vget_high_u16(src[1])};
+
+ vst1_s16(intermediate_x,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_low, taps_low),
+ kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x + 4,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_high, taps_high),
+ kInterRoundBitsHorizontal - 1));
+ // Avoid right shifting the stride.
+ src_x = AddByteStride(src_x, src_stride);
+ intermediate_x += kIntermediateStride;
+ } while (--y != 0);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
+inline int8x16_t GetPositive4TapFilter(const int tap_index) {
+ assert(tap_index < 4);
+ alignas(
+ 16) static constexpr int8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+ {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+
+ return vld1q_s8(kSubPixel4TapPositiveFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+inline void ConvolveKernelHorizontalPositive4Tap(
+ const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ const int subpixel_x, const int step_x, const int intermediate_height,
+ int16_t* LIBGAV1_RESTRICT intermediate) {
+  // Account for the 0-taps that precede the 4 nonzero taps in the spec.
+ const int kernel_offset = 2;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int8x16_t filter_taps0 = GetPositive4TapFilter(0);
+ const int8x16_t filter_taps1 = GetPositive4TapFilter(1);
+ const int8x16_t filter_taps2 = GetPositive4TapFilter(2);
+ const int8x16_t filter_taps3 = GetPositive4TapFilter(3);
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+
+ int p = subpixel_x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices =
+      vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+              filter_index_mask);
+  // Each lane of taps[k] corresponds to one output value along the row,
+ // containing kSubPixelFilters[filter_index][filter_id][k], where filter_id
+ // depends on x.
+ const int16x4_t taps[4] = {
+ vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps0, filter_indices))),
+ vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps1, filter_indices))),
+ vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps2, filter_indices))),
+ vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps3, filter_indices)))};
+ // Lower byte of Nth value is at position 2*N.
+ // Narrowing shift is not available here because the maximum shift
+ // parameter is 8.
+ const uint8x8_t src_indices0 = vshl_n_u8(
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+ // Upper byte of Nth value is at position 2*N+1.
+ const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+ // Only 4 values needed.
+ const uint8x8_t src_indices_base = InterleaveLow8(src_indices0, src_indices1);
+
+ uint8x8_t src_lookup[4];
+ const uint8x8_t two = vdup_n_u8(2);
+ src_lookup[0] = src_indices_base;
+ for (int i = 1; i < 4; ++i) {
+ src_lookup[i] = vadd_u8(src_lookup[i - 1], two);
+ }
+
+ const uint16_t* src_y =
+ src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+ int y = intermediate_height;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x16x3_t src_bytes = LoadSrcVals<1>(src_y);
+ // Each lane corresponds to a different filter kernel.
+ const uint16x4_t src[4] = {PermuteSrcVals(src_bytes, src_lookup[0]),
+ PermuteSrcVals(src_bytes, src_lookup[1]),
+ PermuteSrcVals(src_bytes, src_lookup[2]),
+ PermuteSrcVals(src_bytes, src_lookup[3])};
+
+ vst1_s16(intermediate,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_y = AddByteStride(src_y, src_stride);
+ intermediate += kIntermediateStride;
+ } while (--y != 0);
+}
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
+inline int8x16_t GetSigned4TapFilter(const int tap_index) {
+ assert(tap_index < 4);
+ alignas(16) static constexpr int8_t
+ kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
+ {-0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {-0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}};
+
+ return vld1q_s8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+inline void ConvolveKernelHorizontalSigned4Tap(
+ const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ const int subpixel_x, const int step_x, const int intermediate_height,
+ int16_t* LIBGAV1_RESTRICT intermediate) {
+ const int kernel_offset = 2;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int8x16_t filter_taps0 = GetSigned4TapFilter(0);
+ const int8x16_t filter_taps1 = GetSigned4TapFilter(1);
+ const int8x16_t filter_taps2 = GetSigned4TapFilter(2);
+ const int8x16_t filter_taps3 = GetSigned4TapFilter(3);
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+ const int p = subpixel_x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices =
+      vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+              filter_index_mask);
+  // Each lane of taps[k] corresponds to one output value along the row,
+ // containing kSubPixelFilters[filter_index][filter_id][k], where filter_id
+ // depends on x.
+ const int16x4_t taps[4] = {
+ vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps0, filter_indices))),
+ vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps1, filter_indices))),
+ vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps2, filter_indices))),
+ vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps3, filter_indices)))};
+ // Lower byte of Nth value is at position 2*N.
+ // Narrowing shift is not available here because the maximum shift
+ // parameter is 8.
+ const uint8x8_t src_indices0 = vshl_n_u8(
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+ // Upper byte of Nth value is at position 2*N+1.
+ const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+ // Only 4 values needed.
+ const uint8x8_t src_indices_base = InterleaveLow8(src_indices0, src_indices1);
+
+ uint8x8_t src_lookup[4];
+ const uint8x8_t two = vdup_n_u8(2);
+ src_lookup[0] = src_indices_base;
+ for (int i = 1; i < 4; ++i) {
+ src_lookup[i] = vadd_u8(src_lookup[i - 1], two);
+ }
+
+ const uint16_t* src_y =
+ src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+ int y = intermediate_height;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x16x3_t src_bytes = LoadSrcVals<1>(src_y);
+ // Each lane corresponds to a different filter kernel.
+ const uint16x4_t src[4] = {PermuteSrcVals(src_bytes, src_lookup[0]),
+ PermuteSrcVals(src_bytes, src_lookup[1]),
+ PermuteSrcVals(src_bytes, src_lookup[2]),
+ PermuteSrcVals(src_bytes, src_lookup[3])};
+
+ vst1_s16(intermediate,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_y = AddByteStride(src_y, src_stride);
+ intermediate += kIntermediateStride;
+ } while (--y != 0);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
+inline int8x16_t GetSigned6TapFilter(const int tap_index) {
+ assert(tap_index < 6);
+ alignas(16) static constexpr int8_t
+ kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
+ {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+ {-0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {-0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3},
+ {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+ return vld1q_s8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned6Tap(
+ const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ const int width, const int subpixel_x, const int step_x,
+ const int intermediate_height,
+ int16_t* LIBGAV1_RESTRICT const intermediate) {
+ const int kernel_offset = 1;
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ int8x16_t filter_taps[6];
+ for (int i = 0; i < 6; ++i) {
+ filter_taps[i] = GetSigned6TapFilter(i);
+ }
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+ int16_t* intermediate_x = intermediate;
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ const uint16_t* src_x =
+ src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+
+    // Each lane of taps_(low|high)[k] corresponds to one output value
+ // along the row, containing kSubPixelFilters[filter_index][filter_id][k],
+ // where filter_id depends on x.
+ int16x4_t taps_low[6];
+ int16x4_t taps_high[6];
+ for (int i = 0; i < 6; ++i) {
+ const int16x8_t taps_i =
+ vmovl_s8(VQTbl1S8(filter_taps[i], filter_indices));
+ taps_low[i] = vget_low_s16(taps_i);
+ taps_high[i] = vget_high_s16(taps_i);
+ }
+
+ // Lower byte of Nth value is at position 2*N.
+ const uint8x8_t src_indices0 = vshl_n_u8(
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+ // Upper byte of Nth value is at position 2*N+1.
+ const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+ const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1);
+ const uint8x16_t src_indices_base =
+ vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]);
+
+ uint8x16_t src_lookup[6];
+ const uint8x16_t two = vdupq_n_u8(2);
+ src_lookup[0] = src_indices_base;
+ for (int i = 1; i < 6; ++i) {
+ src_lookup[i] = vaddq_u8(src_lookup[i - 1], two);
+ }
+
+ int y = intermediate_height;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x);
+
+ uint16x4_t src_low[6];
+ uint16x4_t src_high[6];
+ for (int i = 0; i < 6; ++i) {
+ const uint16x8_t src_i =
+ PermuteSrcVals<grade_x>(src_bytes, src_lookup[i]);
+ src_low[i] = vget_low_u16(src_i);
+ src_high[i] = vget_high_u16(src_i);
+ }
+
+ vst1_s16(intermediate_x,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low),
+ kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x + 4,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high),
+ kInterRoundBitsHorizontal - 1));
+ // Avoid right shifting the stride.
+ src_x = AddByteStride(src_x, src_stride);
+ intermediate_x += kIntermediateStride;
+ } while (--y != 0);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
+// has mixed positive and negative outer taps depending on the filter id.
+inline int8x16_t GetMixed6TapFilter(const int tap_index) {
+ assert(tap_index < 6);
+ alignas(16) static constexpr int8_t
+ kAbsHalfSubPixel6TapMixedFilterColumns[6][16] = {
+ {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+ {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14},
+ {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+
+ return vld1q_s8(kAbsHalfSubPixel6TapMixedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalMixed6Tap(
+ const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ const int width, const int subpixel_x, const int step_x,
+ const int intermediate_height,
+ int16_t* LIBGAV1_RESTRICT const intermediate) {
+ const int kernel_offset = 1;
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ int8x16_t filter_taps[6];
+ for (int i = 0; i < 6; ++i) {
+ filter_taps[i] = GetMixed6TapFilter(i);
+ }
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
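+  // Lane i of |index_steps| holds i * step_x: the relative subpel position
+  // of the i-th output pixel in each batch of 8.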
+
+ int16_t* intermediate_x = intermediate;
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ const uint16_t* src_x =
+ src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+    // Each lane of taps_(low|high)[k] corresponds to one output value along
+    // the row, containing kSubPixelFilters[filter_index][filter_id][k],
+    // where filter_id depends on x.
+ int16x4_t taps_low[6];
+ int16x4_t taps_high[6];
+ for (int i = 0; i < 6; ++i) {
+ const int16x8_t taps = vmovl_s8(VQTbl1S8(filter_taps[i], filter_indices));
+ taps_low[i] = vget_low_s16(taps);
+ taps_high[i] = vget_high_s16(taps);
+ }
+
+ // Lower byte of Nth value is at position 2*N.
+ const uint8x8_t src_indices0 = vshl_n_u8(
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+ // Upper byte of Nth value is at position 2*N+1.
+ const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+ const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1);
+ const uint8x16_t src_indices_base =
+ vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]);
+
+ uint8x16_t src_lookup[6];
+ const uint8x16_t two = vdupq_n_u8(2);
+ src_lookup[0] = src_indices_base;
+ for (int i = 1; i < 6; ++i) {
+ src_lookup[i] = vaddq_u8(src_lookup[i - 1], two);
+ }
+
+ int y = intermediate_height;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x);
+
+ uint16x4_t src_low[6];
+ uint16x4_t src_high[6];
+ for (int i = 0; i < 6; ++i) {
+ const uint16x8_t src_i =
+ PermuteSrcVals<grade_x>(src_bytes, src_lookup[i]);
+ src_low[i] = vget_low_u16(src_i);
+ src_high[i] = vget_high_u16(src_i);
+ }
+
+ vst1_s16(intermediate_x,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low),
+ kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x + 4,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high),
+ kInterRoundBitsHorizontal - 1));
+ // Avoid right shifting the stride.
+ src_x = AddByteStride(src_x, src_stride);
+ intermediate_x += kIntermediateStride;
+ } while (--y != 0);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
+inline int8x16_t GetSigned8TapFilter(const int tap_index) {
+ assert(tap_index < 8);
+ alignas(16) static constexpr int8_t
+ kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
+ {-0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -0},
+ {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+ {-0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3,
+ -1},
+ {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+ {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+ {-0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6,
+ -3},
+ {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+ {-0, -0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}};
+
+ return vld1q_s8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned8Tap(
+ const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ const int width, const int subpixel_x, const int step_x,
+ const int intermediate_height,
+ int16_t* LIBGAV1_RESTRICT const intermediate) {
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ int8x16_t filter_taps[8];
+ for (int i = 0; i < 8; ++i) {
+ filter_taps[i] = GetSigned8TapFilter(i);
+ }
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+ int16_t* intermediate_x = intermediate;
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ const uint16_t* src_x = src + (p >> kScaleSubPixelBits) - ref_x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+
+ // Lower byte of Nth value is at position 2*N.
+ const uint8x8_t src_indices0 = vshl_n_u8(
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+ // Upper byte of Nth value is at position 2*N+1.
+ const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+ const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1);
+ const uint8x16_t src_indices_base =
+ vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]);
+
+ uint8x16_t src_lookup[8];
+ const uint8x16_t two = vdupq_n_u8(2);
+ src_lookup[0] = src_indices_base;
+ for (int i = 1; i < 8; ++i) {
+ src_lookup[i] = vaddq_u8(src_lookup[i - 1], two);
+ }
+    // Each lane of taps_(low|high)[k] corresponds to one output value along
+    // the row, containing kSubPixelFilters[filter_index][filter_id][k],
+    // where filter_id depends on x.
+ int16x4_t taps_low[8];
+ int16x4_t taps_high[8];
+ for (int i = 0; i < 8; ++i) {
+ const int16x8_t taps = vmovl_s8(VQTbl1S8(filter_taps[i], filter_indices));
+ taps_low[i] = vget_low_s16(taps);
+ taps_high[i] = vget_high_s16(taps);
+ }
+
+ int y = intermediate_height;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x);
+
+ uint16x4_t src_low[8];
+ uint16x4_t src_high[8];
+ for (int i = 0; i < 8; ++i) {
+ const uint16x8_t src_i =
+ PermuteSrcVals<grade_x>(src_bytes, src_lookup[i]);
+ src_low[i] = vget_low_u16(src_i);
+ src_high[i] = vget_high_u16(src_i);
+ }
+
+ vst1_s16(intermediate_x,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_low, taps_low),
+ kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x + 4,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_high, taps_high),
+ kInterRoundBitsHorizontal - 1));
+ // Avoid right shifting the stride.
+ src_x = AddByteStride(src_x, src_stride);
+ intermediate_x += kIntermediateStride;
+ } while (--y != 0);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// Process 16 bit inputs and output 32 bits.
+template <int num_taps, bool is_compound>
+inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
+ const int16x8_t taps) {
+ const int16x4_t taps_lo = vget_low_s16(taps);
+ const int16x4_t taps_hi = vget_high_s16(taps);
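+  // The taps are stored in 8-tap form with shorter kernels centered: 6 taps
+  // occupy lanes 1..6, 4 taps lanes 2..5 and 2 taps lanes 3..4, which
+  // explains the lane offsets below.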
+ int32x4_t sum;
+ if (num_taps == 8) {
+ sum = vmull_lane_s16(src[0], taps_lo, 0);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
+ sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
+ sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
+ sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
+ } else if (num_taps == 6) {
+ sum = vmull_lane_s16(src[0], taps_lo, 1);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
+ sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
+ } else if (num_taps == 4) {
+ sum = vmull_lane_s16(src[0], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
+ } else if (num_taps == 2) {
+ sum = vmull_lane_s16(src[0], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
+ }
+
+ if (is_compound) {
+ return vrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
+ }
+
+ return vreinterpret_s16_u16(vqrshrun_n_s32(sum, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, int grade_y, int width, bool is_compound>
+void ConvolveVerticalScale2Or4xH(const int16_t* LIBGAV1_RESTRICT const src,
+ const int subpixel_y, const int filter_index,
+ const int step_y, const int height,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ static_assert(width == 2 || width == 4, "");
+ // We increment stride with the 8-bit pointer and then reinterpret to avoid
+ // shifting |dest_stride|.
+ auto* dest_y = static_cast<uint16_t*>(dest);
+ // In compound mode, |dest_stride| is based on the size of uint16_t, rather
+ // than bytes.
+ auto* compound_dest_y = static_cast<uint16_t*>(dest);
+ // This stride always corresponds to int16_t.
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+ const int16_t* src_y = src;
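+  // Keep |grade_y| extra rows of source on hand because the window can
+  // advance by up to |grade_y| rows per output row (grade_y is 2 when
+  // |step_y| > 1024, i.e. more than one source row per output row).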
+ int16x4_t s[num_taps + grade_y];
+
+ int p = subpixel_y & 1023;
+ int prev_p = p;
+ int y = height;
+ do {
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = vld1_s16(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ int16x8_t filter =
+ vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
+ if (is_compound) {
+ assert(width != 2);
+ // This offset potentially overflows into the sign bit, but should yield
+ // the correct unsigned value.
+ const uint16x4_t result =
+ vreinterpret_u16_s16(vadd_s16(sums, vdup_n_s16(kCompoundOffset)));
+ vst1_u16(compound_dest_y, result);
+ compound_dest_y += dest_stride;
+ } else {
+ const uint16x4_t result = vmin_u16(vreinterpret_u16_s16(sums),
+ vdup_n_u16((1 << kBitdepth10) - 1));
+ if (width == 2) {
+ Store2<0>(dest_y, result);
+ } else {
+ vst1_u16(dest_y, result);
+ }
+ dest_y = AddByteStride(dest_y, dest_stride);
+ }
+ p += step_y;
+ const int p_diff =
+ (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+ prev_p = p;
+ // Here we load extra source in case it is needed. If |p_diff| == 0, these
+ // values will be unused, but it's faster to load than to branch.
+ s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
+ if (grade_y > 1) {
+ s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
+ }
+
+ filter_id = (p >> 6) & kSubPixelMask;
+ filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
+ if (is_compound) {
+ assert(width != 2);
+ const uint16x4_t result =
+ vreinterpret_u16_s16(vadd_s16(sums, vdup_n_s16(kCompoundOffset)));
+ vst1_u16(compound_dest_y, result);
+ compound_dest_y += dest_stride;
+ } else {
+ const uint16x4_t result = vmin_u16(vreinterpret_u16_s16(sums),
+ vdup_n_u16((1 << kBitdepth10) - 1));
+ if (width == 2) {
+ Store2<0>(dest_y, result);
+ } else {
+ vst1_u16(dest_y, result);
+ }
+ dest_y = AddByteStride(dest_y, dest_stride);
+ }
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+ prev_p = p;
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int num_taps, int grade_y, bool is_compound>
+void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT const source,
+ const int intermediate_height, const int width,
+ const int subpixel_y, const int filter_index,
+ const int step_y, const int height,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ // This stride always corresponds to int16_t.
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+
+ int16x8_t s[num_taps + 2];
+
+ const int16_t* src = source;
+ int x = 0;
+ do {
+ const int16_t* src_y = src;
+ int p = subpixel_y & 1023;
+ int prev_p = p;
+ // We increment stride with the 8-bit pointer and then reinterpret to avoid
+ // shifting |dest_stride|.
+ auto* dest_y = static_cast<uint16_t*>(dest) + x;
+ // In compound mode, |dest_stride| is based on the size of uint16_t, rather
+ // than bytes.
+ auto* compound_dest_y = static_cast<uint16_t*>(dest) + x;
+ int y = height;
+ do {
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = vld1q_s16(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ int16x8_t filter =
+ vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ int16x8_t sums =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
+ if (is_compound) {
+ // This offset potentially overflows int16_t, but should yield the
+ // correct unsigned value.
+ const uint16x8_t result = vreinterpretq_u16_s16(
+ vaddq_s16(sums, vdupq_n_s16(kCompoundOffset)));
+ vst1q_u16(compound_dest_y, result);
+ compound_dest_y += dest_stride;
+ } else {
+ const uint16x8_t result = vminq_u16(
+ vreinterpretq_u16_s16(sums), vdupq_n_u16((1 << kBitdepth10) - 1));
+ vst1q_u16(dest_y, result);
+ dest_y = AddByteStride(dest_y, dest_stride);
+ }
+ p += step_y;
+ const int p_diff =
+ (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+ prev_p = p;
+ // Here we load extra source in case it is needed. If |p_diff| == 0, these
+ // values will be unused, but it's faster to load than to branch.
+ s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
+ if (grade_y > 1) {
+ s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
+ }
+
+ filter_id = (p >> 6) & kSubPixelMask;
+ filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ sums = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
+ if (is_compound) {
+ assert(width != 2);
+ const uint16x8_t result = vreinterpretq_u16_s16(
+ vaddq_s16(sums, vdupq_n_s16(kCompoundOffset)));
+ vst1q_u16(compound_dest_y, result);
+ compound_dest_y += dest_stride;
+ } else {
+ const uint16x8_t result = vminq_u16(
+ vreinterpretq_u16_s16(sums), vdupq_n_u16((1 << kBitdepth10) - 1));
+ vst1q_u16(dest_y, result);
+ dest_y = AddByteStride(dest_y, dest_stride);
+ }
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+ prev_p = p;
+
+ y -= 2;
+ } while (y != 0);
+ src += kIntermediateStride * intermediate_height;
+ x += 8;
+ } while (x < width);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_NEON(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index, const int subpixel_x,
+ const int subpixel_y, const int step_x,
+ const int step_y, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ assert(step_x <= 2048);
+ assert(step_y <= 2048);
+ const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ num_vert_taps;
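+  // The vertical pass covers a subpel span of (height - 1) * step_y, rounded
+  // up to whole rows, plus |num_vert_taps| rows of filter context.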
+ int16_t intermediate_result[kIntermediateAllocWidth *
+ (2 * kIntermediateAllocWidth + 8)];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+ memset(intermediate_result, 0x54, sizeof(intermediate_result));
+#endif
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [3, 5].
+ // The same applies to height and vertical filter index.
+  int filter_index = horiz_filter_index;
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint16_t*>(reference);
+ const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+ src = AddByteStride(src, vert_kernel_offset * src_stride);
+
+ // Derive the maximum value of |step_x| at which all source values fit in one
+ // 16-byte (8-value) load. Final index is src_x + |num_taps| - 1 < 16
+ // step_x*7 is the final base subpel index for the shuffle mask for filter
+ // inputs in each iteration on large blocks. When step_x is large, we need a
+ // larger structure and use a larger table lookup in order to gather all
+ // filter inputs.
+ const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+ // |num_taps| - 1 is the shuffle index of the final filter input.
+ const int kernel_start_ceiling = 16 - num_horiz_taps;
+ // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+ // (step_x * 7) >> kScaleSubPixelBits < single load limit
+ const int grade_x_threshold =
+ (kernel_start_ceiling << kScaleSubPixelBits) / 7;
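+  // For example, with 6 taps kernel_start_ceiling is 10, so steps up to
+  // (10 << 10) / 7 = 1462 (about 1.43 source pixels per output pixel) can
+  // still take the single-load (grade_x == 1) path below.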
+
+ switch (filter_index) {
+ case 0:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalSigned6Tap<2>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveKernelHorizontalSigned6Tap<1>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 1:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+
+ } else {
+ ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 2:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalSigned8Tap<2>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveKernelHorizontalSigned8Tap<1>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 3:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 4:
+ assert(width <= 4);
+ ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x,
+ intermediate_height, intermediate);
+ break;
+ default:
+ assert(filter_index == 5);
+ ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x,
+ intermediate_height, intermediate);
+ }
+
+ // Vertical filter.
+  filter_index = vert_filter_index;
+ intermediate = intermediate_result;
+ switch (filter_index) {
+ case 0:
+ case 1:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale2Or4xH<6, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale2Or4xH<6, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 1, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale2Or4xH<6, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale2Or4xH<6, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 2, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ }
+ break;
+ case 2:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale2Or4xH<8, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale2Or4xH<8, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 1, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale2Or4xH<8, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale2Or4xH<8, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 2, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ }
+ break;
+ case 3:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale2Or4xH<2, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale2Or4xH<2, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 1, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale2Or4xH<2, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale2Or4xH<2, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 2, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ }
+ break;
+ default:
+ assert(filter_index == 4 || filter_index == 5);
+ assert(height <= 4);
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale2Or4xH<4, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale2Or4xH<4, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 1, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale2Or4xH<4, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale2Or4xH<4, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 2, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ }
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
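+  // The convolve table is indexed by
+  // [is_intra_block_copy][is_compound][has_vertical_filter][has_horizontal].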
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
+ dsp->convolve[0][0][1][1] = Convolve2D_NEON;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;
+
+ dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
+ dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>;
+ dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>;
+}
+
+} // namespace
+
+void ConvolveInit10bpp_NEON() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10)
+
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit10bpp_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/convolve.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from exceeding the int16_t range.
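+// For example, a row of all 255s against the all-positive 6-tap filter sums
+// to at most 255 * 64 = 16320.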
+template <int filter_index, bool negative_outside_taps = false>
+int16x8_t SumOnePassTaps(const uint8x8_t* const src,
+ const uint8x8_t* const taps) {
+ uint16x8_t sum;
+ if (filter_index == 0) {
+ // 6 taps. + - + + - +
+ sum = vmull_u8(src[0], taps[0]);
+ // Unsigned overflow will result in a valid int16_t value.
+ sum = vmlsl_u8(sum, src[1], taps[1]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlsl_u8(sum, src[4], taps[4]);
+ sum = vmlal_u8(sum, src[5], taps[5]);
+ } else if (filter_index == 1 && negative_outside_taps) {
+ // 6 taps. - + + + + -
+ // Set a base we can subtract from.
+ sum = vmull_u8(src[1], taps[1]);
+ sum = vmlsl_u8(sum, src[0], taps[0]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlal_u8(sum, src[4], taps[4]);
+ sum = vmlsl_u8(sum, src[5], taps[5]);
+ } else if (filter_index == 1) {
+ // 6 taps. All are positive.
+ sum = vmull_u8(src[0], taps[0]);
+ sum = vmlal_u8(sum, src[1], taps[1]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlal_u8(sum, src[4], taps[4]);
+ sum = vmlal_u8(sum, src[5], taps[5]);
+ } else if (filter_index == 2) {
+ // 8 taps. - + - + + - + -
+ sum = vmull_u8(src[1], taps[1]);
+ sum = vmlsl_u8(sum, src[0], taps[0]);
+ sum = vmlsl_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlal_u8(sum, src[4], taps[4]);
+ sum = vmlsl_u8(sum, src[5], taps[5]);
+ sum = vmlal_u8(sum, src[6], taps[6]);
+ sum = vmlsl_u8(sum, src[7], taps[7]);
+ } else if (filter_index == 3) {
+ // 2 taps. All are positive.
+ sum = vmull_u8(src[0], taps[0]);
+ sum = vmlal_u8(sum, src[1], taps[1]);
+ } else if (filter_index == 4) {
+ // 4 taps. - + + -
+ sum = vmull_u8(src[1], taps[1]);
+ sum = vmlsl_u8(sum, src[0], taps[0]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlsl_u8(sum, src[3], taps[3]);
+ } else if (filter_index == 5) {
+ // 4 taps. All are positive.
+ sum = vmull_u8(src[0], taps[0]);
+ sum = vmlal_u8(sum, src[1], taps[1]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ }
+ return vreinterpretq_s16_u16(sum);
+}
+
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+ bool is_compound>
+void FilterHorizontalWidth8AndUp(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int width,
+ const int height,
+ const uint8x8_t* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+ if (!is_2d) {
+ int y = height;
+ do {
+ int x = 0;
+ do { // Increasing loop counter x is better.
+ const uint8x16_t src_long = vld1q_u8(src + x);
+ uint8x8_t v_src[8];
+ int16x8_t sum;
+ if (filter_index < 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+ v_tap + 1);
+ } else if (filter_index == 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+ } else if (filter_index == 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else if (filter_index > 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ if (is_compound) {
+ const uint16x8_t v_sum = vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+ vst1q_u16(&dest16[x], v_sum);
+ } else {
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+ // Combining them requires adding the rounding offset from the skipped
+ // shift.
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
+ vst1_u8(&dest8[x], result);
+ }
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+ } else {
+ int x = 0;
+ do {
+ const uint8_t* s = src + x;
+ int y = height;
+      do {
+ const uint8x16_t src_long = vld1q_u8(s);
+ uint8x8_t v_src[8];
+ int16x8_t sum;
+ if (filter_index < 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+ v_tap + 1);
+ } else if (filter_index == 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+ } else if (filter_index == 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else if (filter_index > 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ const uint16x8_t v_sum = vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+ vst1q_u16(dest16, v_sum);
+ s += src_stride;
+ dest16 += 8;
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+template <int filter_index, bool is_2d, bool is_compound>
+void FilterHorizontalWidth4(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int height,
+ const uint8x8_t* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+ int y = height;
+ do {
+ uint8x8_t v_src[4];
+ int16x8_t sum;
+ v_src[0] = vld1_u8(src);
+ if (filter_index == 3) {
+ v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else {
+ v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+ v_src[2] = RightShiftVector<2 * 8>(v_src[0]);
+ v_src[3] = RightShiftVector<3 * 8>(v_src[0]);
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ if (is_2d || is_compound) {
+ const uint16x4_t v_sum = vreinterpret_u16_s16(
+ vrshr_n_s16(vget_low_s16(sum), kInterRoundBitsHorizontal - 1));
+ vst1_u16(dest16, v_sum);
+ } else {
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
+ StoreLo4(&dest8[0], result);
+ }
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+}
+
+template <int filter_index, bool is_2d>
+void FilterHorizontalWidth2(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int height,
+ const uint8x8_t* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+ int y = height >> 1;
+ do {
+ const uint8x8_t input0 = vld1_u8(src);
+ const uint8x8_t input1 = vld1_u8(src + src_stride);
+ const uint8x8x2_t input = vzip_u8(input0, input1);
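+    // |input| interleaves the two rows: even lanes hold row 0 and odd lanes
+    // row 1, so one multiply filters both rows at once.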
+ uint16x8_t sum;
+ if (filter_index == 3) {
+ // tap signs : + +
+ sum = vmull_u8(input.val[0], v_tap[3]);
+ sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 2), v_tap[4]);
+ } else if (filter_index == 4) {
+ // tap signs : - + + -
+ sum = vmull_u8(RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+ sum = vmlsl_u8(sum, input.val[0], v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+ sum = vmlsl_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+ } else {
+ // tap signs : + + + +
+ sum = vmull_u8(input.val[0], v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+ sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+ }
+ int16x8_t s = vreinterpretq_s16_u16(sum);
+ if (is_2d) {
+ const uint16x8_t v_sum =
+ vreinterpretq_u16_s16(vrshrq_n_s16(s, kInterRoundBitsHorizontal - 1));
+ dest16[0] = vgetq_lane_u16(v_sum, 0);
+ dest16[1] = vgetq_lane_u16(v_sum, 2);
+ dest16 += pred_stride;
+ dest16[0] = vgetq_lane_u16(v_sum, 1);
+ dest16[1] = vgetq_lane_u16(v_sum, 3);
+ dest16 += pred_stride;
+ } else {
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+ // Combining them requires adding the rounding offset from the skipped
+ // shift.
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ s = vaddq_s16(s, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(s, kFilterBits - 1);
+ dest8[0] = vget_lane_u8(result, 0);
+ dest8[1] = vget_lane_u8(result, 2);
+ dest8 += pred_stride;
+ dest8[0] = vget_lane_u8(result, 1);
+ dest8[1] = vget_lane_u8(result, 3);
+ dest8 += pred_stride;
+ }
+ src += src_stride << 1;
+ } while (--y != 0);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
+ if (is_2d) {
+ assert(height % 2 == 1);
+ const uint8x8_t input = vld1_u8(src);
+ uint16x8_t sum;
+ if (filter_index == 3) {
+ sum = vmull_u8(input, v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[4]);
+ } else if (filter_index == 4) {
+ sum = vmull_u8(RightShiftVector<1 * 8>(input), v_tap[3]);
+ sum = vmlsl_u8(sum, input, v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+ sum = vmlsl_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+ } else {
+ assert(filter_index == 5);
+ sum = vmull_u8(input, v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+ sum = vmlal_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+ }
+ // |sum| contains an int16_t value.
+ sum = vreinterpretq_u16_s16(vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+ kInterRoundBitsHorizontal - 1));
+ Store2<0>(dest16, sum);
+ }
+}
+
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+ bool is_compound>
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT const src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int width,
+ const int height, const uint8x8_t* const v_tap) {
+ assert(width < 8 || filter_index <= 3);
+ // Don't simplify the redundant if conditions with the template parameters,
+ // which helps the compiler generate compact code.
+ if (width >= 8 && filter_index <= 3) {
+ FilterHorizontalWidth8AndUp<filter_index, negative_outside_taps, is_2d,
+ is_compound>(src, src_stride, dest, pred_stride,
+ width, height, v_tap);
+ return;
+ }
+
+  // The horizontal pass only needs to account for the 2 and 4 tap filters
+  // when |width| <= 4.
+ assert(width <= 4);
+ assert(filter_index >= 3 && filter_index <= 5);
+ if (filter_index >= 3 && filter_index <= 5) {
+ if (width == 2 && !is_compound) {
+ FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
+ pred_stride, height, v_tap);
+ return;
+ }
+ assert(width == 4);
+ FilterHorizontalWidth4<filter_index, is_2d, is_compound>(
+ src, src_stride, dest, pred_stride, height, v_tap);
+ }
+}
+
+// Process 16 bit inputs and output 32 bits.
+template <int num_taps, bool is_compound>
+inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
+ const int16x8_t taps) {
+ const int16x4_t taps_lo = vget_low_s16(taps);
+ const int16x4_t taps_hi = vget_high_s16(taps);
+ int32x4_t sum;
+ if (num_taps == 8) {
+ sum = vmull_lane_s16(src[0], taps_lo, 0);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
+ sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
+ sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
+ sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
+ } else if (num_taps == 6) {
+ sum = vmull_lane_s16(src[0], taps_lo, 1);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
+ sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
+ } else if (num_taps == 4) {
+ sum = vmull_lane_s16(src[0], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
+ } else if (num_taps == 2) {
+ sum = vmull_lane_s16(src[0], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
+ }
+
+ if (is_compound) {
+ return vqrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
+ }
+
+ return vqrshrn_n_s32(sum, kInterRoundBitsVertical - 1);
+}
+
+template <int num_taps, bool is_compound>
+int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
+ const int16x8_t taps) {
+ const int16x4_t taps_lo = vget_low_s16(taps);
+ const int16x4_t taps_hi = vget_high_s16(taps);
+ int32x4_t sum_lo, sum_hi;
+ if (num_taps == 8) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
+ } else if (num_taps == 6) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
+ } else if (num_taps == 4) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
+ } else if (num_taps == 2) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);
+
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
+ }
+
+ if (is_compound) {
+ return vcombine_s16(
+ vqrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ vqrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
+ }
+
+ return vcombine_s16(vqrshrn_n_s32(sum_lo, kInterRoundBitsVertical - 1),
+ vqrshrn_n_s32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVerticalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const int16x8_t taps) {
+ assert(width >= 8);
+ constexpr int next_row = num_taps - 1;
+ auto* const dst8 = static_cast<uint8_t*>(dst);
+ auto* const dst16 = static_cast<uint16_t*>(dst);
+
+ int x = 0;
+ do {
+ int16x8_t srcs[9];
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ if (num_taps >= 4) {
+ srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ if (num_taps >= 6) {
+ srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ if (num_taps == 8) {
+ srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ }
+ }
+ }
+
+ uint8_t* d8 = dst8 + x;
+ uint16_t* d16 = dst16 + x;
+ int y = height;
+ do {
+ srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[next_row + 1] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ const int16x8_t sum0 =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps);
+ const int16x8_t sum1 =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps);
+ if (is_compound) {
+ vst1q_u16(d16, vreinterpretq_u16_s16(sum0));
+ d16 += dst_stride;
+ vst1q_u16(d16, vreinterpretq_u16_s16(sum1));
+ d16 += dst_stride;
+ } else {
+ vst1_u8(d8, vqmovun_s16(sum0));
+ d8 += dst_stride;
+ vst1_u8(d8, vqmovun_s16(sum1));
+ d8 += dst_stride;
+ }
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+ x += 8;
+ } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVerticalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int16x8_t srcs[9];
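+  // The intermediate stride equals the width (4), so each q register holds
+  // two consecutive rows; odd-indexed entries are built by combining the
+  // adjacent halves.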
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ if (num_taps >= 4) {
+ srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
+ if (num_taps >= 6) {
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
+ if (num_taps == 8) {
+ srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
+ vget_low_s16(srcs[num_taps]));
+
+ const int16x8_t sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = vreinterpretq_u16_s16(sum);
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqmovun_s16(sum);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+}
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVerticalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
+ constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ int16x8_t srcs[9];
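+  // With |width| == 2 each q register covers four rows; the intermediate row
+  // alignments are produced with the vextq/vcombine shuffles below.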
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ if (num_taps >= 6) {
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ if (num_taps == 8) {
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ if (num_taps == 2) {
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ } else if (num_taps == 4) {
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ } else if (num_taps == 6) {
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+ } else if (num_taps == 8) {
+ srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+ srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
+ srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
+ }
+
+ const int16x8_t sum =
+ SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+ const uint8x8_t results = vqmovun_s16(sum);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+ // Therefore we don't need to check this condition when |height| > 4.
+ if (num_taps <= 4 && height == 2) return;
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ if (num_taps == 6) {
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ } else if (num_taps == 8) {
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ }
+
+ y += 4;
+ } while (y < height);
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+ const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+ const int width, const int height, const int filter_id,
+ const int filter_index) {
+ // Duplicate the absolute value for each tap. Negative taps are corrected
+ // by using the vmlsl_u8 instruction. Positive taps use vmlal_u8.
+ uint8x8_t v_tap[kSubPixelTaps];
+ assert(filter_id != 0);
+
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ v_tap[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
+ }
+
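+  // The |src| offsets below (+1, +2, +3) skip the leading zero taps of the
+  // shorter kernels, which are centered within the 8-tap window.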
+ if (filter_index == 2) { // 8 tap.
+ FilterHorizontal<2, true, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 1) { // 6 tap.
+ // Check if outside taps are positive.
+ if ((filter_id == 1) | (filter_id == 15)) {
+ FilterHorizontal<1, false, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
+ } else {
+ FilterHorizontal<1, true, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+ } else if (filter_index == 0) { // 6 tap.
+ FilterHorizontal<0, true, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 4) { // 4 tap.
+ FilterHorizontal<4, true, is_2d, is_compound>(
+ src + 2, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 5) { // 4 tap.
+ FilterHorizontal<5, true, is_2d, is_compound>(
+ src + 2, src_stride, dst, dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ FilterHorizontal<3, true, is_2d, is_compound>(
+ src + 3, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
+template <int vertical_taps>
+void Filter2DVertical(
+ const uint16_t* LIBGAV1_RESTRICT const intermediate_result, const int width,
+ const int height, const int16x8_t taps,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+ auto* const dest = static_cast<uint8_t*>(prediction);
+ if (width >= 8) {
+ Filter2DVerticalWidth8AndUp<vertical_taps>(
+ intermediate_result, dest, pred_stride, width, height, taps);
+ } else if (width == 4) {
+ Filter2DVerticalWidth4<vertical_taps>(intermediate_result, dest,
+ pred_stride, height, taps);
+ } else {
+ assert(width == 2);
+ Filter2DVerticalWidth2<vertical_taps>(intermediate_result, dest,
+ pred_stride, height, taps);
+ }
+}
+
+void Convolve2D_NEON(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
+ uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+ memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
+ const int intermediate_height = height + vertical_taps - 1;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* const src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+ width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ assert(vertical_filter_id != 0);
+ const int16x8_t taps = vmovl_s8(
+ vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+ if (vertical_taps == 8) {
+ Filter2DVertical<8>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
+ } else if (vertical_taps == 6) {
+ Filter2DVertical<6>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
+ } else if (vertical_taps == 4) {
+ Filter2DVertical<4>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
+ } else { // |vertical_taps| == 2
+ Filter2DVertical<2>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
+ }
+}
+
+// There are many opportunities for overreading in scaled convolve, because the
+// range of starting points for filter windows is anywhere from 0 to 16 for 8
+// destination pixels, and the window sizes range from 2 to 8. To accommodate
+// this range concisely, we use |grade_x| to mean the most steps in src that can
+// be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2,
+// we are guaranteed to exceed 8 whole steps in src for every 8 |step_x|
+// increments. The first load covers the initial elements of src_x, while the
+// final load covers the taps.
+template <int grade_x>
+inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) {
+ uint8x8x3_t ret;
+ const uint8x16_t src_val = vld1q_u8(src_x);
+ ret.val[0] = vget_low_u8(src_val);
+ ret.val[1] = vget_high_u8(src_val);
+#if LIBGAV1_MSAN
+ // Initialize to quiet msan warnings when grade_x <= 1.
+ ret.val[2] = vdup_n_u8(0);
+#endif
+ if (grade_x > 1) {
+ ret.val[2] = vld1_u8(src_x + 16);
+ }
+ return ret;
+}
+
+// Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3]
+inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
+ assert(tap_index < 2);
+ alignas(
+ 16) static constexpr uint8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
+ {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+ {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+
+ return vld1q_u8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
+}
+
+template <int grade_x>
+inline void ConvolveKernelHorizontal2Tap(
+ const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ const int width, const int subpixel_x, const int step_x,
+ const int intermediate_height, int16_t* LIBGAV1_RESTRICT intermediate) {
+ // Account for the 0-taps that precede the 2 nonzero taps.
+ const int kernel_offset = 3;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ const uint8x16_t filter_taps0 = GetPositive2TapFilter(0);
+ const uint8x16_t filter_taps1 = GetPositive2TapFilter(1);
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+
+ int p = subpixel_x;
+ if (width <= 4) {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+ // This is a special case. The 2-tap filter has no negative taps, so we
+ // can use unsigned values.
+ // For each x, a lane of taps[k] has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices)};
+ int y = intermediate_height;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x16_t src_vals = vld1q_u8(src_x);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+
+ // For each x, a lane of src[k] contains src_x[k].
+ const uint8x8_t src[2] = {
+ VQTbl1U8(src_vals, src_indices),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
+
+ vst1q_s16(intermediate,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (--y != 0);
+ return;
+ }
+
+ // |width| >= 8
+ int x = 0;
+ do {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // This is a special case. The 2-tap filter has no negative taps, so we
+ // can use unsigned values.
+ // For each x, a lane of taps[k] has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices)};
+ int y = intermediate_height;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+
+ // For each x, a lane of src[k] contains src_x[k].
+ const uint8x8_t src[2] = {
+ vtbl3_u8(src_vals, src_indices),
+ vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
+
+ vst1q_s16(intermediate,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (--y != 0);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
+inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
+ assert(tap_index < 4);
+ alignas(
+ 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+ {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+
+ return vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+void ConvolveKernelHorizontalPositive4Tap(
+ const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ const int subpixel_x, const int step_x, const int intermediate_height,
+ int16_t* LIBGAV1_RESTRICT intermediate) {
+ const int kernel_offset = 2;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const uint8x16_t filter_taps0 = GetPositive4TapFilter(0);
+ const uint8x16_t filter_taps1 = GetPositive4TapFilter(1);
+ const uint8x16_t filter_taps2 = GetPositive4TapFilter(2);
+ const uint8x16_t filter_taps3 = GetPositive4TapFilter(3);
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+ const int p = subpixel_x;
+ // The first filter (filter_id 0) is the identity: a single tap of 128 on
+ // the center (64 in these half-filter tables).
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices = vand_u8(
+ vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask);
+ // Note that filter_id depends on x.
+ // For each x, taps[k] has kSubPixelFilters[filter_index][filter_id][k].
+ const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices),
+ VQTbl1U8(filter_taps2, filter_indices),
+ VQTbl1U8(filter_taps3, filter_indices)};
+
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ int y = intermediate_height;
+ do {
+ // Load a pool of samples to select from using stepped index vectors.
+ const uint8x16_t src_vals = vld1q_u8(src_x);
+
+ // For each x, src[k] contains src_x[k].
+ // Whereas taps come from different arrays, src pixels are drawn from the
+ // same contiguous line.
+ const uint8x8_t src[4] = {
+ VQTbl1U8(src_vals, src_indices),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1))),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2))),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)))};
+
+ vst1q_s16(intermediate,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/5>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (--y != 0);
+}
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
+inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
+ assert(tap_index < 4);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
+ {0, 2, 4, 5, 6, 6, 7, 6, 6, 5, 5, 5, 4, 3, 2, 1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 6, 6, 5, 4, 2}};
+
+ return vld1q_u8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+inline void ConvolveKernelHorizontalSigned4Tap(
+ const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ const int subpixel_x, const int step_x, const int intermediate_height,
+ int16_t* LIBGAV1_RESTRICT intermediate) {
+ const int kernel_offset = 2;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const uint8x16_t filter_taps0 = GetSigned4TapFilter(0);
+ const uint8x16_t filter_taps1 = GetSigned4TapFilter(1);
+ const uint8x16_t filter_taps2 = GetSigned4TapFilter(2);
+ const uint8x16_t filter_taps3 = GetSigned4TapFilter(3);
+ const uint16x4_t index_steps = vmul_n_u16(vcreate_u16(0x0003000200010000),
+ static_cast<uint16_t>(step_x));
+
+ const int p = subpixel_x;
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x4_t p_fraction = vdup_n_u16(p & 1023);
+ const uint16x4_t subpel_index_offsets = vadd_u16(index_steps, p_fraction);
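+ // Only four x-positions are needed here, so pad the high half with zero
+ // lanes; the narrowing shift below requires a full 8-lane vector.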
+ const uint8x8_t filter_index_offsets = vshrn_n_u16(
+ vcombine_u16(subpel_index_offsets, vdup_n_u16(0)), kFilterIndexShift);
+ const uint8x8_t filter_indices =
+ vand_u8(filter_index_offsets, filter_index_mask);
+ // Note that filter_id depends on x.
+ // For each x, taps[k] has kSubPixelFilters[filter_index][filter_id][k].
+ const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices),
+ VQTbl1U8(filter_taps2, filter_indices),
+ VQTbl1U8(filter_taps3, filter_indices)};
+
+ const uint8x8_t src_indices_base =
+ vshr_n_u8(filter_index_offsets, kScaleSubPixelBits - kFilterIndexShift);
+
+ const uint8x8_t src_indices[4] = {src_indices_base,
+ vadd_u8(src_indices_base, vdup_n_u8(1)),
+ vadd_u8(src_indices_base, vdup_n_u8(2)),
+ vadd_u8(src_indices_base, vdup_n_u8(3))};
+
+ int y = intermediate_height;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x16_t src_vals = vld1q_u8(src_x);
+
+ // For each x, src[k] contains src_x[k].
+ // Whereas taps come from different arrays, src pixels are drawn from the
+ // same contiguous line.
+ const uint8x8_t src[4] = {
+ VQTbl1U8(src_vals, src_indices[0]), VQTbl1U8(src_vals, src_indices[1]),
+ VQTbl1U8(src_vals, src_indices[2]), VQTbl1U8(src_vals, src_indices[3])};
+
+ vst1q_s16(intermediate,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/4>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (--y != 0);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
+inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
+ assert(tap_index < 6);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
+ {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+ {0, 3, 5, 6, 7, 7, 8, 7, 7, 6, 6, 6, 5, 4, 2, 1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, 1, 2, 4, 5, 6, 6, 6, 7, 7, 8, 7, 7, 6, 5, 3},
+ {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+ return vld1q_u8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned6Tap(
+ const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ const int width, const int subpixel_x, const int step_x,
+ const int intermediate_height,
+ int16_t* LIBGAV1_RESTRICT const intermediate) {
+ const int kernel_offset = 1;
+ const uint8x8_t one = vdup_n_u8(1);
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ uint8x16_t filter_taps[6];
+ for (int i = 0; i < 6; ++i) {
+ filter_taps[i] = GetSigned6TapFilter(i);
+ }
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+ int16_t* intermediate_x = intermediate;
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ // Avoid over-reading outside the reference boundaries. With
+ // |grade_x| == 2 the lookup window can span up to 24 source bytes.
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ uint8x8_t src_lookup[6];
+ src_lookup[0] = src_indices;
+ for (int i = 1; i < 6; ++i) {
+ src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+ }
+
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // For each x, a lane of taps[k] has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ uint8x8_t taps[6];
+ for (int i = 0; i < 6; ++i) {
+ taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+ }
+ int y = intermediate_height;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+ const uint8x8_t src[6] = {
+ vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+ vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+ vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5])};
+
+ vst1q_s16(intermediate_x,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/0>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (--y != 0);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
+// has mixed positive and negative outer taps which are handled in
+// GetMixed6TapFilter().
+inline uint8x16_t GetPositive6TapFilter(const int tap_index) {
+ assert(tap_index < 6);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel6TapPositiveFilterColumns[4][16] = {
+ {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}};
+
+ return vld1q_u8(kAbsHalfSubPixel6TapPositiveFilterColumns[tap_index]);
+}
+
+inline int8x16_t GetMixed6TapFilter(const int tap_index) {
+ assert(tap_index < 2);
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel6TapMixedFilterColumns[2][16] = {
+ {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+
+ return vld1q_s8(kHalfSubPixel6TapMixedFilterColumns[tap_index]);
+}
+
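+ // The 6-tap filters in |kAbsHalfSubPixelFilters|[1] split into four inner
+ // taps that are always positive (GetPositive6TapFilter()) and two outer
+ // taps whose values lie in {-1, 0, 1} (GetMixed6TapFilter()). The kernel
+ // below accumulates the two groups separately and relies on two's
+ // complement wraparound when they are combined in one 16-bit register.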
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalMixed6Tap(
+ const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ const int width, const int subpixel_x, const int step_x,
+ const int intermediate_height,
+ int16_t* LIBGAV1_RESTRICT const intermediate) {
+ const int kernel_offset = 1;
+ const uint8x8_t one = vdup_n_u8(1);
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ uint8x8_t taps[4];
+ int16x8_t mixed_taps[2];
+ uint8x16_t positive_filter_taps[4];
+ for (int i = 0; i < 4; ++i) {
+ positive_filter_taps[i] = GetPositive6TapFilter(i);
+ }
+ int8x16_t mixed_filter_taps[2];
+ mixed_filter_taps[0] = GetMixed6TapFilter(0);
+ mixed_filter_taps[1] = GetMixed6TapFilter(1);
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+ int16_t* intermediate_x = intermediate;
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ uint8x8_t src_lookup[6];
+ src_lookup[0] = src_indices;
+ for (int i = 1; i < 6; ++i) {
+ src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+ }
+
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // For each x, a lane of taps[k] has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ for (int i = 0; i < 4; ++i) {
+ taps[i] = VQTbl1U8(positive_filter_taps[i], filter_indices);
+ }
+ mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
+ mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
+
+ int y = intermediate_height;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+ int16x8_t sum_mixed = vmulq_s16(
+ mixed_taps[0], ZeroExtend(vtbl3_u8(src_vals, src_lookup[0])));
+ sum_mixed = vmlaq_s16(sum_mixed, mixed_taps[1],
+ ZeroExtend(vtbl3_u8(src_vals, src_lookup[5])));
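+ // Reinterpreting the signed sum as unsigned is safe: vmlal_u8 wraps
+ // modulo 2^16, so the final s16 view of |sum| is still correct.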
+ uint16x8_t sum = vreinterpretq_u16_s16(sum_mixed);
+ sum = vmlal_u8(sum, taps[0], vtbl3_u8(src_vals, src_lookup[1]));
+ sum = vmlal_u8(sum, taps[1], vtbl3_u8(src_vals, src_lookup[2]));
+ sum = vmlal_u8(sum, taps[2], vtbl3_u8(src_vals, src_lookup[3]));
+ sum = vmlal_u8(sum, taps[3], vtbl3_u8(src_vals, src_lookup[4]));
+
+ vst1q_s16(intermediate_x, vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (--y != 0);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
+inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
+ assert(tap_index < 8);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
+ {0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0},
+ {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+ {0, 3, 6, 9, 11, 11, 12, 12, 12, 11, 10, 9, 7, 5, 3, 1},
+ {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+ {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+ {0, 1, 3, 5, 7, 9, 10, 11, 12, 12, 12, 11, 11, 9, 6, 3},
+ {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+ {0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1}};
+
+ return vld1q_u8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned8Tap(
+ const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ const int width, const int subpixel_x, const int step_x,
+ const int intermediate_height,
+ int16_t* LIBGAV1_RESTRICT const intermediate) {
+ const uint8x8_t one = vdup_n_u8(1);
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ uint8x8_t taps[8];
+ uint8x16_t filter_taps[8];
+ for (int i = 0; i < 8; ++i) {
+ filter_taps[i] = GetSigned8TapFilter(i);
+ }
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+ int16_t* intermediate_x = intermediate;
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ uint8x8_t src_lookup[8];
+ src_lookup[0] = src_indices;
+ for (int i = 1; i < 8; ++i) {
+ src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+ }
+
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // For each x, a lane of taps[k] has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ for (int i = 0; i < 8; ++i) {
+ taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+ }
+
+ int y = intermediate_height;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+ const uint8x8_t src[8] = {
+ vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+ vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+ vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5]),
+ vtbl3_u8(src_vals, src_lookup[6]), vtbl3_u8(src_vals, src_lookup[7])};
+
+ vst1q_s16(intermediate_x,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/2>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (--y != 0);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// This function handles blocks of width 2 or 4.
+template <int num_taps, int grade_y, int width, bool is_compound>
+void ConvolveVerticalScale4xH(const int16_t* LIBGAV1_RESTRICT const src,
+ const int subpixel_y, const int filter_index,
+ const int step_y, const int height,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+ const int16_t* src_y = src;
+ // |dest| is 16-bit in compound mode, Pixel otherwise.
+ auto* dest16_y = static_cast<uint16_t*>(dest);
+ auto* dest_y = static_cast<uint8_t*>(dest);
+ int16x4_t s[num_taps + grade_y];
+
+ int p = subpixel_y & 1023;
+ int prev_p = p;
+ int y = height;
+ do {
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = vld1_s16(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ int16x8_t filter =
+ vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
+ if (is_compound) {
+ assert(width != 2);
+ const uint16x4_t result = vreinterpret_u16_s16(sums);
+ vst1_u16(dest16_y, result);
+ } else {
+ const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+ if (width == 2) {
+ Store2<0>(dest_y, result);
+ } else {
+ StoreLo4(dest_y, result);
+ }
+ }
+ p += step_y;
+ const int p_diff =
+ (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+ prev_p = p;
+ // Here we load extra source in case it is needed. If |p_diff| == 0, these
+ // values will be unused, but it's faster to load than to branch.
+ s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
+ if (grade_y > 1) {
+ s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
+ }
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+
+ filter_id = (p >> 6) & kSubPixelMask;
+ filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
+ if (is_compound) {
+ assert(width != 2);
+ const uint16x4_t result = vreinterpret_u16_s16(sums);
+ vst1_u16(dest16_y, result);
+ } else {
+ const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+ if (width == 2) {
+ Store2<0>(dest_y, result);
+ } else {
+ StoreLo4(dest_y, result);
+ }
+ }
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+ prev_p = p;
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+ y -= 2;
+ } while (y != 0);
+}
+
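+ // In both vertical scale paths, successive output rows may start on the
+ // same source row or on rows 1 or 2 apart. For example (illustrative),
+ // with step_y == 1536 and p == 0 the source row sequence is
+ // 0, 1, 3, 4, 6, ..., so |p_diff| alternates between 1 and 2 and
+ // |grade_y| == 2 extra rows must be preloaded; step_y <= 1024 bounds
+ // |p_diff| by 1.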
+template <int num_taps, int grade_y, bool is_compound>
+inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT const source,
+ const int intermediate_height,
+ const int width, const int subpixel_y,
+ const int filter_index, const int step_y,
+ const int height,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+ // A possible improvement is to use arithmetic to decide how many times to
+ // apply filters to the same source rows before checking whether to load
+ // new srcs. However, this will only improve performance with very small
+ // step sizes.
+ int16x8_t s[num_taps + grade_y];
+ // |dest| is 16-bit in compound mode, Pixel otherwise.
+ uint16_t* dest16_y;
+ uint8_t* dest_y;
+ const int16_t* src = source;
+
+ int x = 0;
+ do {
+ const int16_t* src_y = src;
+ dest16_y = static_cast<uint16_t*>(dest) + x;
+ dest_y = static_cast<uint8_t*>(dest) + x;
+ int p = subpixel_y & 1023;
+ int prev_p = p;
+ int y = height;
+ do {
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = vld1q_s16(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ int16x8_t filter =
+ vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ int16x8_t sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
+ if (is_compound) {
+ vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+ } else {
+ vst1_u8(dest_y, vqmovun_s16(sum));
+ }
+ p += step_y;
+ const int p_diff =
+ (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+ // |grade_y| > 1 always means p_diff > 0, so load vectors that may be
+ // needed. Otherwise, we only need to load one vector because |p_diff|
+ // can't exceed 1.
+ s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
+ if (grade_y > 1) {
+ s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
+ }
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+
+ filter_id = (p >> 6) & kSubPixelMask;
+ filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
+ if (is_compound) {
+ vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+ } else {
+ vst1_u8(dest_y, vqmovun_s16(sum));
+ }
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+ prev_p = p;
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+ y -= 2;
+ } while (y != 0);
+ src += kIntermediateStride * intermediate_height;
+ x += 8;
+ } while (x < width);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_NEON(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index, const int subpixel_x,
+ const int subpixel_y, const int step_x,
+ const int step_y, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ assert(step_x <= 2048);
+ assert(step_y <= 2048);
+ const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ num_vert_taps;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kIntermediateAllocWidth *
+ (2 * kIntermediateAllocWidth + 8)];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings. Set to an arbitrary non-zero value to aid in
+ // debugging.
+ memset(intermediate_result, 0x44, sizeof(intermediate_result));
+#endif
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [3, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+ src += vert_kernel_offset * src_stride;
+
+ // Derive the maximum value of |step_x| at which all source values fit in
+ // one 16-byte load; the final index must satisfy
+ // src_x + |num_taps| - 1 < 16. step_x * 7 is the final base subpel index
+ // for the shuffle mask for filter inputs in each iteration on large
+ // blocks; |num_taps| - 1 is the shuffle index of the final filter input.
+ // When step_x is large, we need a larger structure and use a larger table
+ // lookup in order to gather all filter inputs.
+ const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+ const int kernel_start_ceiling = 16 - num_horiz_taps;
+ // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+ // (step_x * 7) >> kScaleSubPixelBits < single load limit
+ const int grade_x_threshold =
+ (kernel_start_ceiling << kScaleSubPixelBits) / 7;
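+ // For example (illustrative): an 8-tap filter gives kernel_start_ceiling
+ // == 8 and grade_x_threshold == (8 << 10) / 7 == 1170, so any step_x up
+ // to 1170 can use the single-load (grade_x == 1) kernels.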
+ switch (filter_index) {
+ case 0:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalSigned6Tap<2>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveKernelHorizontalSigned6Tap<1>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 1:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+
+ } else {
+ ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 2:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalSigned8Tap<2>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveKernelHorizontalSigned8Tap<1>(
+ src, src_stride, width, subpixel_x, step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 3:
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 4:
+ assert(width <= 4);
+ ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x,
+ intermediate_height, intermediate);
+ break;
+ default:
+ assert(filter_index == 5);
+ ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x,
+ intermediate_height, intermediate);
+ }
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
+
+ switch (filter_index) {
+ case 0:
+ case 1:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<6, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<6, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 1, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<6, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<6, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 2, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ }
+ break;
+ case 2:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<8, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<8, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 1, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<8, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<8, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 2, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ }
+ break;
+ case 3:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<2, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<2, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 1, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<2, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<2, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 2, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ }
+ break;
+ case 4:
+ default:
+ assert(filter_index == 4 || filter_index == 5);
+ assert(height <= 4);
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<4, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<4, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 1, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<4, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<4, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 2, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ filter_index, step_y, height, prediction, pred_stride);
+ }
+ }
+ }
+}
+
+void ConvolveHorizontal_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
+ const auto* const src =
+ static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* const dest = static_cast<uint8_t*>(prediction);
+
+ DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
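+ // Because the half-size filter taps are used, the shift is reduced by 1
+ // relative to the full-precision tap sum; with the assumed 8bpp constant
+ // kInterRoundBitsHorizontal == 3 this is a rounding shift by 2.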
+uint16x8_t Compound1DShift(const int16x8_t sum) {
+ return vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
+
+template <int filter_index, bool is_compound = false,
+ bool negative_outside_taps = false>
+void FilterVertical(const uint8_t* LIBGAV1_RESTRICT const src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const uint8x8_t* const taps) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps - 1;
+ auto* const dst8 = static_cast<uint8_t*>(dst);
+ auto* const dst16 = static_cast<uint16_t*>(dst);
+ assert(width >= 8);
+
+ int x = 0;
+ do {
+ const uint8_t* src_x = src + x;
+ uint8x8_t srcs[8];
+ srcs[0] = vld1_u8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = vld1_u8(src_x);
+ src_x += src_stride;
+ srcs[2] = vld1_u8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = vld1_u8(src_x);
+ src_x += src_stride;
+ srcs[4] = vld1_u8(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = vld1_u8(src_x);
+ src_x += src_stride;
+ srcs[6] = vld1_u8(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ // Decreasing the y loop counter produces worse code with clang.
+ // Don't unroll this loop since it generates too much code and the decoder
+ // is even slower.
+ int y = 0;
+ do {
+ srcs[next_row] = vld1_u8(src_x);
+ src_x += src_stride;
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+ vst1q_u16(dst16 + x + y * dst_stride, results);
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+ vst1_u8(dst8 + x + y * dst_stride, results);
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false,
+ bool negative_outside_taps = false>
+void FilterVertical4xH(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const uint8x8_t* const taps) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ uint8x8_t srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+
+ int y = height;
+ do {
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4<0>(src, srcs[2]);
+ src += src_stride;
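+ // srcs[0] now holds rows 0-1; srcs[2] holds row 2 in its low half.
+ // Shifting by 4 bytes forms srcs[1] = rows 1-2, so each pass filters two
+ // output rows at once.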
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ y -= 2;
+ } while (y != 0);
+ } else if (num_taps == 4) {
+ srcs[4] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+ int y = height;
+ do {
+ srcs[2] = Load4<1>(src, srcs[2]);
+ src += src_stride;
+ srcs[4] = Load4<0>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ y -= 2;
+ } while (y != 0);
+ } else if (num_taps == 6) {
+ srcs[6] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+ srcs[2] = Load4<1>(src, srcs[2]);
+ src += src_stride;
+ srcs[4] = Load4(src);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+ int y = height;
+ do {
+ srcs[4] = Load4<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[6] = Load4<0>(src, srcs[6]);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ y -= 2;
+ } while (y != 0);
+ } else if (num_taps == 8) {
+ srcs[8] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+ srcs[2] = Load4<1>(src, srcs[2]);
+ src += src_stride;
+ srcs[4] = Load4(src);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+ srcs[4] = Load4<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[6] = Load4(src);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+ int y = height;
+ do {
+ srcs[6] = Load4<1>(src, srcs[6]);
+ src += src_stride;
+ srcs[8] = Load4<0>(src, srcs[8]);
+ src += src_stride;
+ srcs[7] = vext_u8(srcs[6], srcs[8], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+template <int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const uint8x8_t* const taps) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ uint8x8_t srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load2<0>(src, srcs[2]);
+ src += src_stride;
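+ // srcs[0] now holds rows 0-3 (2 bytes each); srcs[2] holds row 4 in its
+ // low half. A 2-byte shift forms srcs[1] = rows 1-4, so each pass filters
+ // four output rows at once.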
+ srcs[1] = vext_u8(srcs[0], srcs[2], 2);
+
+ // This uses srcs[0]..srcs[1].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[2];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[4] = Load2<0>(src, srcs[4]);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+ // This uses srcs[0]..srcs[3].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[4] = Load2(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+
+ int y = 0;
+ do {
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+
+ // This uses srcs[0]..srcs[5].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[4] = Load2(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+ int y = 0;
+ do {
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+ srcs[8] = Load2<1>(src, srcs[8]);
+ src += src_stride;
+ srcs[6] = vext_u8(srcs[4], srcs[8], 4);
+ srcs[8] = Load2<2>(src, srcs[8]);
+ src += src_stride;
+ srcs[7] = vext_u8(srcs[4], srcs[8], 6);
+
+ // This uses srcs[0]..srcs[7].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ }
+}
+
+ // This function is a simplified version of Convolve2D_C.
+ // It is used in single-prediction mode, where only vertical filtering is
+ // required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
+void ConvolveVertical_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int vertical_filter_index, const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* const dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ uint8x8_t taps[8];
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ taps[k] =
+ vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+ }
+
+ if (filter_index == 0) { // 6 tap.
+ if (width == 2) {
+ FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else if (width == 4) {
+ FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else {
+ FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+ taps + 1);
+ }
+ } else if ((static_cast<int>(filter_index == 1) &
+ (static_cast<int>(vertical_filter_id == 1) |
+ static_cast<int>(vertical_filter_id == 15))) != 0) { // 5 tap.
+ if (width == 2) {
+ FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else if (width == 4) {
+ FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else {
+ FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
+ taps + 1);
+ }
+ } else if ((static_cast<int>(filter_index == 1) &
+ (static_cast<int>(vertical_filter_id == 7) |
+ static_cast<int>(vertical_filter_id == 8) |
+ static_cast<int>(vertical_filter_id == 9))) !=
+ 0) { // 6 tap with weird negative taps.
+ if (width == 2) {
+ FilterVertical2xH<1,
+ /*negative_outside_taps=*/true>(
+ src, src_stride, dest, dest_stride, height, taps + 1);
+ } else if (width == 4) {
+ FilterVertical4xH<1, /*is_compound=*/false,
+ /*negative_outside_taps=*/true>(
+ src, src_stride, dest, dest_stride, height, taps + 1);
+ } else {
+ FilterVertical<1, /*is_compound=*/false, /*negative_outside_taps=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps + 1);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ if (width == 2) {
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ if (width == 2) {
+ FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
+ taps + 3);
+ } else if (width == 4) {
+ FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
+ taps + 3);
+ } else {
+ FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+ taps + 3);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ // Outside taps are negative.
+ if (width == 2) {
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else if (width == 4) {
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else {
+ FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+ taps + 2);
+ }
+ } else {
+ // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+ // below map to 4 tap filters.
+ assert(filter_index == 5 ||
+ (filter_index == 1 &&
+ (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+ vertical_filter_id == 4 || vertical_filter_id == 5 ||
+ vertical_filter_id == 6 || vertical_filter_id == 10 ||
+ vertical_filter_id == 11 || vertical_filter_id == 12 ||
+ vertical_filter_id == 13 || vertical_filter_id == 14)));
+ // According to GetNumTapsInFilter() this has 6 taps but here we are
+ // treating it as though it has 4.
+ if (filter_index == 1) src += src_stride;
+ if (width == 2) {
+ FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else if (width == 4) {
+ FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else {
+ FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+ taps + 2);
+ }
+ }
+}
+
+void ConvolveCompoundCopy_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const ptrdiff_t src_stride = reference_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ constexpr int final_shift =
+ kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
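+ // With the assumed 8bpp constants kInterRoundBitsVertical == 11 and
+ // kInterRoundBitsCompoundVertical == 7, |final_shift| is 4: compound copy
+ // simply stores each source pixel scaled by 16.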
+
+ if (width >= 16) {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const uint8x16_t v_src = vld1q_u8(&src[x]);
+ const uint16x8_t v_dest_lo =
+ vshll_n_u8(vget_low_u8(v_src), final_shift);
+ const uint16x8_t v_dest_hi =
+ vshll_n_u8(vget_high_u8(v_src), final_shift);
+ vst1q_u16(&dest[x], v_dest_lo);
+ x += 8;
+ vst1q_u16(&dest[x], v_dest_hi);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ dest += width;
+ } while (--y != 0);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const uint8x8_t v_src = vld1_u8(&src[0]);
+ const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+ vst1q_u16(&dest[0], v_dest);
+ src += src_stride;
+ dest += width;
+ } while (--y != 0);
+ } else { // width == 4
+ uint8x8_t v_src = vdup_n_u8(0);
+
+ int y = height;
+ do {
+ v_src = Load4<0>(&src[0], v_src);
+ src += src_stride;
+ v_src = Load4<1>(&src[0], v_src);
+ src += src_stride;
+ const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+ vst1q_u16(&dest[0], v_dest);
+ dest += 4 << 1;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+void ConvolveCompoundVertical_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int vertical_filter_index, const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* const dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
+
+ uint8x8_t taps[8];
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ taps[k] =
+ vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+ }
+
+ if (filter_index == 0) { // 6 tap.
+ if (width == 4) {
+ FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 1);
+ } else {
+ FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 1);
+ }
+ } else if ((static_cast<int>(filter_index == 1) &
+ (static_cast<int>(vertical_filter_id == 1) |
+ static_cast<int>(vertical_filter_id == 15))) != 0) { // 5 tap.
+ if (width == 4) {
+ FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 1);
+ } else {
+ FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 1);
+ }
+ } else if ((static_cast<int>(filter_index == 1) &
+ (static_cast<int>(vertical_filter_id == 7) |
+ static_cast<int>(vertical_filter_id == 8) |
+ static_cast<int>(vertical_filter_id == 9))) !=
+ 0) { // 6 tap with weird negative taps.
+ if (width == 4) {
+ FilterVertical4xH<1, /*is_compound=*/true,
+ /*negative_outside_taps=*/true>(src, src_stride, dest,
+ 4, height, taps + 1);
+ } else {
+ FilterVertical<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(
+ src, src_stride, dest, width, width, height, taps + 1);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ if (width == 4) {
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ if (width == 4) {
+ FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 3);
+ } else {
+ FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 3);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ if (width == 4) {
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 2);
+ } else {
+ FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 2);
+ }
+ } else {
+ // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+ // below map to 4 tap filters.
+ assert(filter_index == 5 ||
+ (filter_index == 1 &&
+ (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+ vertical_filter_id == 4 || vertical_filter_id == 5 ||
+ vertical_filter_id == 6 || vertical_filter_id == 10 ||
+ vertical_filter_id == 11 || vertical_filter_id == 12 ||
+ vertical_filter_id == 13 || vertical_filter_id == 14)));
+ // According to GetNumTapsInFilter() this has 6 taps but here we are
+ // treating it as though it has 4.
+ if (filter_index == 1) src += src_stride;
+ if (width == 4) {
+ FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 2);
+ } else {
+ FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 2);
+ }
+ }
+}
+
+void ConvolveCompoundHorizontal_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const auto* const src =
+ static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* const dest = static_cast<uint16_t*>(prediction);
+
+ DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+ src, reference_stride, dest, width, width, height, horizontal_filter_id,
+ filter_index);
+}
+
+template <int vertical_taps>
+void Compound2DVertical(
+ const uint16_t* LIBGAV1_RESTRICT const intermediate_result, const int width,
+ const int height, const int16x8_t taps,
+ void* LIBGAV1_RESTRICT const prediction) {
+ auto* const dest = static_cast<uint16_t*>(prediction);
+ if (width == 4) {
+ Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
+ intermediate_result, dest, width, height, taps);
+ } else {
+ Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>(
+ intermediate_result, dest, width, width, height, taps);
+ }
+}
+
+void ConvolveCompound2D_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int vertical_filter_index, const int horizontal_filter_id,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t; it is stored in a uint16_t buffer.
+ uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height = height + vertical_taps - 1;
+ const ptrdiff_t src_stride = reference_stride;
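+ // Back |src| up by (vertical_taps / 2 - 1) rows and kHorizontalOffset
+ // columns so both passes see the full filter support for each output pixel.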
+ const auto* const src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
+ DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ assert(vertical_filter_id != 0);
+ const int16x8_t taps = vmovl_s8(
+ vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+ if (vertical_taps == 8) {
+ Compound2DVertical<8>(intermediate_result, width, height, taps, prediction);
+ } else if (vertical_taps == 6) {
+ Compound2DVertical<6>(intermediate_result, width, height, taps, prediction);
+ } else if (vertical_taps == 4) {
+ Compound2DVertical<4>(intermediate_result, width, height, taps, prediction);
+ } else { // |vertical_taps| == 2
+ Compound2DVertical<2>(intermediate_result, width, height, taps, prediction);
+ }
+}
+
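+// Computes the rounded average of |src| and |src + 1|: vrhaddq_u8 yields
+// (a + b + 1) >> 1 per byte, which is the half-pel horizontal filter for
+// intra block copy.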
+inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT const src,
+ uint8_t* LIBGAV1_RESTRICT const dst) {
+ const uint8x16_t left = vld1q_u8(src);
+ const uint8x16_t right = vld1q_u8(src + 1);
+ vst1q_u8(dst, vrhaddq_u8(left, right));
+}
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+
+ int y = height;
+ do {
+ HalfAddHorizontal(src, dst);
+ if (width >= 32) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width >= 64) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width == 128) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyHorizontal_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*subpixel_x*/,
+ const int /*subpixel_y*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+ assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+ assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const uint8x8_t left = vld1_u8(src);
+ const uint8x8_t right = vld1_u8(src + 1);
+ vst1_u8(dest, vrhadd_u8(left, right));
+
+ src += reference_stride;
+ dest += pred_stride;
+ } while (--y != 0);
+ } else { // width == 4
+ uint8x8_t left = vdup_n_u8(0);
+ uint8x8_t right = vdup_n_u8(0);
+ int y = height;
+ do {
+ left = Load4<0>(src, left);
+ right = Load4<0>(src + 1, right);
+ src += reference_stride;
+ left = Load4<1>(src, left);
+ right = Load4<1>(src + 1, right);
+ src += reference_stride;
+
+ const uint8x8_t result = vrhadd_u8(left, right);
+
+ StoreLo4(dest, result);
+ dest += pred_stride;
+ StoreHi4(dest, result);
+ dest += pred_stride;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride, const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+ uint8x16_t row[8], below[8];
+
+ row[0] = vld1q_u8(src);
+ if (width >= 32) {
+ src += 16;
+ row[1] = vld1q_u8(src);
+ if (width >= 64) {
+ src += 16;
+ row[2] = vld1q_u8(src);
+ src += 16;
+ row[3] = vld1q_u8(src);
+ if (width == 128) {
+ src += 16;
+ row[4] = vld1q_u8(src);
+ src += 16;
+ row[5] = vld1q_u8(src);
+ src += 16;
+ row[6] = vld1q_u8(src);
+ src += 16;
+ row[7] = vld1q_u8(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = height;
+ do {
+ below[0] = vld1q_u8(src);
+ if (width >= 32) {
+ src += 16;
+ below[1] = vld1q_u8(src);
+ if (width >= 64) {
+ src += 16;
+ below[2] = vld1q_u8(src);
+ src += 16;
+ below[3] = vld1q_u8(src);
+ if (width == 128) {
+ src += 16;
+ below[4] = vld1q_u8(src);
+ src += 16;
+ below[5] = vld1q_u8(src);
+ src += 16;
+ below[6] = vld1q_u8(src);
+ src += 16;
+ below[7] = vld1q_u8(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ vst1q_u8(dst, vrhaddq_u8(row[0], below[0]));
+ row[0] = below[0];
+ if (width >= 32) {
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[1], below[1]));
+ row[1] = below[1];
+ if (width >= 64) {
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[2], below[2]));
+ row[2] = below[2];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[3], below[3]));
+ row[3] = below[3];
+ if (width == 128) {
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[4], below[4]));
+ row[4] = below[4];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[5], below[5]));
+ row[5] = below[5];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[6], below[6]));
+ row[6] = below[6];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[7], below[7]));
+ row[7] = below[7];
+ }
+ }
+ }
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyVertical_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+ assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+ assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ uint8x8_t row, below;
+ row = vld1_u8(src);
+ src += reference_stride;
+
+ int y = height;
+ do {
+ below = vld1_u8(src);
+ src += reference_stride;
+
+ vst1_u8(dest, vrhadd_u8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ } else { // width == 4
+ uint8x8_t row = Load4(src);
+ uint8x8_t below = vdup_n_u8(0);
+ src += reference_stride;
+
+ int y = height;
+ do {
+ below = Load4<0>(src, below);
+ src += reference_stride;
+
+ StoreLo4(dest, vrhadd_u8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ }
+}
+
+template <int width>
+inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride, const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
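+ // The 2D filter output is the rounded average of four neighbors:
+ // (tl + tr + bl + br + 2) >> 2. Each |row[i]| caches the widened horizontal
+ // pair sums for the row above the one being written, so every source row is
+ // loaded only once.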
+ uint16x8_t row[16];
+ row[0] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width >= 16) {
+ src += 8;
+ row[1] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width >= 32) {
+ src += 8;
+ row[2] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[3] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width >= 64) {
+ src += 8;
+ row[4] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[5] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[6] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[7] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width == 128) {
+ src += 8;
+ row[8] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[9] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[10] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[11] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[12] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[13] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[14] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[15] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = height;
+ do {
+ const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
+ row[0] = below_0;
+ if (width >= 16) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_1 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[1], below_1), 2));
+ row[1] = below_1;
+ if (width >= 32) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_2 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[2], below_2), 2));
+ row[2] = below_2;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_3 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[3], below_3), 2));
+ row[3] = below_3;
+ if (width >= 64) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_4 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[4], below_4), 2));
+ row[4] = below_4;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_5 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[5], below_5), 2));
+ row[5] = below_5;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_6 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[6], below_6), 2));
+ row[6] = below_6;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_7 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[7], below_7), 2));
+ row[7] = below_7;
+ if (width == 128) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_8 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[8], below_8), 2));
+ row[8] = below_8;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_9 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[9], below_9), 2));
+ row[9] = below_9;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_10 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[10], below_10), 2));
+ row[10] = below_10;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_11 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[11], below_11), 2));
+ row[11] = below_11;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_12 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[12], below_12), 2));
+ row[12] = below_12;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_13 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[13], below_13), 2));
+ row[13] = below_13;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_14 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[14], below_14), 2));
+ row[14] = below_14;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_15 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[15], below_15), 2));
+ row[15] = below_15;
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopy2D_NEON(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+ assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+ assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+ // Note: allow vertical access to height + 1. Because this function is only
+ // used for the u/v planes of intra block copy, such access is guaranteed to
+ // be within the prediction block.
+
+ if (width == 128) {
+ IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 8) {
+ IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
+ } else { // width == 4
+ uint8x8_t left = Load4(src);
+ uint8x8_t right = Load4(src + 1);
+ src += reference_stride;
+
+ uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
+
+ int y = height;
+ do {
+ left = Load4<0>(src, left);
+ right = Load4<0>(src + 1, right);
+ src += reference_stride;
+ left = Load4<1>(src, left);
+ right = Load4<1>(src + 1, right);
+ src += reference_stride;
+
+ const uint16x8_t below = vaddl_u8(left, right);
+
+ const uint8x8_t result = vrshrn_n_u16(
+ vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
+ StoreLo4(dest, result);
+ dest += pred_stride;
+ StoreHi4(dest, result);
+ dest += pred_stride;
+
+ row = vget_high_u16(below);
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
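+ // The convolve table is indexed as
+ // [is_intra_block_copy][is_compound][has_vertical_filter]
+ // [has_horizontal_filter]. The [0][0][0][0] (simple copy) entry is not
+ // overridden here.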
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
+ dsp->convolve[0][0][1][1] = Convolve2D_NEON;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;
+
+ dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
+ dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>;
+ dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve. These functions are not thread-safe.
+void ConvolveInit_NEON();
+void ConvolveInit10bpp_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_ConvolveHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Convolve2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_ConvolveCompoundCopy LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveCompoundVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveCompound2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopyHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopyVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopy2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_ConvolveScale2D LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ConvolveCompoundScale2D LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+
+constexpr int kInterPostRoundBit = 4;
+
+namespace low_bitdepth {
+namespace {
+
+inline uint8x8_t ComputeWeightedAverage8(const int16x8_t pred0,
+ const int16x8_t pred1,
+ const int16x8_t weight) {
+ // Given: p0,p1 in range [-5132,9212], and weights w0, w1 with w0 + w1 == 16
+ // Output: (p0 * w0 + p1 * w1 + 128(=rounding bit)) >>
+ // 8(=kInterPostRoundBit + 4)
+ // The formula is manipulated to avoid lengthening to 32 bits.
+ // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1
+ // = (p0 - p1) * w0 + 16 * p1
+ // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808.
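+ // For example, with w0 = 9 (so w1 = 7), p0 = 100, p1 = 50:
+ // p0 * w0 + p1 * w1 = 1250, and (p0 - p1) * w0 + 16 * p1 = 450 + 800 = 1250.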
+ const int16x8_t diff = vsubq_s16(pred0, pred1);
+ // (((p0 - p1) * (w0 << 11) << 1) >> 16) + ((16 * p1) >> 4)
+ const int16x8_t weighted_diff = vqdmulhq_s16(diff, weight);
+ // ((p0 - p1) * w0 >> 4) + p1
+ const int16x8_t upscaled_average = vaddq_s16(weighted_diff, pred1);
+ // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4
+ return vqrshrun_n_s16(upscaled_average, kInterPostRoundBit);
+}
+
+template <int width>
+inline void DistanceWeightedBlendSmall_NEON(
+ const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1, const int height,
+ const int16x8_t weight, void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ constexpr int step = 16 / width;
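+ // Each iteration consumes 16 prediction values, i.e. four rows when
+ // width == 4 and two rows when width == 8, which is why |step| is
+ // 16 / width.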
+
+ int y = height;
+ do {
+ const int16x8_t src_00 = vld1q_s16(prediction_0);
+ const int16x8_t src_10 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
+ const uint8x8_t result0 = ComputeWeightedAverage8(src_00, src_10, weight);
+
+ const int16x8_t src_01 = vld1q_s16(prediction_0);
+ const int16x8_t src_11 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
+ const uint8x8_t result1 = ComputeWeightedAverage8(src_01, src_11, weight);
+
+ if (width == 4) {
+ StoreLo4(dst, result0);
+ dst += dest_stride;
+ StoreHi4(dst, result0);
+ dst += dest_stride;
+ StoreLo4(dst, result1);
+ dst += dest_stride;
+ StoreHi4(dst, result1);
+ dst += dest_stride;
+ } else {
+ assert(width == 8);
+ vst1_u8(dst, result0);
+ dst += dest_stride;
+ vst1_u8(dst, result1);
+ dst += dest_stride;
+ }
+ y -= step;
+ } while (y != 0);
+}
+
+inline void DistanceWeightedBlendLarge_NEON(
+ const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x8_t weight,
+ const int width, const int height, void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const int16x8_t src0_lo = vld1q_s16(prediction_0 + x);
+ const int16x8_t src1_lo = vld1q_s16(prediction_1 + x);
+ const uint8x8_t res_lo =
+ ComputeWeightedAverage8(src0_lo, src1_lo, weight);
+
+ const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8);
+ const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8);
+ const uint8x8_t res_hi =
+ ComputeWeightedAverage8(src0_hi, src1_hi, weight);
+
+ const uint8x16_t result = vcombine_u8(res_lo, res_hi);
+ vst1q_u8(dst + x, result);
+ x += 16;
+ } while (x < width);
+ dst += dest_stride;
+ prediction_0 += width;
+ prediction_1 += width;
+ } while (--y != 0);
+}
+
+inline void DistanceWeightedBlend_NEON(
+ const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1, const uint8_t weight_0,
+ const uint8_t /*weight_1*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ // Upscale the weight for vqdmulh.
+ const int16x8_t weight = vdupq_n_s16(weight_0 << 11);
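+ // (weight_0 << 11) represents weight_0 / 16 in Q15, so the vqdmulhq_s16 in
+ // ComputeWeightedAverage8 evaluates diff * weight_0 >> 4 directly.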
+ if (width == 4) {
+ DistanceWeightedBlendSmall_NEON<4>(pred_0, pred_1, height, weight, dest,
+ dest_stride);
+ return;
+ }
+
+ if (width == 8) {
+ DistanceWeightedBlendSmall_NEON<8>(pred_0, pred_1, height, weight, dest,
+ dest_stride);
+ return;
+ }
+
+ DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weight, width, height, dest,
+ dest_stride);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->distance_weighted_blend = DistanceWeightedBlend_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x4x2_t ComputeWeightedAverage8(const uint16x4x2_t pred0,
+ const uint16x4x2_t pred1,
+ const uint16x4_t weights[2]) {
+ const uint32x4_t wpred0_lo = vmull_u16(weights[0], pred0.val[0]);
+ const uint32x4_t wpred0_hi = vmull_u16(weights[0], pred0.val[1]);
+ const uint32x4_t blended_lo = vmlal_u16(wpred0_lo, weights[1], pred1.val[0]);
+ const uint32x4_t blended_hi = vmlal_u16(wpred0_hi, weights[1], pred1.val[1]);
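+ // Both predictors carry the compound bias; since the two weights sum to 16,
+ // the accumulated bias to remove is kCompoundOffset * 16.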
+ const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16);
+ const int32x4_t res_lo = vsubq_s32(vreinterpretq_s32_u32(blended_lo), offset);
+ const int32x4_t res_hi = vsubq_s32(vreinterpretq_s32_u32(blended_hi), offset);
+ const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1);
+ // Clip the result at (1 << bd) - 1.
+ uint16x4x2_t result;
+ result.val[0] =
+ vmin_u16(vqrshrun_n_s32(res_lo, kInterPostRoundBit + 4), bd_max);
+ result.val[1] =
+ vmin_u16(vqrshrun_n_s32(res_hi, kInterPostRoundBit + 4), bd_max);
+ return result;
+}
+
+inline uint16x4x4_t ComputeWeightedAverage8(const uint16x4x4_t pred0,
+ const uint16x4x4_t pred1,
+ const uint16x4_t weights[2]) {
+ const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16);
+ const uint32x4_t wpred0 = vmull_u16(weights[0], pred0.val[0]);
+ const uint32x4_t wpred1 = vmull_u16(weights[0], pred0.val[1]);
+ const uint32x4_t blended0 = vmlal_u16(wpred0, weights[1], pred1.val[0]);
+ const uint32x4_t blended1 = vmlal_u16(wpred1, weights[1], pred1.val[1]);
+ const int32x4_t res0 = vsubq_s32(vreinterpretq_s32_u32(blended0), offset);
+ const int32x4_t res1 = vsubq_s32(vreinterpretq_s32_u32(blended1), offset);
+ const uint32x4_t wpred2 = vmull_u16(weights[0], pred0.val[2]);
+ const uint32x4_t wpred3 = vmull_u16(weights[0], pred0.val[3]);
+ const uint32x4_t blended2 = vmlal_u16(wpred2, weights[1], pred1.val[2]);
+ const uint32x4_t blended3 = vmlal_u16(wpred3, weights[1], pred1.val[3]);
+ const int32x4_t res2 = vsubq_s32(vreinterpretq_s32_u32(blended2), offset);
+ const int32x4_t res3 = vsubq_s32(vreinterpretq_s32_u32(blended3), offset);
+ const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1);
+ // Clip the result at (1 << bd) - 1.
+ uint16x4x4_t result;
+ result.val[0] =
+ vmin_u16(vqrshrun_n_s32(res0, kInterPostRoundBit + 4), bd_max);
+ result.val[1] =
+ vmin_u16(vqrshrun_n_s32(res1, kInterPostRoundBit + 4), bd_max);
+ result.val[2] =
+ vmin_u16(vqrshrun_n_s32(res2, kInterPostRoundBit + 4), bd_max);
+ result.val[3] =
+ vmin_u16(vqrshrun_n_s32(res3, kInterPostRoundBit + 4), bd_max);
+
+ return result;
+}
+
+// We could use vld1_u16_x2, but for compatibility reasons, use this function
+// instead. The compiler optimizes to the correct instruction.
+inline uint16x4x2_t LoadU16x4_x2(uint16_t const* ptr) {
+ uint16x4x2_t x;
+ // gcc/clang (64 bit) optimizes the following to ldp.
+ x.val[0] = vld1_u16(ptr);
+ x.val[1] = vld1_u16(ptr + 4);
+ return x;
+}
+
+// We could use vld1_u16_x4, but for compatibility reasons, use this function
+// instead. The compiler optimizes to a pair of vld1_u16_x2, which showed better
+// performance in the speed tests.
+inline uint16x4x4_t LoadU16x4_x4(uint16_t const* ptr) {
+ uint16x4x4_t x;
+ x.val[0] = vld1_u16(ptr);
+ x.val[1] = vld1_u16(ptr + 4);
+ x.val[2] = vld1_u16(ptr + 8);
+ x.val[3] = vld1_u16(ptr + 12);
+ return x;
+}
+
+void DistanceWeightedBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const uint8_t weight_0, const uint8_t weight_1,
+ const int width, const int height,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+ const uint16x4_t weights[2] = {vdup_n_u16(weight_0), vdup_n_u16(weight_1)};
+
+ if (width == 4) {
+ int y = height;
+ do {
+ const uint16x4x2_t src0 = LoadU16x4_x2(pred_0);
+ const uint16x4x2_t src1 = LoadU16x4_x2(pred_1);
+ const uint16x4x2_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst, res.val[0]);
+ vst1_u16(dst + dst_stride, res.val[1]);
+ dst += dst_stride << 1;
+ pred_0 += 8;
+ pred_1 += 8;
+ y -= 2;
+ } while (y != 0);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const uint16x4x4_t src0 = LoadU16x4_x4(pred_0);
+ const uint16x4x4_t src1 = LoadU16x4_x4(pred_1);
+ const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst, res.val[0]);
+ vst1_u16(dst + 4, res.val[1]);
+ vst1_u16(dst + dst_stride, res.val[2]);
+ vst1_u16(dst + dst_stride + 4, res.val[3]);
+ dst += dst_stride << 1;
+ pred_0 += 16;
+ pred_1 += 16;
+ y -= 2;
+ } while (y != 0);
+ } else {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const uint16x4x4_t src0 = LoadU16x4_x4(pred_0 + x);
+ const uint16x4x4_t src1 = LoadU16x4_x4(pred_1 + x);
+ const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst + x, res.val[0]);
+ vst1_u16(dst + x + 4, res.val[1]);
+ vst1_u16(dst + x + 8, res.val[2]);
+ vst1_u16(dst + x + 12, res.val[3]);
+ x += 16;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+ }
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->distance_weighted_blend = DistanceWeightedBlend_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void DistanceWeightedBlendInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void DistanceWeightedBlendInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::distance_weighted_blend. This function is not thread-safe.
+void DistanceWeightedBlendInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If NEON is enabled, signal that the NEON implementation should be used
+// instead of normal C.
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// These functions are overloaded for both possible sizes in order to simplify
+// loading from and storing to the intermediate value types within a template
+// function.
+inline int16x8_t GetSignedSource8(const int8_t* src) {
+ return vmovl_s8(vld1_s8(src));
+}
+
+inline int16x8_t GetSignedSource8(const uint8_t* src) {
+ return ZeroExtend(vld1_u8(src));
+}
+
+inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int valid_range) {
+ return ZeroExtend(Load1MsanU8(src, 8 - valid_range));
+}
+
+inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) {
+ vst1_u8(dest, vmovn_u16(data));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+inline int16x8_t GetSignedSource8(const int16_t* src) { return vld1q_s16(src); }
+
+inline int16x8_t GetSignedSource8(const uint16_t* src) {
+ return vreinterpretq_s16_u16(vld1q_u16(src));
+}
+
+inline int16x8_t GetSignedSource8Msan(const uint16_t* src, int valid_range) {
+ return vreinterpretq_s16_u16(Load1QMsanU16(src, 16 - valid_range));
+}
+
+inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) {
+ vst1q_u16(dest, data);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Each element in |sum| holds the running autoregressive sum for one
+// destination value. The fixed source values in |grain_lo| and |grain_hi|
+// allow for a sliding window in successive calls to this function.
+template <int position_offset>
+inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
+ const int16x8_t grain_hi,
+ int16_t coeff, int32x4x2_t sum) {
+ const int16x8_t grain = vextq_s16(grain_lo, grain_hi, position_offset);
+ sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(grain), coeff);
+ sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(grain), coeff);
+ return sum;
+}
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int8_t* LIBGAV1_RESTRICT grain_cursor,
+ int32x4x2_t sum,
+ const int8_t* LIBGAV1_RESTRICT coeffs,
+ int pos, int shift) {
+ int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+ for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+ result += grain_cursor[lane + delta_col] * coeffs[pos];
+ ++pos;
+ }
+ grain_cursor[lane] =
+ Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int16_t* LIBGAV1_RESTRICT grain_cursor,
+ int32x4x2_t sum,
+ const int8_t* LIBGAV1_RESTRICT coeffs,
+ int pos, int shift) {
+ int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+ for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+ result += grain_cursor[lane + delta_col] * coeffs[pos];
+ ++pos;
+ }
+ grain_cursor[lane] =
+ Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(
+ int8_t* LIBGAV1_RESTRICT u_grain_cursor,
+ int8_t* LIBGAV1_RESTRICT v_grain_cursor, int32x4x2_t sum_u,
+ int32x4x2_t sum_v, const int8_t* LIBGAV1_RESTRICT coeffs_u,
+ const int8_t* LIBGAV1_RESTRICT coeffs_v, int pos, int shift) {
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ u_grain_cursor, sum_u, coeffs_u, pos, shift);
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(
+ int16_t* LIBGAV1_RESTRICT u_grain_cursor,
+ int16_t* LIBGAV1_RESTRICT v_grain_cursor, int32x4x2_t sum_u,
+ int32x4x2_t sum_v, const int8_t* LIBGAV1_RESTRICT coeffs_u,
+ const int8_t* LIBGAV1_RESTRICT coeffs_v, int pos, int shift) {
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ u_grain_cursor, sum_u, coeffs_u, pos, shift);
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+inline void SetZero(int32x4x2_t* v) {
+ v->val[0] = vdupq_n_s32(0);
+ v->val[1] = vdupq_n_s32(0);
+}
+
+// Computes subsampled luma for use with chroma, by averaging in the x direction
+// or y direction when applicable.
+int16x8_t GetSubsampledLuma(const int8_t* const luma, int subsampling_x,
+ int subsampling_y, ptrdiff_t stride) {
+ if (subsampling_y != 0) {
+ assert(subsampling_x != 0);
+ const int8x16_t src0 = vld1q_s8(luma);
+ const int8x16_t src1 = vld1q_s8(luma + stride);
+ const int16x8_t ret0 = vcombine_s16(vpaddl_s8(vget_low_s8(src0)),
+ vpaddl_s8(vget_high_s8(src0)));
+ const int16x8_t ret1 = vcombine_s16(vpaddl_s8(vget_low_s8(src1)),
+ vpaddl_s8(vget_high_s8(src1)));
+ return vrshrq_n_s16(vaddq_s16(ret0, ret1), 2);
+ }
+ if (subsampling_x != 0) {
+ const int8x16_t src = vld1q_s8(luma);
+ return vrshrq_n_s16(
+ vcombine_s16(vpaddl_s8(vget_low_s8(src)), vpaddl_s8(vget_high_s8(src))),
+ 1);
+ }
+ return vmovl_s8(vld1_s8(luma));
+}
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
+ if (subsampling_x != 0) {
+ const uint8x16_t src = vld1q_u8(luma);
+ return vrshrq_n_u16(vpaddlq_u8(src), 1);
+ }
+ return vmovl_u8(vld1_u8(luma));
+}
+
+inline uint16x8_t GetAverageLumaMsan(const uint8_t* const luma,
+ int subsampling_x, int valid_range) {
+ if (subsampling_x != 0) {
+ const uint8x16_t src = MaskOverreadsQ(vld1q_u8(luma), 16 - valid_range);
+ // MemorySanitizer registers vpaddlq_u8 as a use of the memory.
+ return vrshrq_n_u16(vpaddlq_u8(src), 1);
+ }
+ return MaskOverreadsQ(vmovl_u8(vld1_u8(luma)), 16 - valid_range);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// Computes subsampled luma for use with chroma, by averaging in the x direction
+// or y direction when applicable.
+int16x8_t GetSubsampledLuma(const int16_t* const luma, int subsampling_x,
+ int subsampling_y, ptrdiff_t stride) {
+ if (subsampling_y != 0) {
+ assert(subsampling_x != 0);
+ int16x8_t src0_lo = vld1q_s16(luma);
+ int16x8_t src0_hi = vld1q_s16(luma + 8);
+ const int16x8_t src1_lo = vld1q_s16(luma + stride);
+ const int16x8_t src1_hi = vld1q_s16(luma + stride + 8);
+ const int16x8_t src0 =
+ vcombine_s16(vpadd_s16(vget_low_s16(src0_lo), vget_high_s16(src0_lo)),
+ vpadd_s16(vget_low_s16(src0_hi), vget_high_s16(src0_hi)));
+ const int16x8_t src1 =
+ vcombine_s16(vpadd_s16(vget_low_s16(src1_lo), vget_high_s16(src1_lo)),
+ vpadd_s16(vget_low_s16(src1_hi), vget_high_s16(src1_hi)));
+ return vrshrq_n_s16(vaddq_s16(src0, src1), 2);
+ }
+ if (subsampling_x != 0) {
+ const int16x8_t src_lo = vld1q_s16(luma);
+ const int16x8_t src_hi = vld1q_s16(luma + 8);
+ const int16x8_t ret =
+ vcombine_s16(vpadd_s16(vget_low_s16(src_lo), vget_high_s16(src_lo)),
+ vpadd_s16(vget_low_s16(src_hi), vget_high_s16(src_hi)));
+ return vrshrq_n_s16(ret, 1);
+ }
+ return vld1q_s16(luma);
+}
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline uint16x8_t GetAverageLuma(const uint16_t* const luma,
+ int subsampling_x) {
+ if (subsampling_x != 0) {
+ const uint16x8x2_t src = vld2q_u16(luma);
+ return vrhaddq_u16(src.val[0], src.val[1]);
+ }
+ return vld1q_u16(luma);
+}
+
+inline uint16x8_t GetAverageLumaMsan(const uint16_t* const luma,
+ int subsampling_x, int valid_range) {
+ if (subsampling_x != 0) {
+ const uint16x8x2_t src = vld2q_u16(luma);
+ const uint16x8_t result = vrhaddq_u16(src.val[0], src.val[1]);
+ return MaskOverreadsQ(result, 16 - valid_range);
+ }
+ return Load1QMsanU16(luma, 16 - valid_range);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
+ bool use_luma>
+void ApplyAutoRegressiveFilterToChromaGrains_NEON(
+ const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT luma_grain_buffer, int subsampling_x,
+ int subsampling_y, void* LIBGAV1_RESTRICT u_grain_buffer,
+ void* LIBGAV1_RESTRICT v_grain_buffer) {
+ static_assert(auto_regression_coeff_lag <= 3, "Invalid autoregression lag.");
+ const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
+ auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
+ auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
+ const int auto_regression_shift = params.auto_regression_shift;
+ const int chroma_width =
+ (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+ const int chroma_height =
+ (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
+ // When |chroma_width| == 44, we write 8 at a time from x in [3, 34],
+ // leaving [35, 40] to write at the end.
+ const int chroma_width_remainder =
+ (chroma_width - 2 * kAutoRegressionBorder) & 7;
+
+ int y = kAutoRegressionBorder;
+ luma_grain += kLumaWidth * y;
+ u_grain += chroma_width * y;
+ v_grain += chroma_width * y;
+ do {
+ // Each row is computed 8 values at a time in the following loop. At the end
+ // of the loop, |chroma_width_remainder| (4 or 6) values remain to write;
+ // they are given a reduced iteration at the end.
+ int x = kAutoRegressionBorder;
+ int luma_x = kAutoRegressionBorder;
+ do {
+ int pos = 0;
+ int32x4x2_t sum_u;
+ int32x4x2_t sum_v;
+ SetZero(&sum_u);
+ SetZero(&sum_v);
+
+ if (auto_regression_coeff_lag > 0) {
+ for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+ ++delta_row) {
+ // These loads may overflow to the next row, but they are never called
+ // on the final row of a grain block. Therefore, they will never
+ // exceed the block boundaries.
+ // Note: this could be slightly optimized to a single load in 8bpp,
+ // but requires making a special first iteration and accumulate
+ // function that takes an int8x16_t.
+ const int16x8_t u_grain_lo =
+ GetSignedSource8(u_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag);
+ const int16x8_t u_grain_hi =
+ GetSignedSource8(u_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag + 8);
+ const int16x8_t v_grain_lo =
+ GetSignedSource8(v_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag);
+ const int16x8_t v_grain_hi =
+ GetSignedSource8(v_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag + 8);
+#define ACCUMULATE_WEIGHTED_GRAIN(offset) \
+ sum_u = AccumulateWeightedGrain<offset>( \
+ u_grain_lo, u_grain_hi, params.auto_regression_coeff_u[pos], sum_u); \
+ sum_v = AccumulateWeightedGrain<offset>( \
+ v_grain_lo, v_grain_hi, params.auto_regression_coeff_v[pos++], sum_v)
+
+ ACCUMULATE_WEIGHTED_GRAIN(0);
+ ACCUMULATE_WEIGHTED_GRAIN(1);
+ ACCUMULATE_WEIGHTED_GRAIN(2);
+ // The horizontal |auto_regression_coeff_lag| loop is replaced with
+ // if-statements to give vextq_s16 an immediate param.
+ if (auto_regression_coeff_lag > 1) {
+ ACCUMULATE_WEIGHTED_GRAIN(3);
+ ACCUMULATE_WEIGHTED_GRAIN(4);
+ }
+ if (auto_regression_coeff_lag > 2) {
+ assert(auto_regression_coeff_lag == 3);
+ ACCUMULATE_WEIGHTED_GRAIN(5);
+ ACCUMULATE_WEIGHTED_GRAIN(6);
+ }
+ }
+ }
+
+ if (use_luma) {
+ const int16x8_t luma = GetSubsampledLuma(
+ luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);
+
+ // Luma samples get the final coefficient in the formula, but are best
+ // computed all at once before the final row.
+ const int coeff_u =
+ params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
+ const int coeff_v =
+ params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];
+
+ sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
+ sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
+ sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
+ sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
+ }
+ // At this point in the filter, the source addresses and destination
+ // addresses overlap. Because this is an auto-regressive filter, the
+ // higher lanes cannot be computed without the results of the lower lanes.
+ // Each call to WriteFinalAutoRegression incorporates preceding values
+ // on the final row, and writes a single sample. This allows the next
+ // pixel's value to be computed in the next call.
+#define WRITE_AUTO_REGRESSION_RESULT(lane) \
+ WriteFinalAutoRegressionChroma<bitdepth, auto_regression_coeff_lag, lane>( \
+ u_grain + x, v_grain + x, sum_u, sum_v, params.auto_regression_coeff_u, \
+ params.auto_regression_coeff_v, pos, auto_regression_shift)
+
+ WRITE_AUTO_REGRESSION_RESULT(0);
+ WRITE_AUTO_REGRESSION_RESULT(1);
+ WRITE_AUTO_REGRESSION_RESULT(2);
+ WRITE_AUTO_REGRESSION_RESULT(3);
+ WRITE_AUTO_REGRESSION_RESULT(4);
+ WRITE_AUTO_REGRESSION_RESULT(5);
+ WRITE_AUTO_REGRESSION_RESULT(6);
+ WRITE_AUTO_REGRESSION_RESULT(7);
+
+ x += 8;
+ luma_x += 8 << subsampling_x;
+ } while (x < chroma_width - kAutoRegressionBorder - chroma_width_remainder);
+
+ // This is the "final iteration" of the above loop over width. We fill in
+ // the remainder of the width, which is less than 8.
+ int pos = 0;
+ int32x4x2_t sum_u;
+ int32x4x2_t sum_v;
+ SetZero(&sum_u);
+ SetZero(&sum_v);
+
+ for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+ ++delta_row) {
+ // These loads may overflow to the next row, but they are never called on
+ // the final row of a grain block. Therefore, they will never exceed the
+ // block boundaries.
+ const int16x8_t u_grain_lo = GetSignedSource8(
+ u_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
+ const int16x8_t u_grain_hi =
+ GetSignedSource8(u_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag + 8);
+ const int16x8_t v_grain_lo = GetSignedSource8(
+ v_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
+ const int16x8_t v_grain_hi =
+ GetSignedSource8(v_grain + x + delta_row * chroma_width -
+ auto_regression_coeff_lag + 8);
+
+ ACCUMULATE_WEIGHTED_GRAIN(0);
+ ACCUMULATE_WEIGHTED_GRAIN(1);
+ ACCUMULATE_WEIGHTED_GRAIN(2);
+ // The horizontal |auto_regression_coeff_lag| loop is replaced with
+ // if-statements to give vextq_s16 an immediate param.
+ if (auto_regression_coeff_lag > 1) {
+ ACCUMULATE_WEIGHTED_GRAIN(3);
+ ACCUMULATE_WEIGHTED_GRAIN(4);
+ }
+ if (auto_regression_coeff_lag > 2) {
+ assert(auto_regression_coeff_lag == 3);
+ ACCUMULATE_WEIGHTED_GRAIN(5);
+ ACCUMULATE_WEIGHTED_GRAIN(6);
+ }
+ }
+
+ if (use_luma) {
+ const int16x8_t luma = GetSubsampledLuma(
+ luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);
+
+ // Luma samples get the final coefficient in the formula, but are best
+ // computed all at once before the final row.
+ const int coeff_u =
+ params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
+ const int coeff_v =
+ params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];
+
+ sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
+ sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
+ sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
+ sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
+ }
+
+ WRITE_AUTO_REGRESSION_RESULT(0);
+ WRITE_AUTO_REGRESSION_RESULT(1);
+ WRITE_AUTO_REGRESSION_RESULT(2);
+ WRITE_AUTO_REGRESSION_RESULT(3);
+ if (chroma_width_remainder == 6) {
+ WRITE_AUTO_REGRESSION_RESULT(4);
+ WRITE_AUTO_REGRESSION_RESULT(5);
+ }
+
+ luma_grain += kLumaWidth << subsampling_y;
+ u_grain += chroma_width;
+ v_grain += chroma_width;
+ } while (++y < chroma_height);
+#undef ACCUMULATE_WEIGHTED_GRAIN
+#undef WRITE_AUTO_REGRESSION_RESULT
+}
+
+// Applies an auto-regressive filter to the white noise in luma_grain.
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag>
+void ApplyAutoRegressiveFilterToLumaGrain_NEON(const FilmGrainParams& params,
+ void* luma_grain_buffer) {
+ static_assert(auto_regression_coeff_lag > 0, "");
+ const int8_t* const auto_regression_coeff_y = params.auto_regression_coeff_y;
+ const uint8_t auto_regression_shift = params.auto_regression_shift;
+
+ int y = kAutoRegressionBorder;
+ auto* luma_grain =
+ static_cast<GrainType*>(luma_grain_buffer) + kLumaWidth * y;
+ do {
+ // Each row is computed 8 values at a time in the following loop. At the
+ // end of the loop, 4 values remain to write. They are given a special
+ // reduced iteration at the end.
+ int x = kAutoRegressionBorder;
+ do {
+ int pos = 0;
+ int32x4x2_t sum;
+ SetZero(&sum);
+ for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+ ++delta_row) {
+ // These loads may overflow to the next row, but they are never called
+ // on the final row of a grain block. Therefore, they will never exceed
+ // the block boundaries.
+ const int16x8_t src_grain_lo =
+ GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+ auto_regression_coeff_lag);
+ const int16x8_t src_grain_hi =
+ GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+ auto_regression_coeff_lag + 8);
+
+ // A pictorial representation of the auto-regressive filter for
+ // various values of params.auto_regression_coeff_lag. The letter 'O'
+ // represents the current sample. (The filter always operates on the
+ // current sample with filter coefficient 1.) The letters 'X'
+ // represent the neighboring samples that the filter operates on, below
+ // their corresponding "offset" number.
+ //
+ // params.auto_regression_coeff_lag == 3:
+ // 0 1 2 3 4 5 6
+ // X X X X X X X
+ // X X X X X X X
+ // X X X X X X X
+ // X X X O
+ // params.auto_regression_coeff_lag == 2:
+ // 0 1 2 3 4
+ // X X X X X
+ // X X X X X
+ // X X O
+ // params.auto_regression_coeff_lag == 1:
+ // 0 1 2
+ // X X X
+ // X O
+ // params.auto_regression_coeff_lag == 0:
+ // O
+ // The function relies on the caller to skip the call in the 0 lag
+ // case.
+
+#define ACCUMULATE_WEIGHTED_GRAIN(offset) \
+ sum = AccumulateWeightedGrain<offset>(src_grain_lo, src_grain_hi, \
+ auto_regression_coeff_y[pos++], sum)
+ ACCUMULATE_WEIGHTED_GRAIN(0);
+ ACCUMULATE_WEIGHTED_GRAIN(1);
+ ACCUMULATE_WEIGHTED_GRAIN(2);
+ // The horizontal |auto_regression_coeff_lag| loop is replaced with
+ // if-statements to give vextq_s16 an immediate param.
+ if (auto_regression_coeff_lag > 1) {
+ ACCUMULATE_WEIGHTED_GRAIN(3);
+ ACCUMULATE_WEIGHTED_GRAIN(4);
+ }
+ if (auto_regression_coeff_lag > 2) {
+ assert(auto_regression_coeff_lag == 3);
+ ACCUMULATE_WEIGHTED_GRAIN(5);
+ ACCUMULATE_WEIGHTED_GRAIN(6);
+ }
+ }
+ // At this point in the filter, the source addresses and destination
+ // addresses overlap. Because this is an auto-regressive filter, the
+ // higher lanes cannot be computed without the results of the lower lanes.
+ // Each call to WriteFinalAutoRegression incorporates preceding values
+ // on the final row, and writes a single sample. This allows the next
+ // pixel's value to be computed in the next call.
+#define WRITE_AUTO_REGRESSION_RESULT(lane) \
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( \
+ luma_grain + x, sum, auto_regression_coeff_y, pos, \
+ auto_regression_shift)
+
+ WRITE_AUTO_REGRESSION_RESULT(0);
+ WRITE_AUTO_REGRESSION_RESULT(1);
+ WRITE_AUTO_REGRESSION_RESULT(2);
+ WRITE_AUTO_REGRESSION_RESULT(3);
+ WRITE_AUTO_REGRESSION_RESULT(4);
+ WRITE_AUTO_REGRESSION_RESULT(5);
+ WRITE_AUTO_REGRESSION_RESULT(6);
+ WRITE_AUTO_REGRESSION_RESULT(7);
+ x += 8;
+ // Leave the final four pixels for the special iteration below.
+ } while (x < kLumaWidth - kAutoRegressionBorder - 4);
+
+ // Final 4 pixels in the row.
+ int pos = 0;
+ int32x4x2_t sum;
+ SetZero(&sum);
+ for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+ ++delta_row) {
+ const int16x8_t src_grain_lo = GetSignedSource8(
+ luma_grain + x + delta_row * kLumaWidth - auto_regression_coeff_lag);
+ const int16x8_t src_grain_hi =
+ GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+ auto_regression_coeff_lag + 8);
+
+ ACCUMULATE_WEIGHTED_GRAIN(0);
+ ACCUMULATE_WEIGHTED_GRAIN(1);
+ ACCUMULATE_WEIGHTED_GRAIN(2);
+ // The horizontal |auto_regression_coeff_lag| loop is replaced with
+ // if-statements to give vextq_s16 an immediate param.
+ if (auto_regression_coeff_lag > 1) {
+ ACCUMULATE_WEIGHTED_GRAIN(3);
+ ACCUMULATE_WEIGHTED_GRAIN(4);
+ }
+ if (auto_regression_coeff_lag > 2) {
+ assert(auto_regression_coeff_lag == 3);
+ ACCUMULATE_WEIGHTED_GRAIN(5);
+ ACCUMULATE_WEIGHTED_GRAIN(6);
+ }
+ }
+ // delta_row == 0
+ WRITE_AUTO_REGRESSION_RESULT(0);
+ WRITE_AUTO_REGRESSION_RESULT(1);
+ WRITE_AUTO_REGRESSION_RESULT(2);
+ WRITE_AUTO_REGRESSION_RESULT(3);
+ luma_grain += kLumaWidth;
+ } while (++y < kLumaHeight);
+
+#undef WRITE_AUTO_REGRESSION_RESULT
+#undef ACCUMULATE_WEIGHTED_GRAIN
+}
+
+template <int bitdepth>
+void InitializeScalingLookupTable_NEON(int num_points,
+ const uint8_t point_value[],
+ const uint8_t point_scaling[],
+ int16_t* scaling_lut,
+ const int scaling_lut_length) {
+ static_assert(bitdepth < kBitdepth12,
+ "NEON Scaling lookup table only supports 8bpp and 10bpp.");
+ if (num_points == 0) {
+ memset(scaling_lut, 0, sizeof(scaling_lut[0]) * scaling_lut_length);
+ return;
+ }
+ static_assert(sizeof(scaling_lut[0]) == 2, "");
+ Memset(scaling_lut, point_scaling[0],
+ (static_cast<int>(point_value[0]) + 1) << (bitdepth - kBitdepth8));
+ const int32x4_t steps = vmovl_s16(vcreate_s16(0x0003000200010000));
+ const int32x4_t rounding = vdupq_n_s32(32768);
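+ // |steps| holds {0, 1, 2, 3}; |rounding| is 0.5 in the Q16 fixed-point
+ // representation used for the interpolation below.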
+ for (int i = 0; i < num_points - 1; ++i) {
+ const int delta_y = point_scaling[i + 1] - point_scaling[i];
+ const int delta_x = point_value[i + 1] - point_value[i];
+ // |delta| corresponds to b, for the function y = a + b*x.
+ const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
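+ // For example, delta_y = 10 and delta_x = 4 give
+ // delta = 10 * (65538 / 4) = 163840, i.e. a slope of 2.5 in Q16.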
+ const int delta4 = delta << 2;
+ // vmull_n_u16 will not work here because |delta| typically exceeds the
+ // range of uint16_t.
+ int32x4_t upscaled_points0 = vmlaq_n_s32(rounding, steps, delta);
+ const int32x4_t line_increment4 = vdupq_n_s32(delta4);
+ // Get the second set of 4 points by adding 4 steps to the first set.
+ int32x4_t upscaled_points1 = vaddq_s32(upscaled_points0, line_increment4);
+ // We obtain the next set of 8 points by adding 8 steps to each of the
+ // current 8 points.
+ const int32x4_t line_increment8 = vshlq_n_s32(line_increment4, 1);
+ const int16x8_t base_point = vdupq_n_s16(point_scaling[i]);
+ int x = 0;
+ // Derive and write 8 values (or 32 values, for 10bpp).
+ do {
+ const int16x4_t interp_points0 = vshrn_n_s32(upscaled_points0, 16);
+ const int16x4_t interp_points1 = vshrn_n_s32(upscaled_points1, 16);
+ const int16x8_t interp_points =
+ vcombine_s16(interp_points0, interp_points1);
+ // The spec guarantees that the max value of |point_value[i]| + x is 255.
+ // Writing 8 values starting at the final table byte leaves 7 values of
+ // required padding.
+ const int16x8_t full_interp = vaddq_s16(interp_points, base_point);
+ const int x_base = (point_value[i] + x) << (bitdepth - kBitdepth8);
+ if (bitdepth == kBitdepth10) {
+ const int16x8_t next_val = vaddq_s16(
+ base_point,
+ vdupq_n_s16((vgetq_lane_s32(upscaled_points1, 3) + delta) >> 16));
+ const int16x8_t start = full_interp;
+ const int16x8_t end = vextq_s16(full_interp, next_val, 1);
+ // lut[i << 2] = start;
+ // lut[(i << 2) + 1] = start + RightShiftWithRounding(end - start, 2)
+ // lut[(i << 2) + 2] = start +
+ //                     RightShiftWithRounding(2 * (end - start), 2)
+ // lut[(i << 2) + 3] = start +
+ //                     RightShiftWithRounding(3 * (end - start), 2)
+ const int16x8_t delta = vsubq_s16(end, start);
+ const int16x8_t double_delta = vshlq_n_s16(delta, 1);
+ const int16x8_t delta2 = vrshrq_n_s16(double_delta, 2);
+ const int16x8_t delta3 =
+ vrshrq_n_s16(vaddq_s16(delta, double_delta), 2);
+ const int16x8x4_t result = {
+ start, vaddq_s16(start, vrshrq_n_s16(delta, 2)),
+ vaddq_s16(start, delta2), vaddq_s16(start, delta3)};
+ Store4QMsanS16(&scaling_lut[x_base], result);
+ } else {
+ vst1q_s16(&scaling_lut[x_base], full_interp);
+ }
+ upscaled_points0 = vaddq_s32(upscaled_points0, line_increment8);
+ upscaled_points1 = vaddq_s32(upscaled_points1, line_increment8);
+ x += 8;
+ } while (x < delta_x);
+ }
+ const int16_t last_point_value = point_value[num_points - 1];
+ const int x_base = last_point_value << (bitdepth - kBitdepth8);
+ Memset(&scaling_lut[x_base], point_scaling[num_points - 1],
+ scaling_lut_length - x_base);
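+ // The loop above extrapolates each group's last 3 expanded entries along
+ // the segment slope, which can overshoot at the final point; re-derive them
+ // so they ramp exactly to point_scaling[num_points - 1].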
+ if (bitdepth == kBitdepth10 && x_base > 0) {
+ const int start = scaling_lut[x_base - 4];
+ const int end = point_scaling[num_points - 1];
+ const int delta = end - start;
+ scaling_lut[x_base - 3] = start + RightShiftWithRounding(delta, 2);
+ scaling_lut[x_base - 2] = start + RightShiftWithRounding(2 * delta, 2);
+ scaling_lut[x_base - 1] = start + RightShiftWithRounding(3 * delta, 2);
+ }
+}
+
+inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
+ const int16x8_t high) {
+ const int16x8_t clipped_to_ceiling = vminq_s16(high, value);
+ return vmaxq_s16(low, clipped_to_ceiling);
+}
+
+template <int bitdepth, typename Pixel>
+inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
+ const Pixel* source,
+ const int valid_range = 8) {
+ int16_t start_vals[8];
+ static_assert(bitdepth <= kBitdepth10,
+ "NEON Film Grain is not yet implemented for 12bpp.");
+#if LIBGAV1_MSAN
+ if (valid_range < 8) memset(start_vals, 0, sizeof(start_vals));
+#endif
+ for (int i = 0; i < valid_range; ++i) {
+ assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8)));
+ start_vals[i] = scaling_lut[source[i]];
+ }
+ return vld1q_s16(start_vals);
+}
+
+template <int bitdepth>
+inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling,
+ const int16x8_t scaling_shift_vect) {
+ if (bitdepth == kBitdepth8) {
+ const int16x8_t upscaled_noise = vmulq_s16(noise, scaling);
+ return vrshlq_s16(upscaled_noise, scaling_shift_vect);
+ }
+ // Scaling shift is in the range [8, 11]. The doubling multiply returning high
+ // half is equivalent to a right shift by 15, so |scaling_shift_vect| should
+ // provide a left shift equal to 15 - s, where s is the original shift
+ // parameter.
+ const int16x8_t scaling_up = vshlq_s16(scaling, scaling_shift_vect);
+ return vqrdmulhq_s16(noise, scaling_up);
+}
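+// A scalar sketch (illustrative only, not part of the library) of why the
+// 10bpp path of ScaleNoise above matches the plain shift of the 8bpp path:
+// vqrdmulhq_s16 computes a saturating RightShiftWithRounding(2 * a * b, 16),
+// so with s = scaling_shift in [8, 11],
+//   vqrdmulhq_s16(noise, scaling << (15 - s))
+//       == RightShiftWithRounding(noise * scaling * (1 << (16 - s)), 16)
+//       == RightShiftWithRounding(noise * scaling, s)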
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_NEON(
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
+ int scaling_shift, int width, int height, int start_height,
+ const int16_t* scaling_lut_y, const void* source_plane_y,
+ ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
+ dest_stride_y /= sizeof(Pixel);
+ const int16x8_t floor = vdupq_n_s16(min_value);
+ const int16x8_t ceiling = vdupq_n_s16(max_luma);
+ // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+ // for 16 bit signed integers, so the scaling shift is applied directly. In
+ // 10bpp the product can overflow int16_t, so ScaleNoise instead uses a
+ // rounding doubling multiply returning the high half (a right shift by 15),
+ // with the vector below carrying the compensating left shift of
+ // 15 - scaling_shift.
+ const int16x8_t scaling_shift_vect = vdupq_n_s16(
+ (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift);
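+ // vrshlq_s16 shifts left by a signed per-lane amount, so the negated 8bpp
+ // value yields a rounding right shift by scaling_shift.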
+
+ const int safe_width = width & ~15;
+ int y = 0;
+ do {
+ int x = 0;
+ for (; x + 8 <= safe_width; x += 8) {
+ // This operation on the unsigned input is safe in 8bpp because the vector
+ // is widened before it is reinterpreted.
+ const int16x8_t orig0 = GetSignedSource8(&in_y_row[x]);
+ const int16x8_t scaling0 =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+ int16x8_t noise =
+ GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+ noise = ScaleNoise<bitdepth>(noise, scaling0, scaling_shift_vect);
+ const int16x8_t combined0 = vaddq_s16(orig0, noise);
+ // In 8bpp, when params_.clip_to_restricted_range == false, clipping could
+ // be replaced with vqmovun_s16, but the gain would be very small and is
+ // unlikely to justify copying the function for just that case.
+ StoreUnsigned8(&out_y_row[x],
+ vreinterpretq_u16_s16(Clip3(combined0, floor, ceiling)));
+ x += 8;
+
+ // This operation on the unsigned input is safe in 8bpp because the vector
+ // is widened before it is reinterpreted.
+ const int16x8_t orig1 = GetSignedSource8(&in_y_row[x]);
+ const int16x8_t scaling1 =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+ noise = GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+ noise = ScaleNoise<bitdepth>(noise, scaling1, scaling_shift_vect);
+ const int16x8_t combined1 = vaddq_s16(orig1, noise);
+ // In 8bpp, when params_.clip_to_restricted_range == false, clipping could
+ // be replaced with vqmovun_s16, but the gain would be very small and is
+ // unlikely to justify copying the function for just that case.
+ StoreUnsigned8(&out_y_row[x],
+ vreinterpretq_u16_s16(Clip3(combined1, floor, ceiling)));
+ }
+
+ if (x < width) {
+ assert(width - x < 16);
+ if (x < width - 8) {
+ const int16x8_t orig = GetSignedSource8(&in_y_row[x]);
+ const int16x8_t scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+ int16x8_t noise =
+ GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+ noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
+ const int16x8_t combined = vaddq_s16(orig, noise);
+ // In 8bpp, when params_.clip_to_restricted_range == false, we could
+ // replace clipping with vqmovun_s16, but the gain would be very small and
+ // is unlikely to justify copying the function for just that case.
+ StoreUnsigned8(&out_y_row[x],
+ vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
+ x += 8;
+ }
+ const int valid_range_pixels = width - x;
+ const int valid_range_bytes = (width - x) * sizeof(in_y_row[0]);
+ const int16x8_t orig =
+ GetSignedSource8Msan(&in_y_row[x], valid_range_bytes);
+ const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>(
+ scaling_lut_y, &in_y_row[x], valid_range_pixels);
+ int16x8_t noise =
+ GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+ noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
+
+ const int16x8_t combined = vaddq_s16(orig, noise);
+ StoreUnsigned8(&out_y_row[x],
+ vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
+ }
+ in_y_row += source_stride_y;
+ out_y_row += dest_stride_y;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+inline int16x8_t BlendChromaValsWithCfl(
+ const Pixel* LIBGAV1_RESTRICT chroma_cursor,
+ const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
+ const int16x8_t scaling, const int16x8_t scaling_shift_vect) {
+ const int16x8_t orig = GetSignedSource8(chroma_cursor);
+ int16x8_t noise = GetSignedSource8(noise_image_cursor);
+ noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
+ return vaddq_s16(orig, noise);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
+ const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, int scaling_shift,
+ const int16_t* LIBGAV1_RESTRICT scaling_lut,
+ const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+ const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma,
+ Pixel* out_chroma_row, ptrdiff_t dest_stride) {
+ const int16x8_t floor = vdupq_n_s16(min_value);
+ const int16x8_t ceiling = vdupq_n_s16(max_chroma);
+ Pixel luma_buffer[16];
+ // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+ // for 16 bit signed integers, so the scaling shift is applied directly. In
+ // 10bpp the product can overflow int16_t, so ScaleNoise instead uses a
+ // rounding doubling multiply returning the high half (a right shift by 15),
+ // with the vector below carrying the compensating left shift of
+ // 15 - scaling_shift.
+ const int16x8_t scaling_shift_vect = vdupq_n_s16(
+ (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift);
+
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ const int safe_chroma_width = chroma_width & ~7;
+
+ // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
+ // in GetScalingFactors.
+ Pixel average_luma_buffer[8];
+ assert(start_height % 2 == 0);
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ for (; x + 8 <= safe_chroma_width; x += 8) {
+ const int luma_x = x << subsampling_x;
+ const uint16x8_t average_luma =
+ GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+ StoreUnsigned8(average_luma_buffer, average_luma);
+
+ const int16x8_t scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
+ const int16x8_t blended =
+ BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+ &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling,
+ scaling_shift_vect);
+
+ // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+ // clipping with vqmovun_s16, but it's not likely to be worth copying the
+ // function for just that case.
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+ }
+
+ if (x < chroma_width) {
+ const int luma_x = x << subsampling_x;
+ const int valid_range_pixels = width - luma_x;
+ const int valid_range_chroma_pixels = chroma_width - x;
+ const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+ assert(valid_range_pixels < 16);
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
+ luma_buffer[valid_range_pixels] = in_y_row[width - 1];
+ const uint16x8_t average_luma = GetAverageLumaMsan(
+ luma_buffer, subsampling_x, valid_range_chroma_pixels << 1);
+
+ StoreUnsigned8(average_luma_buffer, average_luma);
+
+ const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>(
+ scaling_lut, average_luma_buffer, valid_range_chroma_pixels);
+ const int16x8_t blended =
+ BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+ &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling,
+ scaling_shift_vect);
+ // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+ // clipping with vqmovun_s16, but it's not likely to be worth copying the
+ // function for just that case.
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+ }
+
+ in_y_row += source_stride_y << subsampling_y;
+ in_chroma_row += source_stride_chroma;
+ out_chroma_row += dest_stride;
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_NEON(
+ Plane plane, const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
+ const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+
+ const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+ source_stride_uv /= sizeof(Pixel);
+ auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+ dest_stride_uv /= sizeof(Pixel);
+ // Looping over one plane at a time is faster in higher resolutions, despite
+ // re-computing luma.
+ BlendChromaPlaneWithCfl_NEON<bitdepth, GrainType, Pixel>(
+ noise_image[plane], min_value, max_chroma, width, height, start_height,
+ subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
+ source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
+}
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+inline int16x8_t BlendChromaValsNoCfl(
+ const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
+ const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
+ const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
+ const int16x8_t& offset, int luma_multiplier, int chroma_multiplier,
+ bool restrict_scaling_lookup, int valid_range_pixels = 0) {
+ uint8_t merged_buffer[8];
+ const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier);
+ const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier);
+ // Maximum value of |combined| is 127*255 = 0x7E81.
+ const int16x8_t combined = vhaddq_s16(weighted_luma, weighted_chroma);
+ // Maximum value of |offset| is (255 << 5) = 0x1FE0.
+ // 0x7E81 + 0x1FE0 = 0x9E61, therefore another halving add is required.
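+ // The two halving adds plus the shift by 4 make a total shift of 6 (up to
+ // truncation), so |merged| approximates the saturating uint8_t value of
+ // (weighted_luma + weighted_chroma + (|offset| << 1)) >> 6.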
+ const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4);
+ vst1_u8(merged_buffer, merged);
+
+ const int16x8_t scaling =
+ restrict_scaling_lookup
+ ? GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer,
+ valid_range_pixels)
+ : GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
+ int16x8_t noise = GetSignedSource8(noise_image_cursor);
+ noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift_vect);
+ return vaddq_s16(orig, noise);
+}
+
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
+ const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, int scaling_shift, int chroma_offset,
+ int chroma_multiplier, int luma_multiplier,
+ const int16_t* LIBGAV1_RESTRICT scaling_lut,
+ const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+ const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma,
+ uint8_t* out_chroma_row, ptrdiff_t dest_stride) {
+ const int16x8_t floor = vdupq_n_s16(min_value);
+ const int16x8_t ceiling = vdupq_n_s16(max_chroma);
+ // The maximum upscaled noise is 127*255 = 0x7E81, which is safe for 16 bit
+ // signed integers; this function only handles 8bpp, so the shift can be
+ // applied directly as a negated, rounding right shift.
+ const int16x8_t scaling_shift_vect = vdupq_n_s16(-scaling_shift);
+
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ const int safe_chroma_width = chroma_width & ~7;
+ uint8_t luma_buffer[16];
+ const int16x8_t offset = vdupq_n_s16(chroma_offset << 5);
+
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ for (; x + 8 <= safe_chroma_width; x += 8) {
+ const int luma_x = x << subsampling_x;
+ const int valid_range_chroma_pixels = chroma_width - x;
+
+ const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]);
+ const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
+ &in_y_row[luma_x], subsampling_x, valid_range_chroma_pixels << 1));
+ const int16x8_t blended = BlendChromaValsNoCfl(
+ scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+ average_luma, scaling_shift_vect, offset, luma_multiplier,
+ chroma_multiplier, /*restrict_scaling_lookup=*/false);
+ // In 8bpp, when params_.clip_to_restricted_range == false, we can
+ // replace clipping with vqmovun_s16, but the gain would be small.
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+ }
+
+ if (x < chroma_width) {
+ // Begin right edge iteration. Same as the normal iterations, but the
+ // |average_luma| computation requires a duplicated luma value at the
+ // end.
+ const int luma_x = x << subsampling_x;
+ const int valid_range_pixels = width - luma_x;
+ const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+ assert(valid_range_pixels < 16);
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
+ luma_buffer[valid_range_pixels] = in_y_row[width - 1];
+ const int valid_range_chroma_pixels = chroma_width - x;
+
+ const int16x8_t orig_chroma =
+ GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_pixels);
+ const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
+ luma_buffer, subsampling_x, valid_range_chroma_pixels << 1));
+ const int16x8_t blended = BlendChromaValsNoCfl(
+ scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+ average_luma, scaling_shift_vect, offset, luma_multiplier,
+ chroma_multiplier, /*restrict_scaling_lookup=*/true,
+ valid_range_chroma_pixels);
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+ // End of right edge iteration.
+ }
+
+ in_y_row += source_stride_y << subsampling_y;
+ in_chroma_row += source_stride_chroma;
+ out_chroma_row += dest_stride;
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+void BlendNoiseWithImageChroma8bpp_NEON(
+ Plane plane, const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
+ const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ assert(plane == kPlaneU || plane == kPlaneV);
+ const auto* noise_image =
+ static_cast<const Array2D<int8_t>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
+ const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
+ auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
+
+ const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+ const int luma_multiplier =
+ (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+ const int multiplier =
+ (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+ BlendChromaPlane8bpp_NEON(noise_image[plane], min_value, max_chroma, width,
+ height, start_height, subsampling_x, subsampling_y,
+ params.chroma_scaling, offset, multiplier,
+ luma_multiplier, scaling_lut, in_y, source_stride_y,
+ in_uv, source_stride_uv, out_uv, dest_stride_uv);
+}
+
+inline void WriteOverlapLine8bpp_NEON(
+ const int8_t* LIBGAV1_RESTRICT noise_stripe_row,
+ const int8_t* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
+ const int8x8_t grain_coeff, const int8x8_t old_coeff,
+ int8_t* LIBGAV1_RESTRICT noise_image_row) {
+ int x = 0;
+ do {
+ // Note that these reads may exceed noise_stripe_row's width by up to 7
+ // bytes.
+ const int8x8_t source_grain = vld1_s8(noise_stripe_row + x);
+ const int8x8_t source_old = vld1_s8(noise_stripe_row_prev + x);
+ const int16x8_t weighted_grain = vmull_s8(grain_coeff, source_grain);
+ const int16x8_t grain = vmlal_s8(weighted_grain, old_coeff, source_old);
+ // Note that this write may exceed noise_image_row's width by up to 7 bytes.
+ vst1_s8(noise_image_row + x, vqrshrn_n_s16(grain, 5));
+ x += 8;
+ } while (x < plane_width);
+}
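+// A scalar sketch (illustrative only, not part of the library) of
+// WriteOverlapLine8bpp_NEON above: for each x,
+//   noise_image_row[x] = Clip3(
+//       RightShiftWithRounding(grain_coeff * noise_stripe_row[x] +
+//                                  old_coeff * noise_stripe_row_prev[x], 5),
+//       INT8_MIN, INT8_MAX);
+// where the clip comes for free from the saturating narrow vqrshrn_n_s16.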
+
+void ConstructNoiseImageOverlap8bpp_NEON(
+ const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
+ int subsampling_x, int subsampling_y,
+ void* LIBGAV1_RESTRICT noise_image_buffer) {
+ const auto* noise_stripes =
+ static_cast<const Array2DView<int8_t>*>(noise_stripes_buffer);
+ auto* noise_image = static_cast<Array2D<int8_t>*>(noise_image_buffer);
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ const int plane_height = (height + subsampling_y) >> subsampling_y;
+ const int stripe_height = 32 >> subsampling_y;
+ const int stripe_mask = stripe_height - 1;
+ int y = stripe_height;
+ int luma_num = 1;
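+ // The first row (or two rows, at full vertical resolution) of each stripe
+ // is blended with the rows that extend past the previous stripe's nominal
+ // height: rows 32 and 33 when |subsampling_y| == 0, row 16 otherwise.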
+ if (subsampling_y == 0) {
+ const int8x8_t first_row_grain_coeff = vdup_n_s8(17);
+ const int8x8_t first_row_old_coeff = vdup_n_s8(27);
+ const int8x8_t second_row_grain_coeff = first_row_old_coeff;
+ const int8x8_t second_row_old_coeff = first_row_grain_coeff;
+ for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+ const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine8bpp_NEON(
+ noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+ WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, second_row_grain_coeff,
+ second_row_old_coeff, (*noise_image)[y + 1]);
+ }
+ // Either one partial stripe remains (remaining_height > 0),
+ // OR the image is less than one stripe high (remaining_height < 0),
+ // OR all stripes are complete (remaining_height == 0).
+ const int remaining_height = plane_height - y;
+ if (remaining_height <= 0) {
+ return;
+ }
+ const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine8bpp_NEON(
+ noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+ if (remaining_height > 1) {
+ WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, second_row_grain_coeff,
+ second_row_old_coeff, (*noise_image)[y + 1]);
+ }
+ } else { // subsampling_y == 1
+ const int8x8_t first_row_grain_coeff = vdup_n_s8(22);
+ const int8x8_t first_row_old_coeff = vdup_n_s8(23);
+ for (; y < plane_height; ++luma_num, y += stripe_height) {
+ const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine8bpp_NEON(
+ noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+
+ // LumaAutoRegressionFunc
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 1>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 2>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 3>;
+
+ // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag]
+ // Chroma autoregression should never be called when lag is 0 and use_luma
+ // is false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 1,
+ false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 2,
+ false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 3,
+ false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 3, true>;
+
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap8bpp_NEON;
+
+ dsp->film_grain.initialize_scaling_lut =
+ InitializeScalingLookupTable_NEON<kBitdepth8>;
+
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_NEON<kBitdepth8, int8_t, uint8_t>;
+ dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_NEON;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth8, int8_t, uint8_t>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline void WriteOverlapLine10bpp_NEON(
+ const int16_t* LIBGAV1_RESTRICT noise_stripe_row,
+ const int16_t* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
+ const int16x8_t grain_coeff, const int16x8_t old_coeff,
+ int16_t* LIBGAV1_RESTRICT noise_image_row) {
+ int x = 0;
+ do {
+ // Note that these reads may exceed noise_stripe_row's width by up to 7
+ // values.
+ const int16x8_t source_grain = vld1q_s16(noise_stripe_row + x);
+ const int16x8_t source_old = vld1q_s16(noise_stripe_row_prev + x);
+ // Maximum product is 511 * 27 = 0x35E5.
+ const int16x8_t weighted_grain = vmulq_s16(grain_coeff, source_grain);
+ // Maximum sum is 511 * (22 + 23) = 0x59D3.
+ const int16x8_t grain_sum =
+ vmlaq_s16(weighted_grain, old_coeff, source_old);
+ // Note that this write may exceed noise_image_row's width by up to 7
+ // values.
+ const int16x8_t grain = Clip3S16(vrshrq_n_s16(grain_sum, 5),
+ vdupq_n_s16(GetGrainMin<kBitdepth10>()),
+ vdupq_n_s16(GetGrainMax<kBitdepth10>()));
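+ // Unlike the 8bpp path, which clips implicitly through the saturating
+ // narrow vqrshrn_n_s16, 10bpp requires an explicit clamp to the grain
+ // range.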
+ vst1q_s16(noise_image_row + x, grain);
+ x += 8;
+ } while (x < plane_width);
+}
+
+void ConstructNoiseImageOverlap10bpp_NEON(
+ const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
+ int subsampling_x, int subsampling_y,
+ void* LIBGAV1_RESTRICT noise_image_buffer) {
+ const auto* noise_stripes =
+ static_cast<const Array2DView<int16_t>*>(noise_stripes_buffer);
+ auto* noise_image = static_cast<Array2D<int16_t>*>(noise_image_buffer);
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ const int plane_height = (height + subsampling_y) >> subsampling_y;
+ const int stripe_height = 32 >> subsampling_y;
+ const int stripe_mask = stripe_height - 1;
+ int y = stripe_height;
+ int luma_num = 1;
+ if (subsampling_y == 0) {
+ const int16x8_t first_row_grain_coeff = vdupq_n_s16(17);
+ const int16x8_t first_row_old_coeff = vdupq_n_s16(27);
+ const int16x8_t second_row_grain_coeff = first_row_old_coeff;
+ const int16x8_t second_row_old_coeff = first_row_grain_coeff;
+ for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+ const int16_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine10bpp_NEON(
+ noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+ WriteOverlapLine10bpp_NEON(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, second_row_grain_coeff,
+ second_row_old_coeff, (*noise_image)[y + 1]);
+ }
+ // Either one partial stripe remains (remaining_height > 0),
+ // OR the image is less than one stripe high (remaining_height < 0),
+ // OR all stripes are complete (remaining_height == 0).
+ const int remaining_height = plane_height - y;
+ if (remaining_height <= 0) {
+ return;
+ }
+ const int16_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine10bpp_NEON(
+ noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+ if (remaining_height > 1) {
+ WriteOverlapLine10bpp_NEON(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, second_row_grain_coeff,
+ second_row_old_coeff, (*noise_image)[y + 1]);
+ }
+ } else { // subsampling_y == 1
+ const int16x8_t first_row_grain_coeff = vdupq_n_s16(22);
+ const int16x8_t first_row_old_coeff = vdupq_n_s16(23);
+ for (; y < plane_height; ++luma_num, y += stripe_height) {
+ const int16_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine10bpp_NEON(
+ noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+ }
+ }
+}
+
+inline int16x8_t BlendChromaValsNoCfl(
+ const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
+ const int16_t* LIBGAV1_RESTRICT noise_image_cursor,
+ const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
+ const int32x4_t& offset, int luma_multiplier, int chroma_multiplier,
+ bool restrict_scaling_lookup, int valid_range_pixels = 0) {
+ uint16_t merged_buffer[8];
+ const int32x4_t weighted_luma_low =
+ vmull_n_s16(vget_low_s16(average_luma), luma_multiplier);
+ const int32x4_t weighted_luma_high =
+ vmull_n_s16(vget_high_s16(average_luma), luma_multiplier);
+ // Maximum value of each product is 127 * 1023 = 0x1FB81.
+ const int32x4_t combined_low =
+ vmlal_n_s16(weighted_luma_low, vget_low_s16(orig), chroma_multiplier);
+ const int32x4_t combined_high =
+ vmlal_n_s16(weighted_luma_high, vget_high_s16(orig), chroma_multiplier);
+ // Maximum value of offset is (255 << 8) = 0xFF00. Offset may be negative.
+ const uint16x4_t merged_low =
+ vqshrun_n_s32(vaddq_s32(offset, combined_low), 6);
+ const uint16x4_t merged_high =
+ vqshrun_n_s32(vaddq_s32(offset, combined_high), 6);
+ const uint16x8_t max_pixel = vdupq_n_u16((1 << kBitdepth10) - 1);
+ vst1q_u16(merged_buffer,
+ vminq_u16(vcombine_u16(merged_low, merged_high), max_pixel));
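+ // |offset| was pre-scaled by (6 + 2) bits in the caller, so the shift by 6
+ // leaves the chroma offset scaled by 2 bits, matching the 10bpp pixel
+ // range.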
+ const int16x8_t scaling =
+ restrict_scaling_lookup
+ ? GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer,
+ valid_range_pixels)
+ : GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut,
+ merged_buffer);
+ const int16x8_t noise = GetSignedSource8(noise_image_cursor);
+ const int16x8_t scaled_noise =
+ ScaleNoise<kBitdepth10>(noise, scaling, scaling_shift_vect);
+ return vaddq_s16(orig, scaled_noise);
+}
+
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
+ const Array2D<int16_t>& noise_image, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, int scaling_shift, int chroma_offset,
+ int chroma_multiplier, int luma_multiplier,
+ const int16_t* LIBGAV1_RESTRICT scaling_lut,
+ const uint16_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+ const uint16_t* in_chroma_row, ptrdiff_t source_stride_chroma,
+ uint16_t* out_chroma_row, ptrdiff_t dest_stride) {
+ const int16x8_t floor = vdupq_n_s16(min_value);
+ const int16x8_t ceiling = vdupq_n_s16(max_chroma);
+ const int16x8_t scaling_shift_vect = vdupq_n_s16(15 - scaling_shift);
+
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ const int safe_chroma_width = chroma_width & ~7;
+ uint16_t luma_buffer[16];
+ // Offset is added before downshifting in order to take advantage of
+ // saturation, so it has to be upscaled by 6 bits, plus 2 bits for 10bpp.
+ const int32x4_t offset = vdupq_n_s32(chroma_offset << (6 + 2));
+
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ for (; x + 8 <= safe_chroma_width; x += 8) {
+ const int luma_x = x << subsampling_x;
+ const int16x8_t average_luma = vreinterpretq_s16_u16(
+ GetAverageLuma(&in_y_row[luma_x], subsampling_x));
+ const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]);
+ const int16x8_t blended = BlendChromaValsNoCfl(
+ scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+ average_luma, scaling_shift_vect, offset, luma_multiplier,
+ chroma_multiplier, /*restrict_scaling_lookup=*/false);
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+ }
+
+ if (x < chroma_width) {
+ // Begin right edge iteration. Same as the normal iterations, but the
+ // |average_luma| computation requires a duplicated luma value at the
+ // end.
+ const int luma_x = x << subsampling_x;
+ const int valid_range_pixels = width - luma_x;
+ const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+ assert(valid_range_pixels < 16);
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
+ luma_buffer[valid_range_pixels] = in_y_row[width - 1];
+ const int valid_range_chroma_pixels = chroma_width - x;
+ const int valid_range_chroma_bytes =
+ (chroma_width - x) * sizeof(in_chroma_row[0]);
+ const int16x8_t orig_chroma =
+ GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes);
+
+ const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
+ luma_buffer, subsampling_x, valid_range_chroma_pixels << 1));
+ const int16x8_t blended = BlendChromaValsNoCfl(
+ scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+ average_luma, scaling_shift_vect, offset, luma_multiplier,
+ chroma_multiplier, /*restrict_scaling_lookup=*/true,
+ valid_range_chroma_pixels);
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+ // End of right edge iteration.
+ }
+
+ in_y_row = AddByteStride(in_y_row, source_stride_y << subsampling_y);
+ in_chroma_row = AddByteStride(in_chroma_row, source_stride_chroma);
+ out_chroma_row = AddByteStride(out_chroma_row, dest_stride);
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+void BlendNoiseWithImageChroma10bpp_NEON(
+ Plane plane, const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
+ const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ assert(plane == kPlaneU || plane == kPlaneV);
+ const auto* noise_image =
+ static_cast<const Array2D<int16_t>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const uint16_t*>(source_plane_y);
+ const auto* in_uv = static_cast<const uint16_t*>(source_plane_uv);
+ auto* out_uv = static_cast<uint16_t*>(dest_plane_uv);
+
+ const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+ const int luma_multiplier =
+ (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+ const int multiplier =
+ (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+ BlendChromaPlane10bpp_NEON(
+ noise_image[plane], min_value, max_chroma, width, height, start_height,
+ subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
+ luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
+ source_stride_uv, out_uv, dest_stride_uv);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+ // LumaAutoRegressionFunc
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 1>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 2>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 3>;
+
+ // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag][subsampling]
+ // Chroma autoregression should never be called when lag is 0 and use_luma
+ // is false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 1,
+ false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 2,
+ false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 3,
+ false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 0,
+ true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 1,
+ true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 2,
+ true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 3,
+ true>;
+
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap10bpp_NEON;
+
+ dsp->film_grain.initialize_scaling_lut =
+ InitializeScalingLookupTable_NEON<kBitdepth10>;
+
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>;
+ dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma10bpp_NEON;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth10, int16_t, uint16_t>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace film_grain
+
+void FilmGrainInit_NEON() {
+ film_grain::low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ film_grain::high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void FilmGrainInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initialize members of Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainAutoregressionLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainAutoregressionLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainAutoregressionChroma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainAutoregressionChroma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseImageOverlap LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseImageOverlap LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Simplified version of intra_edge.cc:kKernels[][]. Only |strength| 1 and 2 are
+// required.
+constexpr int kKernelsNEON[2][2] = {{4, 8}, {5, 6}};
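+// For |strength| 1 and 2, each output element is
+//   out[i] = RightShiftWithRounding(
+//       k[0] * in[i - 1] + k[1] * in[i] + k[0] * in[i + 1], 4)
+// with k = kKernelsNEON[strength - 1].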
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
+ assert(strength == 1 || strength == 2 || strength == 3);
+ const int kernel_index = strength - 1;
+ auto* const dst_buffer = static_cast<uint8_t*>(buffer);
+
+ // The first element is not written out (but it is input), so the number of
+ // elements written is |size| - 1.
+ if (size == 1) return;
+
+ const uint8x16_t v_index = vcombine_u8(vcreate_u8(0x0706050403020100),
+ vcreate_u8(0x0f0e0d0c0b0a0908));
+ // |strength| 1 and 2 use a 3 tap filter.
+ if (strength < 3) {
+ // The last value requires extending the buffer (duplicating
+ // |dst_buffer[size - 1]|). Calculate it here to avoid extra processing in
+ // NEON.
+ const uint8_t last_val = RightShiftWithRounding(
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] +
+ kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] +
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 1],
+ 4);
+
+ const uint8x8_t krn1 = vdup_n_u8(kKernelsNEON[kernel_index][1]);
+
+ // The first value we need gets overwritten by the output from the
+ // previous iteration.
+ uint8x16_t src_0 = vld1q_u8(dst_buffer);
+ int i = 1;
+
+ // Process blocks until there are fewer than 16 values remaining.
+ for (; i < size - 15; i += 16) {
+ // Loading these at the end of the block with |src_0| will read past the
+ // end of |top_row_data[160]|, the source of |buffer|.
+ const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
+ const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
+ uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2));
+ sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]);
+ sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1);
+ uint16x8_t sum_hi = vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_2));
+ sum_hi = vmulq_n_u16(sum_hi, kKernelsNEON[kernel_index][0]);
+ sum_hi = vmlal_u8(sum_hi, vget_high_u8(src_1), krn1);
+
+ const uint8x16_t result =
+ vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+ // Load the next row before overwriting. This loads an extra 15 values
+ // past |size| on the trailing iteration.
+ src_0 = vld1q_u8(dst_buffer + i + 15);
+
+ vst1q_u8(dst_buffer + i, result);
+ }
+
+ // The last output value |last_val| was already calculated, so if
+ // |remainder| == 1 there is nothing left to do.
+ const int remainder = (size - 1) & 0xf;
+ if (remainder > 1) {
+ const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
+ const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
+
+ uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2));
+ sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]);
+ sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1);
+ uint16x8_t sum_hi = vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_2));
+ sum_hi = vmulq_n_u16(sum_hi, kKernelsNEON[kernel_index][0]);
+ sum_hi = vmlal_u8(sum_hi, vget_high_u8(src_1), krn1);
+
+ const uint8x16_t result =
+ vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+ const uint8x16_t v_remainder = vdupq_n_u8(remainder);
+ // Create the overwrite mask.
+ const uint8x16_t mask = vcleq_u8(v_remainder, v_index);
+ const uint8x16_t dst_remainder = vbslq_u8(mask, src_1, result);
+ vst1q_u8(dst_buffer + i, dst_remainder);
+ }
+
+ dst_buffer[size - 1] = last_val;
+ return;
+ }
+
+ assert(strength == 3);
+ // 5 tap filter. The first element requires duplicating |buffer[0]| and the
+ // last two elements require duplicating |buffer[size - 1]|.
+ uint8_t special_vals[3];
+ special_vals[0] = RightShiftWithRounding(
+ (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) +
+ (dst_buffer[2] << 2) + (dst_buffer[3] << 1),
+ 4);
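+ // The 5 tap kernel is {2, 4, 4, 4, 2}: |special_vals[0]| applies it to
+ // (d[0], d[0], d[1], d[2], d[3]), d = dst_buffer, with the first element
+ // duplicated at the left edge.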
+ // Clamp index for very small |size| values.
+ const int first_index_min = std::max(size - 4, 0);
+ const int second_index_min = std::max(size - 3, 0);
+ const int third_index_min = std::max(size - 2, 0);
+ special_vals[1] = RightShiftWithRounding(
+ (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) +
+ (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) +
+ (dst_buffer[size - 1] << 1),
+ 4);
+ special_vals[2] = RightShiftWithRounding(
+ (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) +
+ // (x << 2) + (x << 2) == x << 3
+ (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1),
+ 4);
+
+ // The first two values we need get overwritten by the output from the
+ // previous iteration.
+ uint8x16_t src_0 = vld1q_u8(dst_buffer - 1);
+ uint8x16_t src_1 = vld1q_u8(dst_buffer);
+ int i = 1;
+
+ for (; i < size - 15; i += 16) {
+ // Loading these at the end of the block with |src_[01]| will read past
+ // the end of |top_row_data[160]|, the source of |buffer|.
+ const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
+ const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
+ const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
+
+ uint16x8_t sum_lo =
+ vshlq_n_u16(vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_4)), 1);
+ const uint16x8_t sum_123_lo = vaddw_u8(
+ vaddl_u8(vget_low_u8(src_1), vget_low_u8(src_2)), vget_low_u8(src_3));
+ sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2));
+
+ uint16x8_t sum_hi =
+ vshlq_n_u16(vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_4)), 1);
+ const uint16x8_t sum_123_hi =
+ vaddw_u8(vaddl_u8(vget_high_u8(src_1), vget_high_u8(src_2)),
+ vget_high_u8(src_3));
+ sum_hi = vaddq_u16(sum_hi, vshlq_n_u16(sum_123_hi, 2));
+
+ const uint8x16_t result =
+ vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+ src_0 = vld1q_u8(dst_buffer + i + 14);
+ src_1 = vld1q_u8(dst_buffer + i + 15);
+
+ vst1q_u8(dst_buffer + i, result);
+ }
+
+ const int remainder = (size - 1) & 0xf;
+ // Like the 3 tap case, except that if two values remain they have already
+ // been calculated.
+ if (remainder > 2) {
+ const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
+ const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
+ const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
+
+ uint16x8_t sum_lo =
+ vshlq_n_u16(vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_4)), 1);
+ const uint16x8_t sum_123_lo = vaddw_u8(
+ vaddl_u8(vget_low_u8(src_1), vget_low_u8(src_2)), vget_low_u8(src_3));
+ sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2));
+
+ uint16x8_t sum_hi =
+ vshlq_n_u16(vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_4)), 1);
+ const uint16x8_t sum_123_hi =
+ vaddw_u8(vaddl_u8(vget_high_u8(src_1), vget_high_u8(src_2)),
+ vget_high_u8(src_3));
+ sum_hi = vaddq_u16(sum_hi, vshlq_n_u16(sum_123_hi, 2));
+
+ const uint8x16_t result =
+ vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+ const uint8x16_t v_remainder = vdupq_n_u8(remainder);
+ // Create the overwrite mask.
+ const uint8x16_t mask = vcleq_u8(v_remainder, v_index);
+ const uint8x16_t dst_remainder = vbslq_u8(mask, src_2, result);
+ vst1q_u8(dst_buffer + i, dst_remainder);
+ }
+
+ dst_buffer[1] = special_vals[0];
+ // Avoid overwriting |dst_buffer[0]|.
+ if (size > 2) dst_buffer[size - 2] = special_vals[1];
+ dst_buffer[size - 1] = special_vals[2];
+}
+
+// Computes RightShiftWithRounding(-|src0| + |src1| * 9 + |src2| * 9 - |src3|,
+// 4), saturated to the uint8_t range.
+uint8x8_t Upsample(const uint8x8_t src0, const uint8x8_t src1,
+ const uint8x8_t src2, const uint8x8_t src3) {
+ const uint16x8_t middle = vmulq_n_u16(vaddl_u8(src1, src2), 9);
+ const uint16x8_t ends = vaddl_u8(src0, src3);
+ const int16x8_t sum =
+ vsubq_s16(vreinterpretq_s16_u16(middle), vreinterpretq_s16_u16(ends));
+ return vqrshrun_n_s16(sum, 4);
+}
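+// Upsample() above produces the new half-sample values. The upsampled edge
+// interleaves one filtered value before each original sample, which is why
+// every store below pairs the filter result with |src2| (or |src21|).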
+
+void IntraEdgeUpsampler_NEON(void* buffer, const int size) {
+ assert(size % 4 == 0 && size <= 16);
+ auto* const pixel_buffer = static_cast<uint8_t*>(buffer);
+ // Writing |pixel_buffer[-2]| here is OK: it is not read for |size| 4 or 8.
+ // The other border (|pixel_buffer[size]|) is only extended for the larger
+ // sizes below, because writing it and then vld()ing it seems to introduce
+ // some latency.
+ pixel_buffer[-2] = pixel_buffer[-1];
+ if (size == 4) {
+ // This uses one load and two vtbl()s, which is better than 4x
+ // Load{Lo,Hi}4().
+ const uint8x8_t src = vld1_u8(pixel_buffer - 1);
+ // The outside values are negated so put those in the same vector.
+ const uint8x8_t src03 = vtbl1_u8(src, vcreate_u8(0x0404030202010000));
+ // Reverse |src1| and |src2| so we can use |src2| for the interleave at the
+ // end.
+ const uint8x8_t src21 = vtbl1_u8(src, vcreate_u8(0x0302010004030201));
+
+ const uint16x8_t middle = vmull_u8(src21, vdup_n_u8(9));
+ const int16x8_t half_sum = vsubq_s16(
+ vreinterpretq_s16_u16(middle), vreinterpretq_s16_u16(vmovl_u8(src03)));
+ const int16x4_t sum =
+ vadd_s16(vget_low_s16(half_sum), vget_high_s16(half_sum));
+ const uint8x8_t result = vqrshrun_n_s16(vcombine_s16(sum, sum), 4);
+
+ vst1_u8(pixel_buffer - 1, InterleaveLow8(result, src21));
+ return;
+ }
+ if (size == 8) {
+ // Likewise, one load + multiple vtbls seems preferred to multiple loads.
+ const uint8x16_t src = vld1q_u8(pixel_buffer - 1);
+ const uint8x8_t src0 = VQTbl1U8(src, vcreate_u8(0x0605040302010000));
+ const uint8x8_t src1 = vget_low_u8(src);
+ const uint8x8_t src2 = VQTbl1U8(src, vcreate_u8(0x0807060504030201));
+ const uint8x8_t src3 = VQTbl1U8(src, vcreate_u8(0x0808070605040302));
+
+ const uint8x8x2_t output = {Upsample(src0, src1, src2, src3), src2};
+ vst2_u8(pixel_buffer - 1, output);
+ return;
+ }
+ assert(size == 12 || size == 16);
+ // Extend the input borders to avoid branching later.
+ pixel_buffer[size] = pixel_buffer[size - 1];
+ const uint8x16_t src0 = vld1q_u8(pixel_buffer - 2);
+ const uint8x16_t src1 = vld1q_u8(pixel_buffer - 1);
+ const uint8x16_t src2 = vld1q_u8(pixel_buffer);
+ const uint8x16_t src3 = vld1q_u8(pixel_buffer + 1);
+
+ const uint8x8_t result_lo = Upsample(vget_low_u8(src0), vget_low_u8(src1),
+ vget_low_u8(src2), vget_low_u8(src3));
+
+ const uint8x8x2_t output_lo = {result_lo, vget_low_u8(src2)};
+ vst2_u8(pixel_buffer - 1, output_lo);
+
+ const uint8x8_t result_hi = Upsample(vget_high_u8(src0), vget_high_u8(src1),
+ vget_high_u8(src2), vget_high_u8(src3));
+
+ if (size == 12) {
+ vst1_u8(pixel_buffer + 15, InterleaveLow8(result_hi, vget_high_u8(src2)));
+ } else /* size == 16 */ {
+ const uint8x8x2_t output_hi = {result_hi, vget_high_u8(src2)};
+ vst2_u8(pixel_buffer + 15, output_hi);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->intra_edge_filter = IntraEdgeFilter_NEON;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
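+// kRemainderMask[n] has 0xffff in its first n lanes; the tail stores below
+// vbslq with it to keep the filtered result in those lanes and the original
+// values elsewhere.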
+const uint16_t kRemainderMask[8][8] = {
+ {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000},
+};
+
+void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
+ assert(strength == 1 || strength == 2 || strength == 3);
+ const int kernel_index = strength - 1;
+ auto* const dst_buffer = static_cast<uint16_t*>(buffer);
+
+ // The first element is not written out (but it is input), so the number of
+ // elements written is |size| - 1.
+ if (size == 1) return;
+
+ // |strength| 1 and 2 use a 3 tap filter.
+ if (strength < 3) {
+ // The last value requires extending the buffer (duplicating
+ // |dst_buffer[size - 1]|). Calculate it here to avoid extra processing in
+ // NEON.
+ const uint16_t last_val = RightShiftWithRounding(
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] +
+ kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] +
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 1],
+ 4);
+
+ const uint16_t krn0 = kKernelsNEON[kernel_index][0];
+ const uint16_t krn1 = kKernelsNEON[kernel_index][1];
+
+ // The first value we need gets overwritten by the output from the
+ // previous iteration.
+ uint16x8_t src_0 = vld1q_u16(dst_buffer);
+ int i = 1;
+
+ // Process blocks until there are fewer than 8 values remaining.
+ for (; i < size - 7; i += 8) {
+ // Loading these at the end of the block with |src_0| will read past the
+ // end of |top_row_data[160]|, the source of |buffer|.
+ const uint16x8_t src_1 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0);
+ const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1);
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ // Load the next row before overwriting. This loads an extra 7 values
+ // past |size| on the trailing iteration.
+ src_0 = vld1q_u16(dst_buffer + i + 7);
+ vst1q_u16(dst_buffer + i, result);
+ }
+
+ // The last output value |last_val| was already calculated, so if
+ // |remainder| == 1 there is nothing left to do.
+ const int remainder = (size - 1) & 0x7;
+ if (remainder > 1) {
+ const uint16x8_t src_1 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0);
+ const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1);
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]);
+ const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_1);
+ vst1q_u16(dst_buffer + i, dst_remainder);
+ }
+
+ dst_buffer[size - 1] = last_val;
+ return;
+ }
+
+ assert(strength == 3);
+ // 5 tap filter. The first element requires duplicating |buffer[0]| and the
+ // last two elements require duplicating |buffer[size - 1]|.
+ uint16_t special_vals[3];
+ special_vals[0] = RightShiftWithRounding(
+ (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) +
+ (dst_buffer[2] << 2) + (dst_buffer[3] << 1),
+ 4);
+ // Clamp index for very small |size| values.
+ const int first_index_min = std::max(size - 4, 0);
+ const int second_index_min = std::max(size - 3, 0);
+ const int third_index_min = std::max(size - 2, 0);
+ special_vals[1] = RightShiftWithRounding(
+ (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) +
+ (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) +
+ (dst_buffer[size - 1] << 1),
+ 4);
+ special_vals[2] = RightShiftWithRounding(
+ (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) +
+ // (x << 2) + (x << 2) == x << 3
+ (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1),
+ 4);
+
+ // The first two values we need get overwritten by the output from the
+ // previous iteration.
+ uint16x8_t src_0 = vld1q_u16(dst_buffer - 1);
+ uint16x8_t src_1 = vld1q_u16(dst_buffer);
+ int i = 1;
+
+ for (; i < size - 7; i += 8) {
+ // Loading these at the end of the block with |src_[01]| will read past
+ // the end of |top_row_data[160]|, the source of |buffer|.
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2);
+ const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1);
+ const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3);
+ const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2));
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+
+ // Load the next inputs before overwriting.
+ src_0 = vld1q_u16(dst_buffer + i + 6);
+ src_1 = vld1q_u16(dst_buffer + i + 7);
+
+ vst1q_u16(dst_buffer + i, result);
+ }
+
+ const int remainder = (size - 1) & 0x7;
+ // Like the 3 tap case, except that if two values remain they have already
+ // been calculated.
+ if (remainder > 2) {
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2);
+ const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1);
+ const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3);
+ const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2));
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]);
+ const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_2);
+ vst1q_u16(dst_buffer + i, dst_remainder);
+ }
+
+ dst_buffer[1] = special_vals[0];
+ // Avoid overwriting |dst_buffer[0]|.
+ if (size > 2) dst_buffer[size - 2] = special_vals[1];
+ dst_buffer[size - 1] = special_vals[2];
+}
+
+void IntraEdgeUpsampler_NEON(void* buffer, const int size) {
+ assert(size % 4 == 0 && size <= 16);
+ auto* const pixel_buffer = static_cast<uint16_t*>(buffer);
+
+ // Extend the first and last samples.
+ pixel_buffer[-2] = pixel_buffer[-1];
+ pixel_buffer[size] = pixel_buffer[size - 1];
+
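+ // Each sample inserted between s[j] and s[j + 1] (s = |pixel_buffer|) uses
+ // the kernel (-1, 9, 9, -1) / 16:
+ //   (9 * (s[j] + s[j + 1]) - s[j - 1] - s[j + 2] + 8) >> 4,
+ // clamped to [0, (1 << kBitdepth10) - 1]; the original samples are
+ // interleaved back in by the vst2q_u16 stores below.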
+ const int16x8_t src_lo = vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2));
+ const int16x8_t src_hi =
+ vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2 + 8));
+ const int16x8_t src9_hi = vaddq_s16(src_hi, vshlq_n_s16(src_hi, 3));
+ const int16x8_t src9_lo = vaddq_s16(src_lo, vshlq_n_s16(src_lo, 3));
+
+ int16x8_t sum_lo = vsubq_s16(vextq_s16(src9_lo, src9_hi, 1), src_lo);
+ sum_lo = vaddq_s16(sum_lo, vextq_s16(src9_lo, src9_hi, 2));
+ sum_lo = vsubq_s16(sum_lo, vextq_s16(src_lo, src_hi, 3));
+ sum_lo = vrshrq_n_s16(sum_lo, 4);
+
+ uint16x8x2_t result_lo;
+ result_lo.val[0] =
+ vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_lo, vdupq_n_s16(0))),
+ vdupq_n_u16((1 << kBitdepth10) - 1));
+ result_lo.val[1] = vreinterpretq_u16_s16(vextq_s16(src_lo, src_hi, 2));
+
+ if (size > 8) {
+ const int16x8_t src_hi_extra =
+ vreinterpretq_s16_u16(vld1q_u16(pixel_buffer + 16 - 2));
+ const int16x8_t src9_hi_extra =
+ vaddq_s16(src_hi_extra, vshlq_n_s16(src_hi_extra, 3));
+
+ int16x8_t sum_hi = vsubq_s16(vextq_s16(src9_hi, src9_hi_extra, 1), src_hi);
+ sum_hi = vaddq_s16(sum_hi, vextq_s16(src9_hi, src9_hi_extra, 2));
+ sum_hi = vsubq_s16(sum_hi, vextq_s16(src_hi, src_hi_extra, 3));
+ sum_hi = vrshrq_n_s16(sum_hi, 4);
+
+ uint16x8x2_t result_hi;
+ result_hi.val[0] =
+ vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_hi, vdupq_n_s16(0))),
+ vdupq_n_u16((1 << kBitdepth10) - 1));
+ result_hi.val[1] =
+ vreinterpretq_u16_s16(vextq_s16(src_hi, src_hi_extra, 2));
+ vst2q_u16(pixel_buffer - 1, result_lo);
+ vst2q_u16(pixel_buffer + 15, result_hi);
+ } else {
+ vst2q_u16(pixel_buffer - 1, result_lo);
+ }
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->intra_edge_filter = IntraEdgeFilter_NEON;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraEdgeInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraEdgeInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_cfl.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Divide by the number of elements.
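+// e.g. for a 16x8 block the shift is FloorLog2(16) + FloorLog2(8) = 7,
+// dividing the sum by its 128 contributing elements.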
+inline uint32_t Average(const uint32_t sum, const int width, const int height) {
+ return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height));
+}
+
+// Subtract |val| from every element in |a|.
+inline void BlockSubtract(const uint32_t val,
+ int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int width, const int height) {
+ assert(val <= INT16_MAX);
+ const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val));
+
+ for (int y = 0; y < height; ++y) {
+ if (width == 4) {
+ const int16x4_t b = vld1_s16(a[y]);
+ vst1_s16(a[y], vsub_s16(b, vget_low_s16(val_v)));
+ } else if (width == 8) {
+ const int16x8_t b = vld1q_s16(a[y]);
+ vst1q_s16(a[y], vsubq_s16(b, val_v));
+ } else if (width == 16) {
+ const int16x8_t b = vld1q_s16(a[y]);
+ const int16x8_t c = vld1q_s16(a[y] + 8);
+ vst1q_s16(a[y], vsubq_s16(b, val_v));
+ vst1q_s16(a[y] + 8, vsubq_s16(c, val_v));
+ } else /* block_width == 32 */ {
+ const int16x8_t b = vld1q_s16(a[y]);
+ const int16x8_t c = vld1q_s16(a[y] + 8);
+ const int16x8_t d = vld1q_s16(a[y] + 16);
+ const int16x8_t e = vld1q_s16(a[y] + 24);
+ vst1q_s16(a[y], vsubq_s16(b, val_v));
+ vst1q_s16(a[y] + 8, vsubq_s16(c, val_v));
+ vst1q_s16(a[y] + 16, vsubq_s16(d, val_v));
+ vst1q_s16(a[y] + 24, vsubq_s16(e, val_v));
+ }
+ }
+}
+
+namespace low_bitdepth {
+namespace {
+
+template <int block_width, int block_height>
+void CflSubsampler420_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride) {
+ const auto* src = static_cast<const uint8_t*>(source);
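+ // Each output is a 2x2 box sum of luma shifted left by 1: the box sum is
+ // already 4x the average and the extra shift scales it to the 8x (3
+ // fractional bits) form used for all stored CfL luma values.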
+ uint32_t sum;
+ if (block_width == 4) {
+ assert(max_luma_width >= 8);
+ uint32x2_t running_sum = vdup_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ const uint8x8_t row0 = vld1_u8(src);
+ const uint8x8_t row1 = vld1_u8(src + stride);
+
+ uint16x4_t sum_row = vpadal_u8(vpaddl_u8(row0), row1);
+ sum_row = vshl_n_u16(sum_row, 1);
+ running_sum = vpadal_u16(running_sum, sum_row);
+ vst1_s16(luma[y], vreinterpret_s16_u16(sum_row));
+
+ if (y << 1 < max_luma_height - 2) {
+ // Once this threshold is reached |src| stops advancing, so the remaining
+ // iterations re-read the same two rows (the loop could be simplified).
+ src += stride << 1;
+ }
+ }
+
+ sum = SumVector(running_sum);
+ } else if (block_width == 8) {
+ const uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14};
+ const uint16x8_t x_max_index =
+ vdupq_n_u16(max_luma_width == 8 ? max_luma_width - 2 : 16);
+ const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index);
+
+ uint32x4_t running_sum = vdupq_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ const uint8x16_t row0 = vld1q_u8(src);
+ const uint8x16_t row1 = vld1q_u8(src + stride);
+ const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+ const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1);
+
+ // Dup the 2x2 sum at the max luma offset.
+ const uint16x8_t max_luma_sum =
+ vdupq_lane_u16(vget_low_u16(sum_row_shifted), 3);
+ const uint16x8_t final_sum_row =
+ vbslq_u16(x_mask, sum_row_shifted, max_luma_sum);
+ vst1q_s16(luma[y], vreinterpretq_s16_u16(final_sum_row));
+
+ running_sum = vpadalq_u16(running_sum, final_sum_row);
+
+ if (y << 1 < max_luma_height - 2) {
+ src += stride << 1;
+ }
+ }
+
+ sum = SumVector(running_sum);
+ } else /* block_width >= 16 */ {
+ const uint16x8_t x_max_index = vdupq_n_u16(max_luma_width - 2);
+ uint32x4_t running_sum = vdupq_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ // Calculate the 2x2 sum at the max_luma offset
+ const uint8_t a00 = src[max_luma_width - 2];
+ const uint8_t a01 = src[max_luma_width - 1];
+ const uint8_t a10 = src[max_luma_width - 2 + stride];
+ const uint8_t a11 = src[max_luma_width - 1 + stride];
+ // Dup the 2x2 sum at the max luma offset.
+ const uint16x8_t max_luma_sum =
+ vdupq_n_u16(static_cast<uint16_t>((a00 + a01 + a10 + a11) << 1));
+ uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14};
+
+ ptrdiff_t src_x_offset = 0;
+ for (int x = 0; x < block_width; x += 8, src_x_offset += 16) {
+ const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index);
+ const uint8x16_t row0 = vld1q_u8(src + src_x_offset);
+ const uint8x16_t row1 = vld1q_u8(src + src_x_offset + stride);
+ const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+ const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1);
+ const uint16x8_t final_sum_row =
+ vbslq_u16(x_mask, sum_row_shifted, max_luma_sum);
+ vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(final_sum_row));
+
+ running_sum = vpadalq_u16(running_sum, final_sum_row);
+ x_index = vaddq_u16(x_index, vdupq_n_u16(16));
+ }
+
+ if (y << 1 < max_luma_height - 2) {
+ src += stride << 1;
+ }
+ }
+ sum = SumVector(running_sum);
+ }
+
+ const uint32_t average = Average(sum, block_width, block_height);
+ BlockSubtract(average, luma, block_width, block_height);
+}
+
+template <int block_width, int block_height>
+void CflSubsampler444_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride) {
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t sum;
+ if (block_width == 4) {
+ assert(max_luma_width >= 4);
+ assert(max_luma_height <= block_height);
+ assert((max_luma_height % 2) == 0);
+ uint32x4_t running_sum = vdupq_n_u32(0);
+ uint8x8_t row = vdup_n_u8(0);
+
+ uint16x8_t row_shifted;
+ int y = 0;
+ do {
+ row = Load4<0>(src, row);
+ row = Load4<1>(src + stride, row);
+ if (y < (max_luma_height - 1)) {
+ src += stride << 1;
+ }
+
+ row_shifted = vshll_n_u8(row, 3);
+ running_sum = vpadalq_u16(running_sum, row_shifted);
+ vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
+ vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
+ y += 2;
+ } while (y < max_luma_height);
+
+ row_shifted =
+ vcombine_u16(vget_high_u16(row_shifted), vget_high_u16(row_shifted));
+ for (; y < block_height; y += 2) {
+ running_sum = vpadalq_u16(running_sum, row_shifted);
+ vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
+ vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
+ }
+
+ sum = SumVector(running_sum);
+ } else if (block_width == 8) {
+ const uint8x8_t x_index = {0, 1, 2, 3, 4, 5, 6, 7};
+ const uint8x8_t x_max_index = vdup_n_u8(max_luma_width - 1);
+ const uint8x8_t x_mask = vclt_u8(x_index, x_max_index);
+
+ uint32x4_t running_sum = vdupq_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ const uint8x8_t x_max = vdup_n_u8(src[max_luma_width - 1]);
+ const uint8x8_t row = vbsl_u8(x_mask, vld1_u8(src), x_max);
+
+ const uint16x8_t row_shifted = vshll_n_u8(row, 3);
+ running_sum = vpadalq_u16(running_sum, row_shifted);
+ vst1q_s16(luma[y], vreinterpretq_s16_u16(row_shifted));
+
+ if (y < max_luma_height - 1) {
+ src += stride;
+ }
+ }
+
+ sum = SumVector(running_sum);
+ } else /* block_width >= 16 */ {
+ const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 1);
+ uint32x4_t running_sum = vdupq_n_u32(0);
+
+ for (int y = 0; y < block_height; ++y) {
+ uint8x16_t x_index = {0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15};
+ const uint8x16_t x_max = vdupq_n_u8(src[max_luma_width - 1]);
+ for (int x = 0; x < block_width; x += 16) {
+ const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+ const uint8x16_t row = vbslq_u8(x_mask, vld1q_u8(src + x), x_max);
+
+ const uint16x8_t row_shifted_low = vshll_n_u8(vget_low_u8(row), 3);
+ const uint16x8_t row_shifted_high = vshll_n_u8(vget_high_u8(row), 3);
+ running_sum = vpadalq_u16(running_sum, row_shifted_low);
+ running_sum = vpadalq_u16(running_sum, row_shifted_high);
+ vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(row_shifted_low));
+ vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(row_shifted_high));
+
+ x_index = vaddq_u8(x_index, vdupq_n_u8(16));
+ }
+ if (y < max_luma_height - 1) {
+ src += stride;
+ }
+ }
+ sum = SumVector(running_sum);
+ }
+
+ const uint32_t average = Average(sum, block_width, block_height);
+ BlockSubtract(average, luma, block_width, block_height);
+}
+
+// Saturate |dc + ((alpha * luma) >> 6)| to uint8_t.
+inline uint8x8_t Combine8(const int16x8_t luma, const int alpha,
+ const int16x8_t dc) {
+ const int16x8_t la = vmulq_n_s16(luma, alpha);
+ // Subtract the sign bit so negative products round symmetrically with
+ // positive ones (ties away from zero) instead of always upward.
+ const int16x8_t sub_sign = vsraq_n_s16(la, la, 15);
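+ // e.g. la = -32 (-0.5 after the shift): sub_sign = -32 + (-32 >> 15) = -33
+ // and the rounded shift gives dc + ((-33 + 32) >> 6) = dc - 1, symmetric
+ // with la = 32 -> dc + 1; without the correction it would give dc + 0.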
+ // Shift and accumulate.
+ const int16x8_t result = vrsraq_n_s16(dc, sub_sign, 6);
+ return vqmovun_s16(result);
+}
+
+// The exact range of luma/alpha is unimportant because the result is
+// saturated to uint8_t: even a saturated int16_t, shifted right by 6, exceeds
+// the uint8_t range.
+template <int block_height>
+inline void CflIntraPredictor4xN_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; y += 2) {
+ const int16x4_t luma_row0 = vld1_s16(luma[y]);
+ const int16x4_t luma_row1 = vld1_s16(luma[y + 1]);
+ const uint8x8_t sum =
+ Combine8(vcombine_s16(luma_row0, luma_row1), alpha, dc);
+ StoreLo4(dst, sum);
+ dst += stride;
+ StoreHi4(dst, sum);
+ dst += stride;
+ }
+}
+
+template <int block_height>
+inline void CflIntraPredictor8xN_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row = vld1q_s16(luma[y]);
+ const uint8x8_t sum = Combine8(luma_row, alpha, dc);
+ vst1_u8(dst, sum);
+ dst += stride;
+ }
+}
+
+template <int block_height>
+inline void CflIntraPredictor16xN_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc);
+ const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc);
+ vst1_u8(dst, sum_0);
+ vst1_u8(dst + 8, sum_1);
+ dst += stride;
+ }
+}
+
+template <int block_height>
+inline void CflIntraPredictor32xN_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16);
+ const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24);
+ const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc);
+ const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc);
+ const uint8x8_t sum_2 = Combine8(luma_row_2, alpha, dc);
+ const uint8x8_t sum_3 = Combine8(luma_row_3, alpha, dc);
+ vst1_u8(dst, sum_0);
+ vst1_u8(dst + 8, sum_1);
+ vst1_u8(dst + 16, sum_2);
+ vst1_u8(dst + 24, sum_3);
+ dst += stride;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_NEON<4, 8>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_NEON<4, 16>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_NEON<8, 4>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_NEON<8, 8>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_NEON<8, 16>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_NEON<8, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_NEON<16, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_NEON<16, 8>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_NEON<16, 16>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_NEON<16, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_NEON<32, 8>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_NEON<32, 16>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_NEON<32, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_NEON<4, 8>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_NEON<4, 16>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_NEON<8, 4>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_NEON<8, 8>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_NEON<8, 16>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_NEON<8, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_NEON<16, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_NEON<16, 8>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_NEON<16, 16>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_NEON<16, 32>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_NEON<32, 8>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_NEON<32, 16>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_NEON<32, 32>;
+
+ dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>;
+
+ dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>;
+
+ dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor16xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor16xN_NEON<32>;
+
+ dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor32xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor32xN_NEON<32>;
+ // Max Cfl predictor size is 32x32.
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflSubsampler
+#ifndef __aarch64__
+uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
+ return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
+ vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
+}
+#endif
+
+// This duplicates the last two 16-bit values in |row|.
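+// e.g. row = {a, b, c, d, e, f, g, h} -> {g, h, g, h, g, h, g, h}.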
+inline uint16x8_t LastRowSamples(const uint16x8_t row) {
+ const uint32x2_t a = vget_high_u32(vreinterpretq_u32_u16(row));
+ const uint32x4_t b = vdupq_lane_u32(a, 1);
+ return vreinterpretq_u16_u32(b);
+}
+
+// This duplicates the last unsigned 16-bit value in |row|.
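+// e.g. row = {a, b, c, d, e, f, g, h} -> {h, h, h, h, h, h, h, h}.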
+inline uint16x8_t LastRowResult(const uint16x8_t row) {
+ const uint16x4_t a = vget_high_u16(row);
+ const uint16x8_t b = vdupq_lane_u16(a, 0x3);
+ return b;
+}
+
+// This duplicates the last signed 16-bit value in |row|.
+inline int16x8_t LastRowResult(const int16x8_t row) {
+ const int16x4_t a = vget_high_s16(row);
+ const int16x8_t b = vdupq_lane_s16(a, 0x3);
+ return b;
+}
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
+inline uint16x8_t StoreLumaResults4_420(const uint16x8_t vertical_sum0,
+ const uint16x8_t vertical_sum1,
+ int16_t* luma_ptr) {
+ const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1);
+ const uint16x8_t result_shifted = vshlq_n_u16(result, 1);
+ vst1_s16(luma_ptr, vreinterpret_s16_u16(vget_low_u16(result_shifted)));
+ vst1_s16(luma_ptr + kCflLumaBufferStride,
+ vreinterpret_s16_u16(vget_high_u16(result_shifted)));
+ return result_shifted;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
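+// e.g. vertical_sum0 = {a0, a1, a2, ...} pairwise-adds to {a0 + a1, a2 + a3,
+// ...}, the 2x2 box sums, and the shift left by 1 rescales them to carry 3
+// fractional bits.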
+inline uint16x8_t StoreLumaResults8_420(const uint16x8_t vertical_sum0,
+ const uint16x8_t vertical_sum1,
+ int16_t* luma_ptr) {
+ const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1);
+ const uint16x8_t result_shifted = vshlq_n_u16(result, 1);
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(result_shifted));
+ return result_shifted;
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint16x4_t sum = vdup_n_u16(0);
+ uint16x4_t samples[2];
+ int y = visible_height;
+
+ do {
+ samples[0] = vld1_u16(src);
+ samples[1] = vld1_u16(src + src_stride);
+ src += src_stride << 1;
+ sum = vadd_u16(sum, samples[0]);
+ sum = vadd_u16(sum, samples[1]);
+ y -= 2;
+ } while (y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ samples[1] = vshl_n_u16(samples[1], 1);
+ do {
+ sum = vadd_u16(sum, samples[1]);
+ y += 2;
+ } while (y < block_height);
+ }
+
+ // The luma values stored below are shifted left by 3 (to increase
+ // precision), so the average must be scaled up to match: the 3 cancels
+ // against log2(4 * height) = 2 + block_height_log2 divisor bits, leaving
+ // a right shift of block_height_log2 - 1.
+ const uint32_t average_sum =
+ RightShiftWithRounding(SumVector(vpaddl_u16(sum)), block_height_log2 - 1);
+ const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x4_t ssample;
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ ssample = vld1_s16(ssrc);
+ ssample = vshl_n_s16(ssample, 3);
+ vst1_s16(luma_ptr, vsub_s16(ssample, averages));
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ vst1_s16(luma_ptr, vsub_s16(ssample, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 4, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+
+ if (block_height <= max_luma_height) {
+ CflSubsampler444_4xH_NEON<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_4xH_NEON<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint32x4_t sum = vdupq_n_u32(0);
+ uint16x8_t samples;
+ int y = visible_height;
+
+ do {
+ samples = vld1q_u16(src);
+ src += src_stride;
+ sum = vpadalq_u16(sum, samples);
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ do {
+ sum = vpadalq_u16(sum, samples);
+ } while (++y < block_height);
+ }
+
+ // The luma values stored below are shifted left by 3 (to increase
+ // precision); the 3 cancels against the log2 of width 8 in the divisor,
+ // leaving a right shift of block_height_log2.
+ const uint32_t average_sum =
+ RightShiftWithRounding(SumVector(sum), block_height_log2);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x8_t ssample;
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ ssample = vld1q_s16(ssrc);
+ ssample = vshlq_n_s16(ssample, 3);
+ vst1q_s16(luma_ptr, vsubq_s16(ssample, averages));
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ vst1q_s16(luma_ptr, vsubq_s16(ssample, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 8;
+
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_8xH_NEON<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_8xH_NEON<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, bool is_inside>
+void CflSubsampler444_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const int block_width = 1 << block_width_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint32x4_t sum = vdupq_n_u32(0);
+ uint16x8_t samples[4];
+ int y = visible_height;
+
+ do {
+ samples[0] = vld1q_u16(src);
+ samples[1] =
+ (max_luma_width >= 16) ? vld1q_u16(src + 8) : LastRowResult(samples[0]);
+ uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]);
+ if (block_width == 32) {
+ samples[2] = (max_luma_width >= 24) ? vld1q_u16(src + 16)
+ : LastRowResult(samples[1]);
+ samples[3] = (max_luma_width == 32) ? vld1q_u16(src + 24)
+ : LastRowResult(samples[2]);
+ inner_sum = vaddq_u16(samples[2], inner_sum);
+ inner_sum = vaddq_u16(samples[3], inner_sum);
+ }
+ sum = vpadalq_u16(sum, inner_sum);
+ src += src_stride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]);
+ if (block_width == 32) {
+ inner_sum = vaddq_u16(samples[2], inner_sum);
+ inner_sum = vaddq_u16(samples[3], inner_sum);
+ }
+ do {
+ sum = vpadalq_u16(sum, inner_sum);
+ } while (++y < block_height);
+ }
+
+ // The luma values stored below are shifted left by 3 (to increase
+ // precision), so 3 is subtracted from the right shift factor:
+ // block_width_log2 + block_height_log2 - 3.
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(sum), block_width_log2 + block_height_log2 - 3);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x8_t ssamples_ext = vdupq_n_s16(0);
+ int16x8_t ssamples[4];
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ if (max_luma_width > x) {
+ ssamples[idx] = vld1q_s16(&ssrc[x]);
+ ssamples[idx] = vshlq_n_s16(ssamples[idx], 3);
+ ssamples_ext = ssamples[idx];
+ } else {
+ ssamples[idx] = LastRowResult(ssamples_ext);
+ }
+ vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages));
+ }
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages));
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5,
+ "This function will only work for block_width 16 and 32.");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+
+ const int block_height = 1 << block_height_log2;
+ const int vert_inside = block_height <= max_luma_height;
+ if (vert_inside) {
+ CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16x8_t samples_row0 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row1 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum01 = vaddq_u16(samples_row0, samples_row1);
+
+ const uint16x8_t samples_row2 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row3 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum23 = vaddq_u16(samples_row2, samples_row3);
+ uint16x8_t sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ const uint16x8_t samples_row4 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row5 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum45 = vaddq_u16(samples_row4, samples_row5);
+
+ const uint16x8_t samples_row6 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row7 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum67 = vaddq_u16(samples_row6, samples_row7);
+ sum =
+ vaddq_u16(sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ final_sum = vpadalq_u16(final_sum, sum);
+ y -= 4;
+ } while (y != 0);
+
+ const uint16x4_t final_fill =
+ vreinterpret_u16_s16(vld1_s16(luma_ptr - kCflLumaBufferStride));
+ const uint32x4_t final_fill_to_sum = vmovl_u16(final_fill);
+ for (y = luma_height; y < block_height; ++y) {
+ vst1_s16(luma_ptr, vreinterpret_s16_u16(final_fill));
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ }
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_height_log2 + 2 /*log2 of width 4*/);
+ const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum));
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x4_t samples = vld1_s16(luma_ptr);
+ vst1_s16(luma_ptr, vsub_s16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16x8_t samples_row00 = vld1q_u16(src);
+ const uint16x8_t samples_row01 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row00);
+ src += src_stride;
+ const uint16x8_t samples_row10 = vld1q_u16(src);
+ const uint16x8_t samples_row11 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row10);
+ src += src_stride;
+ const uint16x8_t luma_sum00 = vaddq_u16(samples_row00, samples_row10);
+ const uint16x8_t luma_sum01 = vaddq_u16(samples_row01, samples_row11);
+ uint16x8_t sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row20 = vld1q_u16(src);
+ const uint16x8_t samples_row21 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row20);
+ src += src_stride;
+ const uint16x8_t samples_row30 = vld1q_u16(src);
+ const uint16x8_t samples_row31 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row30);
+ src += src_stride;
+ const uint16x8_t luma_sum10 = vaddq_u16(samples_row20, samples_row30);
+ const uint16x8_t luma_sum11 = vaddq_u16(samples_row21, samples_row31);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row40 = vld1q_u16(src);
+ const uint16x8_t samples_row41 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row40);
+ src += src_stride;
+ const uint16x8_t samples_row50 = vld1q_u16(src);
+ const uint16x8_t samples_row51 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row50);
+ src += src_stride;
+ const uint16x8_t luma_sum20 = vaddq_u16(samples_row40, samples_row50);
+ const uint16x8_t luma_sum21 = vaddq_u16(samples_row41, samples_row51);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row60 = vld1q_u16(src);
+ const uint16x8_t samples_row61 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row60);
+ src += src_stride;
+ const uint16x8_t samples_row70 = vld1q_u16(src);
+ const uint16x8_t samples_row71 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row70);
+ src += src_stride;
+ const uint16x8_t luma_sum30 = vaddq_u16(samples_row60, samples_row70);
+ const uint16x8_t luma_sum31 = vaddq_u16(samples_row61, samples_row71);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = vpadalq_u16(final_sum, sum);
+ y -= 4;
+ } while (y != 0);
+
+ // Duplicate the final row downward to fill the remaining rows.
+ const uint16x8_t final_fill =
+ vreinterpretq_u16_s16(vld1q_s16(luma_ptr - kCflLumaBufferStride));
+ const uint32x4_t final_fill_to_sum =
+ vaddl_u16(vget_low_u16(final_fill), vget_high_u16(final_fill));
+
+ for (y = luma_height; y < block_height; ++y) {
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill));
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ }
+
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_height_log2 + 3 /*log2 of width 8*/);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x8_t samples = vld1q_s16(luma_ptr);
+ vst1q_s16(luma_ptr, vsubq_s16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ if (max_luma_width == 8) {
+ CflSubsampler420Impl_8xH_NEON<block_height_log2, 8>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler420Impl_8xH_NEON<block_height_log2, 16>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ const int block_height = 1 << block_height_log2;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int16_t* luma_ptr = luma[0];
+ // Begin first y section, covering width up to 32.
+ int y = luma_height;
+
+ uint16x8_t final_fill0, final_fill1;
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16_t* src_next = src + src_stride;
+ const uint16x8_t samples_row00 = vld1q_u16(src);
+ const uint16x8_t samples_row01 = (max_luma_width >= 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row00);
+ const uint16x8_t samples_row02 = (max_luma_width >= 24)
+ ? vld1q_u16(src + 16)
+ : LastRowSamples(samples_row01);
+ const uint16x8_t samples_row03 = (max_luma_width == 32)
+ ? vld1q_u16(src + 24)
+ : LastRowSamples(samples_row02);
+ const uint16x8_t samples_row10 = vld1q_u16(src_next);
+ const uint16x8_t samples_row11 = (max_luma_width >= 16)
+ ? vld1q_u16(src_next + 8)
+ : LastRowSamples(samples_row10);
+ const uint16x8_t samples_row12 = (max_luma_width >= 24)
+ ? vld1q_u16(src_next + 16)
+ : LastRowSamples(samples_row11);
+ const uint16x8_t samples_row13 = (max_luma_width == 32)
+ ? vld1q_u16(src_next + 24)
+ : LastRowSamples(samples_row12);
+ const uint16x8_t luma_sum0 = vaddq_u16(samples_row00, samples_row10);
+ const uint16x8_t luma_sum1 = vaddq_u16(samples_row01, samples_row11);
+ const uint16x8_t luma_sum2 = vaddq_u16(samples_row02, samples_row12);
+ const uint16x8_t luma_sum3 = vaddq_u16(samples_row03, samples_row13);
+ final_fill0 = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+ final_fill1 = StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+ const uint16x8_t sum = vaddq_u16(final_fill0, final_fill1);
+
+ final_sum = vpadalq_u16(final_sum, sum);
+
+ // Because max_luma_width is at most 32, subsampled outputs beyond x = 16
+ // are necessarily duplicates of the last computed value.
+ if (block_width_log2 == 5) {
+ const uint16x8_t wide_fill = LastRowResult(final_fill1);
+ final_sum = vpadalq_u16(final_sum, vshlq_n_u16(wide_fill, 1));
+ }
+ src += src_stride << 1;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ // Begin second y section.
+ y = luma_height;
+ if (y < block_height) {
+ uint32x4_t wide_fill;
+ if (block_width_log2 == 5) {
+ // Columns 16..31 replicate the last result |a|, so each fill row adds
+ // 16 * |a| to the sum. Widening the four low lanes with a shift left by 2
+ // stores |a| << 2 per 32-bit lane, which sums to 16 * |a|; this matches
+ // the first section's pairwise-accumulated (|a| << 1) values.
+ wide_fill = vshll_n_u16(vget_low_u16(LastRowResult(final_fill1)), 2);
+ }
+ const uint16x8_t final_inner_sum = vaddq_u16(final_fill0, final_fill1);
+ const uint32x4_t final_fill_to_sum = vaddl_u16(
+ vget_low_u16(final_inner_sum), vget_high_u16(final_inner_sum));
+
+ do {
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill0));
+ vst1q_s16(luma_ptr + 8, vreinterpretq_s16_u16(final_fill1));
+ if (block_width_log2 == 5) {
+ final_sum = vaddq_u32(final_sum, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ } while (++y < block_height);
+ } // End second y section.
+
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_width_log2 + block_height_log2);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x8_t samples0 = vld1q_s16(luma_ptr);
+ vst1q_s16(luma_ptr, vsubq_s16(samples0, averages));
+ const int16x8_t samples1 = vld1q_s16(luma_ptr + 8);
+ const int16x8_t final_row_result = vsubq_s16(samples1, averages);
+ vst1q_s16(luma_ptr + 8, final_row_result);
+
+ if (block_width_log2 == 5) {
+ const int16x8_t wide_fill = LastRowResult(final_row_result);
+ vst1q_s16(luma_ptr + 16, wide_fill);
+ vst1q_s16(luma_ptr + 24, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+//------------------------------------------------------------------------------
+// Choose subsampler based on max_luma_width
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ switch (max_luma_width) {
+ case 8:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 8>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 16:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 16>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 24:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 24>(
+ luma, max_luma_height, source, stride);
+ return;
+ default:
+ assert(max_luma_width == 32);
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 32>(
+ luma, max_luma_height, source, stride);
+ return;
+ }
+}
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
+// Clip |dc + ((alpha * luma) >> 6)| to [0, (1 << bitdepth) - 1].
+inline uint16x8_t Combine8(const int16x8_t luma, const int16x8_t alpha_abs,
+ const int16x8_t alpha_signed, const int16x8_t dc,
+ const uint16x8_t max_value) {
+ const int16x8_t luma_abs = vabsq_s16(luma);
+ const int16x8_t luma_alpha_sign =
+ vshrq_n_s16(veorq_s16(luma, alpha_signed), 15);
+ // (alpha * luma) >> 6
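+ // vqrdmulhq_s16(a, b) computes (2 * a * b + (1 << 15)) >> 16 (saturating);
+ // with |alpha_abs| pre-shifted left by 9 this equals
+ // (luma_abs * alpha + (1 << 5)) >> 6, the rounded shift on absolute values.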
+ const int16x8_t la_abs = vqrdmulhq_s16(luma_abs, alpha_abs);
+ // Convert back to signed values.
+ const int16x8_t la =
+ vsubq_s16(veorq_s16(la_abs, luma_alpha_sign), luma_alpha_sign);
+ const int16x8_t result = vaddq_s16(la, dc);
+ const int16x8_t zero = vdupq_n_s16(0);
+ // Clip.
+ return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(result, zero)), max_value);
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor4xN_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; y += 2) {
+ const int16x4_t luma_row0 = vld1_s16(luma[y]);
+ const int16x4_t luma_row1 = vld1_s16(luma[y + 1]);
+ const int16x8_t combined_luma = vcombine_s16(luma_row0, luma_row1);
+ const uint16x8_t sum =
+ Combine8(combined_luma, alpha_abs, alpha_signed, dc, max_value);
+ vst1_u16(dst, vget_low_u16(sum));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(sum));
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor8xN_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row = vld1q_s16(luma[y]);
+ const uint16x8_t sum =
+ Combine8(luma_row, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum);
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor16xN_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const uint16x8_t sum_0 =
+ Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_1 =
+ Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum_0);
+ vst1q_u16(dst + 8, sum_1);
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor32xN_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16);
+ const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24);
+ const uint16x8_t sum_0 =
+ Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_1 =
+ Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_2 =
+ Combine8(luma_row_2, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_3 =
+ Combine8(luma_row_3, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum_0);
+ vst1q_u16(dst + 8, sum_1);
+ vst1q_u16(dst + 16, sum_2);
+ vst1q_u16(dst + 24, sum_3);
+ dst += dst_stride;
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<4>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<4>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<5>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 2>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 3>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 3>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 4>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<4>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<4>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<5>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 2>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 3>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 3>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 4>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 5>;
+
+ dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>;
+
+ dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>;
+
+ dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor16xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor16xN_NEON<32>;
+ dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor32xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor32xN_NEON<32>;
+ // Max Cfl predictor size is 32x32.
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredCflInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredCflInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers; see the
+// defines below for specifics. These functions are not thread-safe.
+void IntraPredCflInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+// 4x4
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// -----------------------------------------------------------------------------
+// 10bpp
+
+// 4x4
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Blend two values based on weights that sum to 32.
+inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
+ const uint8x8_t a_weight,
+ const uint8x8_t b_weight) {
+ const uint16x8_t a_product = vmull_u8(a, a_weight);
+ const uint16x8_t sum = vmlal_u8(a_product, b, b_weight);
+
+ return vrshrn_n_u16(sum, 5 /*log2(32)*/);
+}
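+// Illustrative arithmetic (values not from the original source): with a = 3,
+// b = 5, a_weight = 20 and b_weight = 12, |sum| is 3 * 20 + 5 * 12 = 120 and
+// vrshrn_n_u16() returns (120 + 16) >> 5 = 4, i.e. 3.75 rounded to nearest.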
+
+// For vertical operations the weights are one constant value.
+inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
+ const uint8_t weight) {
+ return WeightedBlend(a, b, vdup_n_u8(32 - weight), vdup_n_u8(weight));
+}
+
+// Fill |left| and |right| with the appropriate values for a given |base_step|.
+inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
+ const uint8x8_t left_step, const uint8x8_t right_step,
+ uint8x8_t* left, uint8x8_t* right) {
+ const uint8x16_t mixed = vld1q_u8(source);
+ *left = VQTbl1U8(mixed, left_step);
+ *right = VQTbl1U8(mixed, right_step);
+}
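+// Illustrative example: with an upsampled source laid out as
+// {p0, q0, p1, q1, ...}, left_step = {0, 2, 4, ...} gathers {p0, p1, ...} and
+// right_step = {1, 3, 5, ...} gathers the interleaved {q0, q1, ...} used as
+// the blend partner.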
+
+// Handle signed step arguments by ignoring the sign. Negative values are
+// considered out of range and overwritten later.
+inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
+ const int8x8_t left_step, const int8x8_t right_step,
+ uint8x8_t* left, uint8x8_t* right) {
+ LoadStepwise(source, vreinterpret_u8_s8(left_step),
+ vreinterpret_u8_s8(right_step), left, right);
+}
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const top,
+ const int xstep, const bool upsampled) {
+ assert(width == 4 || width == 8);
+
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+
+ const int max_base_x = (width + height - 1) << upsample_shift;
+ const int8x8_t max_base = vdup_n_s8(max_base_x);
+ const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+
+ const int8x8_t all = vcreate_s8(0x0706050403020100);
+ const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
+ const int8x8_t base_step = upsampled ? even : all;
+ const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
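+  // vcreate_s8() packs its u64 argument little-endian, so |all| holds lanes
+  // {0, 1, ..., 7} and |even| holds {0, 2, ..., 14}; |even| picks every other
+  // byte of a 2x upsampled edge.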
+
+ int top_x = xstep;
+ int y = 0;
+ do {
+ const int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+        memset(dst, top[max_base_x], width);
+ dst += stride;
+ }
+ return;
+ }
+
+ const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
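+    // |top_x| is a position with 6 fractional bits. Illustrative numbers:
+    // top_x = 150 with upsample_shift = 0 gives top_base_x = 2, fraction
+    // 150 & 0x3F = 22, and shift = 11, so the blend weights are 21 and 11.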
+
+ // Zone2 uses negative values for xstep. Use signed values to compare
+ // |top_base_x| to |max_base_x|.
+ const int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);
+
+ const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);
+
+ // 4 wide subsamples the output. 8 wide subsamples the input.
+ if (width == 4) {
+ const uint8x8_t left_values = vld1_u8(top + top_base_x);
+ const uint8x8_t right_values = RightShiftVector<8>(left_values);
+ const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+
+ // If |upsampled| is true then extract every other value for output.
+ const uint8x8_t value_stepped =
+ vtbl1_u8(value, vreinterpret_u8_s8(base_step));
+ const uint8x8_t masked_value =
+ vbsl_u8(max_base_mask, value_stepped, top_max_base);
+
+ StoreLo4(dst, masked_value);
+ } else /* width == 8 */ {
+ uint8x8_t left_values, right_values;
+ // WeightedBlend() steps up to Q registers. Downsample the input to avoid
+ // doing extra calculations.
+ LoadStepwise(top + top_base_x, base_step, right_step, &left_values,
+ &right_values);
+
+ const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+ const uint8x8_t masked_value =
+ vbsl_u8(max_base_mask, value, top_max_base);
+
+ vst1_u8(dst, masked_value);
+ }
+ dst += stride;
+ top_x += xstep;
+ } while (++y < height);
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride, const int width,
+ const int height,
+ const uint8_t* LIBGAV1_RESTRICT const top,
+ const int xstep, const bool upsampled) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+
+ const int max_base_x = (width + height - 1) << upsample_shift;
+ const int8x8_t max_base = vdup_n_s8(max_base_x);
+ const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+
+ const int8x8_t all = vcreate_s8(0x0706050403020100);
+ const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
+ const int8x8_t base_step = upsampled ? even : all;
+ const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
+ const int8x8_t block_step = vdup_n_s8(8 << upsample_shift);
+
+ int top_x = xstep;
+ int y = 0;
+ do {
+ const int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+        memset(dst, top[max_base_x], width);
+ dst += stride;
+ }
+ return;
+ }
+
+ const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+
+ // Zone2 uses negative values for xstep. Use signed values to compare
+ // |top_base_x| to |max_base_x|.
+ int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);
+
+ int x = 0;
+ do {
+ const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);
+
+ // Extract the input values based on |upsampled| here to avoid doing twice
+ // as many calculations.
+ uint8x8_t left_values, right_values;
+ LoadStepwise(top + top_base_x + x, base_step, right_step, &left_values,
+ &right_values);
+
+ const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+ const uint8x8_t masked_value =
+ vbsl_u8(max_base_mask, value, top_max_base);
+
+ vst1_u8(dst + x, masked_value);
+
+ base_v = vadd_s8(base_v, block_step);
+ x += 8;
+ } while (x < width);
+ top_x += xstep;
+ dst += stride;
+ } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row, const int width,
+ const int height, const int xstep, const bool upsampled_top) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ assert(xstep > 0);
+
+ const int upsample_shift = static_cast<int>(upsampled_top);
+
+ const uint8x8_t all = vcreate_u8(0x0706050403020100);
+
+ if (xstep == 64) {
+ assert(!upsampled_top);
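+    // An xstep of 64 is exactly one pixel per row in this fixed-point scheme,
+    // so each output row is the top row advanced by one more pixel and plain
+    // copies suffice.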
+ const uint8_t* top_ptr = top + 1;
+ int y = 0;
+ do {
+ memcpy(dst, top_ptr, width);
+ memcpy(dst + stride, top_ptr + 1, width);
+ memcpy(dst + 2 * stride, top_ptr + 2, width);
+ memcpy(dst + 3 * stride, top_ptr + 3, width);
+ dst += 4 * stride;
+ top_ptr += 4;
+ y += 4;
+ } while (y < height);
+ } else if (width == 4) {
+ DirectionalZone1_WxH<4>(dst, stride, height, top, xstep, upsampled_top);
+ } else if (xstep > 51) {
+ // 7.11.2.10. Intra edge upsample selection process
+ // if ( d <= 0 || d >= 40 ) useUpsample = 0
+ // For |upsample_top| the delta is from vertical so |prediction_angle - 90|.
+ // In |kDirectionalIntraPredictorDerivative[]| angles less than 51 will meet
+    // this criterion. The |xstep| value for angle 51 happens to be 51 as well.
+ // Shallower angles have greater xstep values.
+ assert(!upsampled_top);
+ const int max_base_x = ((width + height) - 1);
+ const uint8x8_t max_base = vdup_n_u8(max_base_x);
+ const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+ const uint8x8_t block_step = vdup_n_u8(8);
+
+ int top_x = xstep;
+ int y = 0;
+ do {
+ const int top_base_x = top_x >> 6;
+ const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+ uint8x8_t base_v = vadd_u8(vdup_n_u8(top_base_x), all);
+ int x = 0;
+ // Only calculate a block of 8 when at least one of the output values is
+ // within range. Otherwise it can read off the end of |top|.
+ const int must_calculate_width =
+ std::min(width, max_base_x - top_base_x + 7) & ~7;
+ for (; x < must_calculate_width; x += 8) {
+ const uint8x8_t max_base_mask = vclt_u8(base_v, max_base);
+
+        // Since these |xstep| values cannot be upsampled, the load is
+        // simplified.
+ const uint8x8_t left_values = vld1_u8(top + top_base_x + x);
+ const uint8x8_t right_values = vld1_u8(top + top_base_x + x + 1);
+ const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+ const uint8x8_t masked_value =
+ vbsl_u8(max_base_mask, value, top_max_base);
+
+ vst1_u8(dst + x, masked_value);
+ base_v = vadd_u8(base_v, block_step);
+ }
+ memset(dst + x, top[max_base_x], width - x);
+ dst += stride;
+ top_x += xstep;
+ } while (++y < height);
+ } else {
+ DirectionalZone1_WxH(dst, stride, width, height, top, xstep, upsampled_top);
+ }
+}
+
+// Process 4 or 8 |width| by 4 or 8 |height|.
+template <int width>
+inline void DirectionalZone3_WxH(
+ uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int base_left_y,
+ const int ystep, const int upsample_shift) {
+ assert(width == 4 || width == 8);
+ assert(height == 4 || height == 8);
+ const int scale_bits = 6 - upsample_shift;
+
+ // Zone3 never runs out of left_column values.
+ assert((width + height - 1) << upsample_shift > // max_base_y
+ ((ystep * width) >> scale_bits) +
+ (/* base_step */ 1 << upsample_shift) *
+ (height - 1)); // left_base_y
+
+ // Limited improvement for 8x8. ~20% faster for 64x64.
+ const uint8x8_t all = vcreate_u8(0x0706050403020100);
+ const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+ const uint8x8_t base_step = upsample_shift ? even : all;
+ const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));
+
+ uint8_t* dst = dest;
+ uint8x8_t left_v[8], right_v[8], value_v[8];
+ const uint8_t* const left = left_column;
+
+ const int index_0 = base_left_y;
+ LoadStepwise(left + (index_0 >> scale_bits), base_step, right_step,
+ &left_v[0], &right_v[0]);
+ value_v[0] = WeightedBlend(left_v[0], right_v[0],
+ ((index_0 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_1 = base_left_y + ystep;
+ LoadStepwise(left + (index_1 >> scale_bits), base_step, right_step,
+ &left_v[1], &right_v[1]);
+ value_v[1] = WeightedBlend(left_v[1], right_v[1],
+ ((index_1 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_2 = base_left_y + ystep * 2;
+ LoadStepwise(left + (index_2 >> scale_bits), base_step, right_step,
+ &left_v[2], &right_v[2]);
+ value_v[2] = WeightedBlend(left_v[2], right_v[2],
+ ((index_2 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_3 = base_left_y + ystep * 3;
+ LoadStepwise(left + (index_3 >> scale_bits), base_step, right_step,
+ &left_v[3], &right_v[3]);
+ value_v[3] = WeightedBlend(left_v[3], right_v[3],
+ ((index_3 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_4 = base_left_y + ystep * 4;
+ LoadStepwise(left + (index_4 >> scale_bits), base_step, right_step,
+ &left_v[4], &right_v[4]);
+ value_v[4] = WeightedBlend(left_v[4], right_v[4],
+ ((index_4 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_5 = base_left_y + ystep * 5;
+ LoadStepwise(left + (index_5 >> scale_bits), base_step, right_step,
+ &left_v[5], &right_v[5]);
+ value_v[5] = WeightedBlend(left_v[5], right_v[5],
+ ((index_5 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_6 = base_left_y + ystep * 6;
+ LoadStepwise(left + (index_6 >> scale_bits), base_step, right_step,
+ &left_v[6], &right_v[6]);
+ value_v[6] = WeightedBlend(left_v[6], right_v[6],
+ ((index_6 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_7 = base_left_y + ystep * 7;
+ LoadStepwise(left + (index_7 >> scale_bits), base_step, right_step,
+ &left_v[7], &right_v[7]);
+ value_v[7] = WeightedBlend(left_v[7], right_v[7],
+ ((index_7 << upsample_shift) & 0x3F) >> 1);
+
+ // 8x8 transpose.
+ const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(value_v[0], value_v[4]),
+ vcombine_u8(value_v[1], value_v[5]));
+ const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(value_v[2], value_v[6]),
+ vcombine_u8(value_v[3], value_v[7]));
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ if (width == 4) {
+ StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
+ if (height == 4) return;
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
+ } else {
+ vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
+ if (height == 4) return;
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
+ dst += stride;
+ vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
+ }
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
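+// Illustrative example: ystep = 64 makes the derived |offset_y| lanes
+// {0, -1, ..., -7}; adding kPositiveIndexOffset maps them to byte indices
+// {15, 14, ..., 8} within a single 16-byte load taken 15 bytes below the
+// anchor.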
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone2FromLeftCol_WxH(
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
+ const int upsample_shift) {
+ assert(width == 4 || width == 8);
+
+ // The shift argument must be a constant.
+ int16x8_t offset_y, shift_upsampled = left_y;
+ if (upsample_shift) {
+ offset_y = vshrq_n_s16(left_y, 5);
+ shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
+ } else {
+ offset_y = vshrq_n_s16(left_y, 6);
+ }
+
+ // Select values to the left of the starting point.
+ // The 15th element (and 16th) will be all the way at the end, to the right.
+ // With a negative ystep everything else will be "left" of them.
+ // This supports cumulative steps up to 15. We could support up to 16 by doing
+ // separate loads for |left_values| and |right_values|. vtbl supports 2 Q
+ // registers as input which would allow for cumulative offsets of 32.
+ const int16x8_t sampler =
+ vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffset));
+ const uint8x8_t left_values = vqmovun_s16(sampler);
+ const uint8x8_t right_values = vadd_u8(left_values, vdup_n_u8(1));
+
+ const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
+ const uint8x8_t shift_mul = vreinterpret_u8_s8(vshrn_n_s16(shift_masked, 1));
+ const uint8x8_t inv_shift_mul = vsub_u8(vdup_n_u8(32), shift_mul);
+
+ int y = 0;
+ do {
+ uint8x8_t src_left, src_right;
+ LoadStepwise(left_column - kPositiveIndexOffset + (y << upsample_shift),
+ left_values, right_values, &src_left, &src_right);
+ const uint8x8_t val =
+ WeightedBlend(src_left, src_right, inv_shift_mul, shift_mul);
+
+ if (width == 4) {
+ StoreLo4(dst, val);
+ } else {
+ vst1_u8(dst, val);
+ }
+ dst += stride;
+ } while (++y < height);
+}
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone1Blend_WxH(
+ uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+ const int xstep, const int upsample_shift) {
+ assert(width == 4 || width == 8);
+
+ const int scale_bits_x = 6 - upsample_shift;
+
+ const uint8x8_t all = vcreate_u8(0x0706050403020100);
+ const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+ const uint8x8_t base_step = upsample_shift ? even : all;
+ const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));
+
+ int y = 0;
+ do {
+ const uint8_t* const src = top_row + (top_x >> scale_bits_x);
+ uint8x8_t left, right;
+ LoadStepwise(src, base_step, right_step, &left, &right);
+
+ const uint8_t shift = ((top_x << upsample_shift) & 0x3f) >> 1;
+ const uint8x8_t val = WeightedBlend(left, right, shift);
+
+ uint8x8_t dst_blend = vld1_u8(dest);
+ // |zone_bounds| values can be negative.
+ uint8x8_t blend =
+ vcge_s8(vreinterpret_s8_u8(all), vdup_n_s8((zone_bounds >> 6)));
+ uint8x8_t output = vbsl_u8(blend, val, dst_blend);
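+    // Lanes whose index is at or beyond zone_bounds / 64 take the top-derived
+    // |val|; lanes left of the boundary keep the value already written from
+    // |left_column|.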
+
+ if (width == 4) {
+ StoreLo4(dest, output);
+ } else {
+ vst1_u8(dest, output);
+ }
+ dest += stride;
+ zone_bounds += xstep;
+ top_x -= xstep;
+ } while (++y < height);
+}
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for these functions (4xH and 8+xH) is to know how many blocks
+// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+// then handle only blocks that take from |left_ptr|. Additionally, a fast
+// index-shuffle approach is used for pred values from |left_column| in
+// sections that permit it.
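+// Schematically, in terms of the row bounds computed below:
+//   [0, max_top_only_y)               -> Zone1: top_row only
+//   [max_top_only_y, min_left_only_y) -> mixed: left-column pass, then a
+//                                        Zone1 blend along the zone boundary
+//   [min_left_only_y, height)         -> Zone3: left_column only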
+inline void DirectionalZone2_4xH(
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+ const uint8_t* LIBGAV1_RESTRICT const top_row,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep, const bool upsampled_top,
+ const bool upsampled_left) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+ // Helper vector.
+ const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  // Loop incrementers for moving by block (4xN). The vertical loop still
+  // steps by 8; if the height is only 4, it finishes in the first iteration.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+
+ const int min_height = (height == 4) ? 4 : 8;
+
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute and can therefore call the Zone1 functions. This assumes |xstep| is
+ // at least 3.
+ assert(xstep >= 3);
+ const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4);
+
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
+ int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+ // offset. The following values need the full ystep as a relative offset.
+ const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
+ const int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);
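+  // vmlaq_n_s16() computes -ystep_remainder - i * ystep in lane i. With
+  // illustrative numbers, ystep = 100 gives ystep_remainder = 36 and
+  // left_y = {-36, -136, -236, ...}: per-column offsets into |left_column|,
+  // scaled so that 64 equals one pixel.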
+
+ // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ if (min_top_only_x > 0) {
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min((1 << 6) / xstep, height) & ~(min_height - 1);
+ DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep,
+ upsampled_top);
+
+ if (max_top_only_y == height) return;
+
+ int y = max_top_only_y;
+ dst += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute.
+ const int min_left_only_y = std::min((4 << 6) / xstep, height);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ // +8 increment is OK because if height is 4 this only goes once.
+ for (; y < min_left_only_y;
+ y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ DirectionalZone2FromLeftCol_WxH<4>(
+ dst, stride, min_height,
+ left_column + ((y - left_base_increment) << upsample_left_shift),
+ left_y, upsample_left_shift);
+
+ DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
+ xstep_bounds, top_x, xstep,
+ upsample_top_shift);
+ }
+
+ // Loop over y for left_only rows.
+ const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+ for (; y < height; y += 8, dst += stride8) {
+ DirectionalZone3_WxH<4>(
+ dst, stride, min_height,
+ left_column + ((y - left_base_increment) << upsample_left_shift),
+ base_left_y, -ystep, upsample_left_shift);
+ }
+ } else {
+ DirectionalZone1_WxH<4>(dst, stride, height, top_row, -xstep,
+ upsampled_top);
+ }
+}
+
+template <bool shuffle_left_column>
+inline void DirectionalZone2_8xH(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint8_t* LIBGAV1_RESTRICT const top_row,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep, const int x, const int left_offset,
+ const int xstep_bounds_base, const int16x8_t left_y,
+ const bool upsampled_top, const bool upsampled_left) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Loop incrementers for moving by block (8x8). This function also handles
+  // blocks of height 4; those are computed in one pass, so these variables go
+  // unused in that case.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+
+ // Cover 8x4 case.
+ const int min_height = (height == 4) ? 4 : 8;
+
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ uint8_t* dst_x = dst + x;
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min((1 << 6) / xstep, height) & ~(min_height - 1);
+ DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
+ top_row + (x << upsample_top_shift), -xstep,
+ upsampled_top);
+
+ if (max_top_only_y == height) return;
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute. Round up to the nearest 8.
+ const int min_left_only_y =
+ Align(std::min(((x + 8) << 6) / xstep, height), 8);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ if (shuffle_left_column) {
+ DirectionalZone2FromLeftCol_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y,
+ upsample_left_shift);
+ } else {
+ DirectionalZone3_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep, upsample_left_shift);
+ }
+
+ DirectionalZone1Blend_WxH<8>(
+ dst_x, stride, min_height, top_row + (x << upsample_top_shift),
+ xstep_bounds, top_x, xstep, upsample_top_shift);
+ }
+
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep, upsample_left_shift);
+ }
+}
+
+// Process a multiple of 8 |width|.
+inline void DirectionalZone2_WxH(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint8_t* LIBGAV1_RESTRICT const top_row,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int xstep, const int ystep,
+ const bool upsampled_top, const bool upsampled_left) {
+ const int ystep8 = ystep << 3;
+
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
+ int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+
+ const int left_base_increment8 = ystep8 >> 6;
+ const int ystep_remainder8 = ystep8 & 0x3F;
+ const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+ // offset. Following values need the full ystep as a relative offset.
+ const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
+ const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+ int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);
+
+ // For ystep > 90, at least two sets of 8 columns can be fully computed from
+ // top_row only.
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+ // Analysis finds that, for most angles (ystep < 132), all segments that use
+ // both top_row and left_column can compute from left_column using byte
+ // shuffles from a single vector. For steeper angles, the shuffle is also
+ // fully reliable when x >= 32.
+ const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+ const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
+
+  // This loop treats each set of 8 columns in 3 stages with y-value boundaries.
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ int x = 0;
+ for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
+ xstep_bounds_base -= (8 << 6),
+ left_y = vsubq_s16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<false>(dst, stride, top_row, left_column, height,
+ xstep, ystep, x, left_offset, xstep_bounds_base,
+ left_y, upsampled_top, upsampled_left);
+ }
+ for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
+ xstep_bounds_base -= (8 << 6),
+ left_y = vsubq_s16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<true>(dst, stride, top_row, left_column, height, xstep,
+ ystep, x, left_offset, xstep_bounds_base, left_y,
+ upsampled_top, upsampled_left);
+ }
+ if (x < width) {
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ DirectionalZone1_WxH(dst + x, stride, width - x, height,
+ top_row + (x << upsample_top_shift), -xstep,
+ upsampled_top);
+ }
+}
+
+void DirectionalIntraPredictorZone2_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int xstep, const int ystep,
+ const bool upsampled_top, const bool upsampled_left) {
+ // Increasing the negative buffer for this function allows more rows to be
+ // processed at a time without branching in an inner loop to check the base.
+ uint8_t top_buffer[288];
+ uint8_t left_buffer[288];
+#if LIBGAV1_MSAN
+ memset(top_buffer, 0, sizeof(top_buffer));
+ memset(left_buffer, 0, sizeof(left_buffer));
+#endif // LIBGAV1_MSAN
+
+ memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+ memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
+ const uint8_t* top_ptr = top_buffer + 144;
+ const uint8_t* left_ptr = left_buffer + 144;
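+  // Offset 144 lines up with element 0 of each source array: the copies start
+  // 16 pixels before the source at buffer offset 128 (128 + 16 = 144), leaving
+  // copied, in-range slack on both sides of the anchor.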
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ if (width == 4) {
+ DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep,
+ upsampled_top, upsampled_left);
+ } else {
+ DirectionalZone2_WxH(dst, stride, top_ptr, left_ptr, width, height, xstep,
+ ystep, upsampled_top, upsampled_left);
+ }
+}
+
+void DirectionalIntraPredictorZone3_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int ystep, const bool upsampled_left) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+
+ assert(ystep > 0);
+
+ const int upsample_shift = static_cast<int>(upsampled_left);
+ const int scale_bits = 6 - upsample_shift;
+ const int base_step = 1 << upsample_shift;
+
+ if (width == 4 || height == 4) {
+ // This block can handle all sizes but the specializations for other sizes
+ // are faster.
+ const uint8x8_t all = vcreate_u8(0x0706050403020100);
+ const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+ const uint8x8_t base_step_v = upsampled_left ? even : all;
+ const uint8x8_t right_step = vadd_u8(base_step_v, vdup_n_u8(1));
+
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ auto* dst = static_cast<uint8_t*>(dest);
+ dst += y * stride + x;
+ uint8x8_t left_v[4], right_v[4], value_v[4];
+ const int ystep_base = ystep * x;
+ const int offset = y * base_step;
+
+ const int index_0 = ystep_base + ystep * 1;
+ LoadStepwise(left + offset + (index_0 >> scale_bits), base_step_v,
+ right_step, &left_v[0], &right_v[0]);
+ value_v[0] = WeightedBlend(left_v[0], right_v[0],
+ ((index_0 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_1 = ystep_base + ystep * 2;
+ LoadStepwise(left + offset + (index_1 >> scale_bits), base_step_v,
+ right_step, &left_v[1], &right_v[1]);
+ value_v[1] = WeightedBlend(left_v[1], right_v[1],
+ ((index_1 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_2 = ystep_base + ystep * 3;
+ LoadStepwise(left + offset + (index_2 >> scale_bits), base_step_v,
+ right_step, &left_v[2], &right_v[2]);
+ value_v[2] = WeightedBlend(left_v[2], right_v[2],
+ ((index_2 << upsample_shift) & 0x3F) >> 1);
+
+ const int index_3 = ystep_base + ystep * 4;
+ LoadStepwise(left + offset + (index_3 >> scale_bits), base_step_v,
+ right_step, &left_v[3], &right_v[3]);
+ value_v[3] = WeightedBlend(left_v[3], right_v[3],
+ ((index_3 << upsample_shift) & 0x3F) >> 1);
+
+ // 8x4 transpose.
+ const uint8x8x2_t b0 = vtrn_u8(value_v[0], value_v[1]);
+ const uint8x8x2_t b1 = vtrn_u8(value_v[2], value_v[3]);
+
+ const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u8(b0.val[0]),
+ vreinterpret_u16_u8(b1.val[0]));
+ const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u8(b0.val[1]),
+ vreinterpret_u16_u8(b1.val[1]));
+
+ StoreLo4(dst, vreinterpret_u8_u16(c0.val[0]));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u16(c1.val[0]));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u16(c0.val[1]));
+ dst += stride;
+ StoreLo4(dst, vreinterpret_u8_u16(c1.val[1]));
+
+ if (height > 4) {
+ dst += stride;
+ StoreHi4(dst, vreinterpret_u8_u16(c0.val[0]));
+ dst += stride;
+ StoreHi4(dst, vreinterpret_u8_u16(c1.val[0]));
+ dst += stride;
+ StoreHi4(dst, vreinterpret_u8_u16(c0.val[1]));
+ dst += stride;
+ StoreHi4(dst, vreinterpret_u8_u16(c1.val[1]));
+ }
+ x += 4;
+ } while (x < width);
+ y += 8;
+ } while (y < height);
+ } else { // 8x8 at a time.
+ // Limited improvement for 8x8. ~20% faster for 64x64.
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ auto* dst = static_cast<uint8_t*>(dest);
+ dst += y * stride + x;
+ const int ystep_base = ystep * (x + 1);
+
+ DirectionalZone3_WxH<8>(dst, stride, 8, left + (y << upsample_shift),
+ ystep_base, ystep, upsample_shift);
+ x += 8;
+ } while (x < width);
+ y += 8;
+ } while (y < height);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+ dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
+ dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Blend two values based on weights that sum to 32.
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
+ const int a_weight, const int b_weight) {
+ const uint16x4_t a_product = vmul_n_u16(a, a_weight);
+ const uint16x4_t sum = vmla_n_u16(a_product, b, b_weight);
+
+ return vrshr_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+ const uint16_t a_weight,
+ const uint16_t b_weight) {
+ const uint16x8_t a_product = vmulq_n_u16(a, a_weight);
+ const uint16x8_t sum = vmlaq_n_u16(a_product, b, b_weight);
+
+ return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+ const uint16x8_t a_weight,
+ const uint16x8_t b_weight) {
+ const uint16x8_t a_product = vmulq_u16(a, a_weight);
+ const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight);
+
+ return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x4x2_t* dest,
+ const uint16_t* LIBGAV1_RESTRICT const source,
+ const bool upsampled) {
+ if (upsampled) {
+ *dest = vld2_u16(source);
+ } else {
+ dest->val[0] = vld1_u16(source);
+ dest->val[1] = vld1_u16(source + 1);
+ }
+}
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x8x2_t* dest,
+ const uint16_t* LIBGAV1_RESTRICT const source,
+ const bool upsampled) {
+ if (upsampled) {
+ *dest = vld2q_u16(source);
+ } else {
+ dest->val[0] = vld1q_u16(source);
+ dest->val[1] = vld1q_u16(source + 1);
+ }
+}
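+// vld2q_u16() de-interleaves, so an upsampled edge {e0, o0, e1, o1, ...} loads
+// as val[0] = {e0, e1, ...} and val[1] = {o0, o1, ...}; the non-upsampled path
+// instead pairs each pixel with its right neighbor.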
+
+// For Wx4 blocks, load the source for 2 columns. The source for the second
+// column is held in the high half of each vector.
+inline void LoadEdgeVals2x4(uint16x8x2_t* dest,
+ const uint16_t* LIBGAV1_RESTRICT const source_low,
+ const uint16_t* LIBGAV1_RESTRICT const source_high,
+ const bool upsampled) {
+ if (upsampled) {
+ const uint16x4x2_t low = vld2_u16(source_low);
+ const uint16x4x2_t high = vld2_u16(source_high);
+ dest->val[0] = vcombine_u16(low.val[0], high.val[0]);
+ dest->val[1] = vcombine_u16(low.val[1], high.val[1]);
+ } else {
+ dest->val[0] = vcombine_u16(vld1_u16(source_low), vld1_u16(source_high));
+ dest->val[1] =
+ vcombine_u16(vld1_u16(source_low + 1), vld1_u16(source_high + 1));
+ }
+}
+
+template <bool upsampled>
+inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride, const int height,
+ const uint16_t* LIBGAV1_RESTRICT const top,
+ const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_x = (4 + height - 1) << upsample_shift;
+ const int16x4_t max_base = vdup_n_s16(max_base_x);
+ const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]);
+ const int16x4_t index_offset = {0, 1, 2, 3};
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
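+  // Illustrative numbers: height = 16, xstep = 128 and no upsampling give
+  // max_base_x = 19 and xstep_units = 2, so min_corner_only_y =
+  // min(19 / 2, 16) = 9 and rows 9 and beyond use Memset.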
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset);
+ const uint16x4_t max_base_mask = vclt_s16(base_x, max_base);
+
+ uint16x4x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x4_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ // If |upsampled| is true then extract every other value for output.
+ const uint16x4_t masked_result =
+ vbsl_u16(max_base_mask, combined, final_top_val);
+
+ vst1_u16(dst, masked_result);
+ }
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_x], 4 /* width */);
+ dst += stride;
+ }
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+template <bool upsampled>
+inline void DirectionalZone1_WxH(uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride, const int width,
+ const int height,
+ const uint16_t* LIBGAV1_RESTRICT const top,
+ const int xstep) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_index = (width + height - 1) << upsample_shift;
+ const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+ const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+ const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+ const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+ int x = 0;
+ do {
+ const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ const uint16x8_t masked_result =
+ vbslq_u16(max_base_mask, combined, final_top_val);
+ vst1q_u16(dst + x, masked_result);
+
+ base_x = vaddq_s16(base_x, block_step);
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+ for (int i = y; i < height; ++i) {
+ Memset(dst, top[max_base_index], width);
+ dst += stride;
+ }
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+inline void DirectionalZone1_Large(uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride, const int width,
+ const int height,
+ const uint16_t* LIBGAV1_RESTRICT const top,
+ const int xstep, const bool upsampled) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_index = (width + height - 1) << upsample_shift;
+ const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+ const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+ const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+ const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ ((max_base_index - (base_step * width)) << index_scale_bits) / xstep,
+ height);
+ // No need to check for exceeding |max_base_x| in the first loop.
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ int x = 0;
+ do {
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ vst1q_u16(dst + x, combined);
+
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) &
+ ~7;
+ for (; x < min_corner_only_x; x += 8, top_base_x += base_step8,
+ base_x = vaddq_s16(base_x, block_step)) {
+ const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ const uint16x8_t masked_result =
+ vbslq_u16(max_base_mask, combined, final_top_val);
+ vst1q_u16(dst + x, masked_result);
+ }
+ // Corner-only section of the row.
+ Memset(dst + x, top[max_base_index], width - x);
+ }
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_index], width);
+ dst += stride;
+ }
+}
+
+void DirectionalIntraPredictorZone1_NEON(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row, const int width,
+ const int height, const int xstep, const bool upsampled_top) {
+ const auto* const top = static_cast<const uint16_t*>(top_row);
+ auto* dst = static_cast<uint16_t*>(dest);
+ stride /= sizeof(top[0]);
+
+ assert(xstep > 0);
+
+ if (xstep == 64) {
+ assert(!upsampled_top);
+ const uint16_t* top_ptr = top + 1;
+ const int width_bytes = width * sizeof(top[0]);
+ int y = height;
+ do {
+ memcpy(dst, top_ptr, width_bytes);
+ memcpy(dst + stride, top_ptr + 1, width_bytes);
+ memcpy(dst + 2 * stride, top_ptr + 2, width_bytes);
+ memcpy(dst + 3 * stride, top_ptr + 3, width_bytes);
+ dst += 4 * stride;
+ top_ptr += 4;
+ y -= 4;
+ } while (y != 0);
+ } else {
+ if (width == 4) {
+ if (upsampled_top) {
+ DirectionalZone1_4xH<true>(dst, stride, height, top, xstep);
+ } else {
+ DirectionalZone1_4xH<false>(dst, stride, height, top, xstep);
+ }
+ } else if (width >= 32) {
+ if (upsampled_top) {
+ DirectionalZone1_Large(dst, stride, width, height, top, xstep, true);
+ } else {
+ DirectionalZone1_Large(dst, stride, width, height, top, xstep, false);
+ }
+ } else if (upsampled_top) {
+ DirectionalZone1_WxH<true>(dst, stride, width, height, top, xstep);
+ } else {
+ DirectionalZone1_WxH<false>(dst, stride, width, height, top, xstep);
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Zone 3
+// This can be considered "the transpose of Zone 1." In Zone 1, the fractional
+// step applies when moving vertically in the destination block, connected to
+// the change in |y|, whereas in this mode, the step applies when moving
+// horizontally, connected to the change in |x|. This makes vectorization very
+// complicated in row order, because at steep angles a given vector may need
+// source samples spanning 16 or 32 pixels, requiring multiple expensive table
+// lookups and checked loads. Rather than work in row order, it is simpler to
+// compute |dest| in column order, and then store the transposed results.
+
+// Compute 4x4 sub-blocks.
+// Example of computed sub-blocks of a 4x8 block before and after transpose:
+// 00 10 20 30 00 01 02 03
+// 01 11 21 31 10 11 12 13
+// 02 12 22 32 20 21 22 23
+// 03 13 23 33 30 31 32 33
+// ----------- --> -----------
+// 40 50 60 70 40 41 42 43
+// 41 51 61 71 50 51 52 53
+// 42 52 62 72 60 61 62 63
+// 43 53 63 73 70 71 72 73
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep, const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x4_t result[4];
+
+ int left_y = base_left_y + ystep;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ uint16x4x2_t sampled_left_col;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose4x4(result);
+ Store4(dst, result[0]);
+ dst += stride;
+ Store4(dst, result[1]);
+ dst += stride;
+ Store4(dst, result[2]);
+ dst += stride;
+ Store4(dst, result[3]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_8x4(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep, const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+ const uint16x8_t inverter = vdupq_n_u16(32);
+
+ uint16x8x2_t sampled_left_col;
+ // Compute two columns at a time, then transpose for storage.
+ uint16x8_t result[4];
+
+ // The low half of pre-transpose vectors contains columns 0 through 3.
+ int left_y_low = base_left_y + ystep;
+ int left_offset_low = left_y_low >> index_scale_bits;
+ int shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ // The high half of pre-transpose vectors contains columns 4 through 7.
+ int left_y_high = left_y_low + (ystep << 2);
+ int left_offset_high = left_y_high >> index_scale_bits;
+ int shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ uint16x8_t weights_0 =
+ vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ uint16x8_t weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ left_y_low += ystep;
+ left_offset_low = left_y_low >> index_scale_bits;
+ shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ left_y_high += ystep;
+ left_offset_high = left_y_high >> index_scale_bits;
+ shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ left_y_high += ystep;
+ left_y_low += ystep;
+ left_offset_low = left_y_low >> index_scale_bits;
+ shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ left_offset_high = left_y_high >> index_scale_bits;
+ shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ left_y_low += ystep;
+ left_offset_low = left_y_low >> index_scale_bits;
+ shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ left_y_high += ystep;
+ left_offset_high = left_y_high >> index_scale_bits;
+ shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ Transpose4x8(result);
+ Store8(dst, result[0]);
+ dst += stride;
+ Store8(dst, result[1]);
+ dst += stride;
+ Store8(dst, result[2]);
+ dst += stride;
+ Store8(dst, result[3]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x8(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep, const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x8_t result[4];
+
+ int left_y = base_left_y + ystep;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ uint16x8x2_t sampled_left_col;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose4x8(result);
+ Store4(dst, vget_low_u16(result[0]));
+ dst += stride;
+ Store4(dst, vget_low_u16(result[1]));
+ dst += stride;
+ Store4(dst, vget_low_u16(result[2]));
+ dst += stride;
+ Store4(dst, vget_low_u16(result[3]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[0]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[1]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[2]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[3]));
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t stride, const int height,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep) {
+ assert(height == 8 || height == 16);
+ const int upsample_shift = static_cast<int>(upsampled);
+ DirectionalZone3_4x8<upsampled>(dest, stride, left, ystep);
+ if (height == 16) {
+ dest += stride << 3;
+ DirectionalZone3_4x8<upsampled>(dest, stride, left + (8 << upsample_shift),
+ ystep);
+ }
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t stride, const int width,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep) {
+ assert(width <= 16);
+ if (width == 4) {
+ DirectionalZone3_4x4<upsampled>(dest, stride, left, ystep);
+ return;
+ }
+ DirectionalZone3_8x4<upsampled>(dest, stride, left, ystep);
+ if (width == 16) {
+ const int base_left_y = ystep << 3;
+ DirectionalZone3_8x4<upsampled>(dest + 8 * sizeof(uint16_t), stride, left,
+ ystep, base_left_y);
+ }
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_8x8(uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep, const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x8_t result[8];
+
+ int left_y = base_left_y + ystep;
+ uint16x8x2_t sampled_left_col;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[4] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[5] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[6] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[7] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose8x8(result);
+ Store8(dest, result[0]);
+ dest += stride;
+ Store8(dest, result[1]);
+ dest += stride;
+ Store8(dest, result[2]);
+ dest += stride;
+ Store8(dest, result[3]);
+ dest += stride;
+ Store8(dest, result[4]);
+ dest += stride;
+ Store8(dest, result[5]);
+ dest += stride;
+ Store8(dest, result[6]);
+ dest += stride;
+ Store8(dest, result[7]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_WxH(uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t stride, const int width,
+ const int height,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ // Zone3 never runs out of left_column values.
+ assert((width + height - 1) << upsample_shift > // max_base_y
+ ((ystep * width) >> (6 - upsample_shift)) +
+ (/* base_step */ 1 << upsample_shift) *
+ (height - 1)); // left_base_y
+ int y = 0;
+ do {
+ int x = 0;
+ uint8_t* dst_x = dest + y * stride;
+ do {
+ const int base_left_y = ystep * x;
+ DirectionalZone3_8x8<upsampled>(
+ dst_x, stride, left + (y << upsample_shift), ystep, base_left_y);
+ dst_x += 8 * sizeof(uint16_t);
+ x += 8;
+ } while (x < width);
+ y += 8;
+ } while (y < height);
+}
+
+void DirectionalIntraPredictorZone3_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int ystep, const bool upsampled_left) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ if (ystep == 64) {
+ assert(!upsampled_left);
+ const int width_bytes = width * sizeof(left[0]);
+ int y = height;
+    const uint16_t* left_ptr = left + 1;
+    do {
+ memcpy(dst, left_ptr, width_bytes);
+ memcpy(dst + stride, left_ptr + 1, width_bytes);
+ memcpy(dst + 2 * stride, left_ptr + 2, width_bytes);
+ memcpy(dst + 3 * stride, left_ptr + 3, width_bytes);
+ dst += 4 * stride;
+ left_ptr += 4;
+ y -= 4;
+ } while (y != 0);
+ return;
+ }
+ if (height == 4) {
+ if (upsampled_left) {
+ DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
+ } else {
+ DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
+ }
+ } else if (width == 4) {
+ if (upsampled_left) {
+ DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
+ } else {
+ DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
+ }
+ } else {
+ if (upsampled_left) {
+ // |upsampled_left| can only be true if |width| + |height| <= 16,
+ // therefore this is 8x8.
+ DirectionalZone3_8x8<true>(dst, stride, left, ystep);
+ } else {
+ DirectionalZone3_WxH<false>(dst, stride, width, height, left, ystep);
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Zone2
+// This function deals with cases not found in zone 1 or zone 3. The extreme
+// angles are 93, which makes for sharp ascents along |left_column| with each
+// successive dest row element until reaching |top_row|, and 177, with a shallow
+// ascent up |left_column| until reaching large jumps along |top_row|. In the
+// extremely steep cases, source vectors can only be loaded one lane at a time.
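+//
+// As a rough scalar sketch (not the vector code below): a pixel at (x, y) is
+// predicted from |top_row| when x >= ((y + 1) * xstep - 1) >> 6 and from
+// |left_column| otherwise; the |zone_bounds| bookkeeping below tracks this
+// boundary per row.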
+
+// Fill |left| and |right| with values gathered from |source| using the given
+// step vectors.
+inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source,
+ const uint8x8_t left_step, const uint8x8_t right_step,
+ uint16x4_t* left, uint16x4_t* right) {
+ const uint8x16x2_t mixed = {
+ vld1q_u8(static_cast<const uint8_t*>(source)),
+ vld1q_u8(static_cast<const uint8_t*>(source) + 16)};
+ *left = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step));
+ *right = vreinterpret_u16_u8(VQTbl2U8(mixed, right_step));
+}
+
+inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source,
+ const uint8x8_t left_step_0,
+ const uint8x8_t right_step_0,
+ const uint8x8_t left_step_1,
+ const uint8x8_t right_step_1, uint16x8_t* left,
+ uint16x8_t* right) {
+ const uint8x16x2_t mixed = {
+ vld1q_u8(static_cast<const uint8_t*>(source)),
+ vld1q_u8(static_cast<const uint8_t*>(source) + 16)};
+ const uint16x4_t left_low = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_0));
+ const uint16x4_t left_high =
+ vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_1));
+ *left = vcombine_u16(left_low, left_high);
+ const uint16x4_t right_low =
+ vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_0));
+ const uint16x4_t right_high =
+ vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_1));
+ *right = vcombine_u16(right_low, right_high);
+}
+
+// Blend two values based on weight pairs that each sum to 32.
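+// Each lane computes (a * a_weight + b * b_weight + 16) >> 5; the rounding
+// comes from vrshr_n_u16.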
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
+ const uint16x4_t a_weight,
+ const uint16x4_t b_weight) {
+ const uint16x4_t a_product = vmul_u16(a, a_weight);
+ const uint16x4_t sum = vmla_u16(a_product, b, b_weight);
+
+ return vrshr_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative in localized functions.
+// This is accommodated by making sure the relative indices are within [-15, 0]
+// when the function is called, and sliding them into the inclusive range
+// [0, 15], relative to a lower base address. 15 is the Pixel offset, so 30 is
+// the byte offset for table lookups.
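+// For example, a relative pixel index of -3 slides to 15 - 3 = 12, i.e. byte
+// offsets 24 and 25 for the two bytes of that uint16_t sample.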
+
+constexpr int kPositiveIndexOffsetPixels = 15;
+constexpr int kPositiveIndexOffsetBytes = 30;
+
+inline void DirectionalZone2FromLeftCol_4xH(
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+ const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x4_t left_y,
+ const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+
+ const int index_scale_bits = 6;
+ // The values in |offset_y| are negative, except for the first element, which
+ // is zero.
+ int16x4_t offset_y;
+ int16x4_t shift_upsampled = left_y;
+  // The shift intrinsics require a constant shift amount; otherwise
+  // |upsample_shift| could be used directly.
+ if (upsampled) {
+ offset_y = vshr_n_s16(left_y, index_scale_bits - 1 /*upsample_shift*/);
+ shift_upsampled = vshl_n_s16(shift_upsampled, 1);
+ } else {
+ offset_y = vshr_n_s16(left_y, index_scale_bits);
+ }
+ offset_y = vshl_n_s16(offset_y, 1);
+
+ // Select values to the left of the starting point.
+ // The 15th element (and 16th) will be all the way at the end, to the
+ // right. With a negative ystep everything else will be "left" of them.
+ // This supports cumulative steps up to 15. We could support up to 16 by
+ // doing separate loads for |left_values| and |right_values|. vtbl
+ // supports 2 Q registers as input which would allow for cumulative
+ // offsets of 32.
+ // |sampler_0| indexes the first byte of each 16-bit value.
+ const int16x4_t sampler_0 =
+ vadd_s16(offset_y, vdup_n_s16(kPositiveIndexOffsetBytes));
+ // |sampler_1| indexes the second byte of each 16-bit value.
+ const int16x4_t sampler_1 = vadd_s16(sampler_0, vdup_n_s16(1));
+ const int16x4x2_t sampler = vzip_s16(sampler_0, sampler_1);
+ const uint8x8_t left_indices =
+ vqmovun_s16(vcombine_s16(sampler.val[0], sampler.val[1]));
+ const uint8x8_t right_indices =
+ vadd_u8(left_indices, vdup_n_u8(sizeof(uint16_t)));
+
+ const int16x4_t shift_masked = vand_s16(shift_upsampled, vdup_n_s16(0x3f));
+ const uint16x4_t shift_0 = vreinterpret_u16_s16(vshr_n_s16(shift_masked, 1));
+ const uint16x4_t shift_1 = vsub_u16(vdup_n_u16(32), shift_0);
+
+ int y = 0;
+ do {
+ uint16x4_t src_left, src_right;
+ LoadStepwise(
+ left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
+ left_indices, right_indices, &src_left, &src_right);
+ const uint16x4_t val = WeightedBlend(src_left, src_right, shift_1, shift_0);
+
+ Store4(dst, val);
+ dst += stride;
+ } while (++y < height);
+}
+
+inline void DirectionalZone2FromLeftCol_8x8(
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
+ const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+
+ const int index_scale_bits = 6;
+ // The values in |offset_y| are negative, except for the first element, which
+ // is zero.
+ int16x8_t offset_y;
+ int16x8_t shift_upsampled = left_y;
+  // The shift intrinsics require a constant shift amount; otherwise
+  // |upsample_shift| could be used directly.
+ if (upsampled) {
+ offset_y = vshrq_n_s16(left_y, index_scale_bits - 1);
+ shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
+ } else {
+ offset_y = vshrq_n_s16(left_y, index_scale_bits);
+ }
+ offset_y = vshlq_n_s16(offset_y, 1);
+
+ // Select values to the left of the starting point.
+ // The 15th element (and 16th) will be all the way at the end, to the right.
+ // With a negative ystep everything else will be "left" of them.
+ // This supports cumulative steps up to 15. We could support up to 16 by doing
+ // separate loads for |left_values| and |right_values|. vtbl supports 2 Q
+ // registers as input which would allow for cumulative offsets of 32.
+ // |sampler_0| indexes the first byte of each 16-bit value.
+ const int16x8_t sampler_0 =
+ vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffsetBytes));
+ // |sampler_1| indexes the second byte of each 16-bit value.
+ const int16x8_t sampler_1 = vaddq_s16(sampler_0, vdupq_n_s16(1));
+ const int16x8x2_t sampler = vzipq_s16(sampler_0, sampler_1);
+ const uint8x8_t left_values_0 = vqmovun_s16(sampler.val[0]);
+ const uint8x8_t left_values_1 = vqmovun_s16(sampler.val[1]);
+ const uint8x8_t right_values_0 =
+ vadd_u8(left_values_0, vdup_n_u8(sizeof(uint16_t)));
+ const uint8x8_t right_values_1 =
+ vadd_u8(left_values_1, vdup_n_u8(sizeof(uint16_t)));
+
+ const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
+ const uint16x8_t shift_0 =
+ vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1));
+ const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0);
+
+ for (int y = 0; y < 8; ++y) {
+ uint16x8_t src_left, src_right;
+ LoadStepwise(
+ left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
+ left_values_0, right_values_0, left_values_1, right_values_1, &src_left,
+ &src_right);
+ const uint16x8_t val = WeightedBlend(src_left, src_right, shift_1, shift_0);
+
+ Store8(dst, val);
+ dst += stride;
+ }
+}
+
+template <bool upsampled>
+inline void DirectionalZone1Blend_4xH(
+ uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+ const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+ const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits_x = 6 - upsample_shift;
+
+ // Representing positions along the row, which |zone_bounds| will target for
+ // the blending boundary.
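+  // For example, when |zone_bounds| >> 6 is 2 for a row, lanes {2, 3} take the
+  // |top_row|-derived value and lanes {0, 1} keep the left-derived values
+  // already written to |dest|.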
+ const int16x4_t indices = {0, 1, 2, 3};
+
+ uint16x4x2_t top_vals;
+ int y = height;
+ do {
+ const uint16_t* const src = top_row + (top_x >> scale_bits_x);
+ LoadEdgeVals(&top_vals, src, upsampled);
+
+ const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ const uint16x4_t val =
+ WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0);
+
+ const uint16x4_t dst_blend = Load4U16(dest);
+ // |zone_bounds| values can be negative.
+ const uint16x4_t blend = vcge_s16(indices, vdup_n_s16(zone_bounds >> 6));
+ const uint16x4_t output = vbsl_u16(blend, val, dst_blend);
+
+ Store4(dest, output);
+ dest += stride;
+ zone_bounds += xstep;
+ top_x -= xstep;
+ } while (--y != 0);
+}
+
+template <bool upsampled>
+inline void DirectionalZone1Blend_8x8(
+ uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+ const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits_x = 6 - upsample_shift;
+
+ // Representing positions along the row, which |zone_bounds| will target for
+ // the blending boundary.
+ const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ uint16x8x2_t top_vals;
+ for (int y = 0; y < 8; ++y) {
+ const uint16_t* const src = top_row + (top_x >> scale_bits_x);
+ LoadEdgeVals(&top_vals, src, upsampled);
+
+ const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ const uint16x8_t val =
+ WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0);
+
+ const uint16x8_t dst_blend = Load8U16(dest);
+ // |zone_bounds| values can be negative.
+ const uint16x8_t blend = vcgeq_s16(indices, vdupq_n_s16(zone_bounds >> 6));
+ const uint16x8_t output = vbslq_u16(blend, val, dst_blend);
+
+ Store8(dest, output);
+ dest += stride;
+ zone_bounds += xstep;
+ top_x -= xstep;
+ }
+}
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for these functions (4xH and 8+xH) is to know how many blocks
+// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+// then handle only blocks that take from |left_ptr|. Additionally, a fast
+// index-shuffle approach is used for pred values from |left_column| in sections
+// that permit it.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_4xH(
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const top_row,
+ const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+
+ // Helper vector for index computation.
+ const int16x4_t zero_to_three = {0, 1, 2, 3};
+
+  // Loop increments for moving by block (4xN). The vertical step is still 8;
+  // if the height is only 4, the work finishes in the first iteration.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+
+ const int min_height = (height == 4) ? 4 : 8;
+
+  // Rows up to |max_top_only_y| only need |top_row| to compute and can
+  // therefore use the Zone1 functions. This assumes |xstep| is at least 3.
+ assert(xstep >= 3);
+
+  // Offsets the original zone bound value to simplify
+  // x < (y + 1) * xstep / 64 - 1.
+ int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+ // offset. The following values need the full ystep as a relative offset.
+ const int16x4_t left_y =
+ vmla_n_s16(vdup_n_s16(-ystep_remainder), zero_to_three, -ystep);
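+  // For example (hypothetical values): ystep = 80 gives left_base_increment =
+  // 1 and ystep_remainder = 16, so left_y = {-16, -96, -176, -256}.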
+
+ // This loop treats the 4 columns in 3 stages with y-value boundaries.
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min((1 << 6) / xstep, height) & ~(min_height - 1);
+ DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst),
+ stride >> 1, max_top_only_y, top_row,
+ -xstep);
+
+ if (max_top_only_y == height) return;
+
+ int y = max_top_only_y;
+ dst += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute.
+ const int min_left_only_y = std::min((4 /*width*/ << 6) / xstep, height);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ // +8 increment is OK because if height is 4 this only runs once.
+ for (; y < min_left_only_y;
+ y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ DirectionalZone2FromLeftCol_4xH(
+ dst, stride, min_height,
+ left_column + ((y - left_base_increment) << upsample_left_shift),
+ left_y, upsampled_left);
+
+ DirectionalZone1Blend_4xH<upsampled_top>(dst, stride, min_height, top_row,
+ xstep_bounds, top_x, xstep);
+ }
+
+  // Left-only section. |height| - |y| == 4 is assumed equivalent to
+  // (y == 0) && (height == 4).
+ if (height - y == 4) {
+ DirectionalZone3_4x4<upsampled_left>(dst, stride, left_column, -ystep);
+ return;
+ }
+ if (y < height) {
+ DirectionalZone3_4xH<upsampled_left>(
+ dst, stride, height - y, left_column + (y << upsample_left_shift),
+ -ystep);
+ }
+}
+
+// Process 8x4 and 16x4 blocks. This avoids a lot of overhead and keeps the
+// address computations safely in bounds.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_Wx4(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const top_row,
+ const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
+ const int xstep, const int ystep) {
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+  // Offsets the original zone bound value to simplify
+  // x < (y + 1) * xstep / 64 - 1.
+ int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+ const int min_top_only_x = std::min((4 * xstep) >> 6, width);
+ int x = 0;
+ for (; x < min_top_only_x; x += 4, xstep_bounds_base -= (4 << 6)) {
+ uint8_t* dst_x = dst + x * sizeof(uint16_t);
+
+ // Round down to the nearest multiple of 4.
+ const int max_top_only_y = (((x + 1) << 6) / xstep) & ~3;
+ if (max_top_only_y != 0) {
+ DirectionalZone1_4xH<upsampled_top>(
+ reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 4,
+ top_row + (x << upsample_top_shift), -xstep);
+ continue;
+ }
+
+ DirectionalZone3_4x4<upsampled_left>(dst_x, stride, left_column, -ystep,
+ -ystep * x);
+
+ const int min_left_only_y = ((x + 4) << 6) / xstep;
+ if (min_left_only_y != 0) {
+ const int top_x = -xstep;
+ DirectionalZone1Blend_4xH<upsampled_top>(
+ dst_x, stride, 4, top_row + (x << upsample_top_shift),
+ xstep_bounds_base, top_x, xstep);
+ }
+ }
+ // Reached |min_top_only_x|.
+ for (; x < width; x += 4) {
+ DirectionalZone1_4xH<upsampled_top>(
+ reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, 4,
+ top_row + (x << upsample_top_shift), -xstep);
+ }
+}
+
+template <bool shuffle_left_column, bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_8xH(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const top_row,
+ const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep, const int x, const int left_offset,
+ const int xstep_bounds_base, const int16x8_t left_y) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Loop increments for moving by block (8x8). This function also handles
+  // blocks of height 4; those are computed in one pass, so these variables
+  // go unused in that case.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ uint8_t* dst_x = dst + x * sizeof(uint16_t);
+ // Round down to the nearest multiple of 8.
+ const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+ DirectionalZone1_WxH<upsampled_top>(
+ reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
+ top_row + (x << upsample_top_shift), -xstep);
+
+ if (max_top_only_y == height) return;
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute. Round up to the nearest 8.
+ const int min_left_only_y =
+ Align(std::min(((x + 8) << 6) / xstep, height), 8);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ if (shuffle_left_column) {
+ DirectionalZone2FromLeftCol_8x8(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y,
+ upsampled_left);
+ } else {
+ DirectionalZone3_8x8<upsampled_left>(
+ dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+ -ystep * x);
+ }
+
+ DirectionalZone1Blend_8x8<upsampled_top>(
+ dst_x, stride, top_row + (x << upsample_top_shift), xstep_bounds, top_x,
+ xstep);
+ }
+
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_8x8<upsampled_left>(
+ dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+ -ystep * x);
+ }
+}
+
+// Process a multiple of 8 |width|.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_NEON(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const top_row,
+ const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int xstep, const int ystep) {
+ if (height == 4) {
+ DirectionalZone2_Wx4<upsampled_top, upsampled_left>(
+ dst, stride, top_row, left_column, width, xstep, ystep);
+ return;
+ }
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+ // Helper vector.
+ const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ const int ystep8 = ystep << 3;
+
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute and can therefore call the Zone1 functions. This assumes |xstep| is
+ // at least 3.
+ assert(xstep >= 3);
+ const int min_top_only_x = Align(std::min((height * xstep) >> 6, width), 8);
+ // Analysis finds that, for most angles (ystep < 132), all segments that use
+ // both top_row and left_column can compute from left_column using byte
+ // shuffles from a single vector. For steeper angles, the shuffle is also
+ // fully reliable when x >= 32.
+ const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+ const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
+
+  // Offsets the original zone bound value to simplify
+  // x < (y + 1) * xstep / 64 - 1.
+ int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+
+ const int left_base_increment8 = ystep8 >> 6;
+ const int ystep_remainder8 = ystep8 & 0x3F;
+ const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+ // offset. Following values need the full ystep as a relative offset.
+ int16x8_t left_y =
+ vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep);
+
+ int x = 0;
+ for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
+ xstep_bounds_base -= (8 << 6),
+ left_y = vsubq_s16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<false, upsampled_top, upsampled_left>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_bounds_base, left_y);
+ }
+ for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
+ xstep_bounds_base -= (8 << 6),
+ left_y = vsubq_s16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<true, upsampled_top, upsampled_left>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_bounds_base, left_y);
+ }
+ // Reached |min_top_only_x|.
+ if (x < width) {
+ DirectionalZone1_WxH<upsampled_top>(
+ reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, width - x, height,
+ top_row + (x << upsample_top_shift), -xstep);
+ }
+}
+
+// At this angle, neither edge is upsampled.
+// |min_width| is either 4 or 8.
+template <int min_width>
+void DirectionalAngle135(uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const top,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int width, const int height) {
+  // The first row (y == 0) is simply a copy starting at top[-1].
+ memcpy(dst, top - 1, width * sizeof(top[0]));
+ dst += stride;
+
+  // If |height| > |width|, then rows at and beyond |width| no longer use
+  // |top_row| at all.
+ const int min_left_only_y = std::min(width, height);
+
+ int y = 1;
+ do {
+ // Example: If y is 4 (min_width), the dest row starts with left[3],
+ // left[2], left[1], left[0], because the angle points up. Therefore, load
+ // starts at left[0] and is then reversed. If y is 2, the load starts at
+ // left[-2], and is reversed to store left[1], left[0], with negative values
+ // overwritten from |top_row|.
+ const uint16_t* const load_left = left + y - min_width;
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+
+ // Some values will be overwritten when |y| is not a multiple of
+ // |min_width|.
+ if (min_width == 4) {
+ const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left));
+ vst1_u16(dst16, left_toward_corner);
+ } else {
+ int x = 0;
+ do {
+ const uint16x8_t left_toward_corner =
+ vrev64q_u16(vld1q_u16(load_left - x));
+ vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
+ vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
+ x += 8;
+ } while (x < y);
+ }
+ // Entering |top|.
+ memcpy(dst16 + y, top - 1, (width - y) * sizeof(top[0]));
+ dst += stride;
+ } while (++y < min_left_only_y);
+
+ // Left only.
+ for (; y < height; ++y, dst += stride) {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16_t* const load_left = left + y - min_width;
+
+ int x = 0;
+ if (min_width == 4) {
+ const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left - x));
+ vst1_u16(dst16 + x, left_toward_corner);
+ } else {
+ do {
+ const uint16x8_t left_toward_corner =
+ vrev64q_u16(vld1q_u16(load_left - x));
+ vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
+ vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
+ x += 8;
+ } while (x < width);
+ }
+ }
+}
+
+void DirectionalIntraPredictorZone2_NEON(
+ void* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int xstep, const int ystep,
+ const bool upsampled_top, const bool upsampled_left) {
+ // Increasing the negative buffer for this function allows more rows to be
+ // processed at a time without branching in an inner loop to check the base.
+ uint16_t top_buffer[288];
+ uint16_t left_buffer[288];
+#if LIBGAV1_MSAN
+ memset(top_buffer, 0, sizeof(top_buffer));
+ memset(left_buffer, 0, sizeof(left_buffer));
+#endif // LIBGAV1_MSAN
+ memcpy(top_buffer + 128, static_cast<const uint16_t*>(top_row) - 16, 160);
+ memcpy(left_buffer + 128, static_cast<const uint16_t*>(left_column) - 16,
+ 160);
+ const uint16_t* top_ptr = top_buffer + 144;
+ const uint16_t* left_ptr = left_buffer + 144;
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ if (width == 4) {
+ if (xstep == 64) {
+ assert(ystep == 64);
+ DirectionalAngle135<4>(dst, stride, top_ptr, left_ptr, width, height);
+ return;
+ }
+ if (upsampled_top) {
+ if (upsampled_left) {
+ DirectionalZone2_4xH<true, true>(dst, stride, top_ptr, left_ptr, height,
+ xstep, ystep);
+ } else {
+ DirectionalZone2_4xH<true, false>(dst, stride, top_ptr, left_ptr,
+ height, xstep, ystep);
+ }
+ } else if (upsampled_left) {
+ DirectionalZone2_4xH<false, true>(dst, stride, top_ptr, left_ptr, height,
+ xstep, ystep);
+ } else {
+ DirectionalZone2_4xH<false, false>(dst, stride, top_ptr, left_ptr, height,
+ xstep, ystep);
+ }
+ return;
+ }
+
+ if (xstep == 64) {
+ assert(ystep == 64);
+ DirectionalAngle135<8>(dst, stride, top_ptr, left_ptr, width, height);
+ return;
+ }
+ if (upsampled_top) {
+ if (upsampled_left) {
+ DirectionalZone2_NEON<true, true>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
+ } else {
+ DirectionalZone2_NEON<true, false>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
+ }
+ } else if (upsampled_left) {
+ DirectionalZone2_NEON<false, true>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
+ } else {
+ DirectionalZone2_NEON<false, false>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+ dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
+ dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredDirectionalInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*, see the defines below for
+// specifics. These functions are not thread-safe.
+void IntraPredDirectionalInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+#endif
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
+// Transpose kFilterIntraTaps and convert the first row to unsigned values.
+//
+// With the previous orientation we were able to multiply all the input values
+// by a single tap. This required that all the input values be in one vector,
+// which took expensive setup operations (shifts, vext, vtbl). All the
+// elements of the result needed to be summed (easy on A64 - vaddvq_s16), but
+// then the shifting, rounding, and clamping were done in GP registers.
+//
+// Switching to unsigned values allows multiplying the 8-bit inputs directly.
+// Previously, when one value was negative we needed to vmovl_u8 first so that
+// the results maintained the proper sign.
+//
+// We take the negation of the first row into account when summing the values
+// by subtracting its product from the total.
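+//
+// In scalar terms, each of the eight output pixels j in a 4x2 batch is roughly
+//   Clip((sum_{i=1..6} taps[i][j] * input[i] - taps[0][j] * top_left + 8) >> 4)
+// where input[1..4] are the four values above and input[5..6] the two to the
+// left; the rounding comes from vrshrq_n_s16.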
+alignas(8) constexpr uint8_t kTransposedTaps[kNumFilterIntraPredictors][7][8] =
+ {{{6, 5, 3, 3, 4, 3, 3, 3}, // Original values are negative.
+ {10, 2, 1, 1, 6, 2, 2, 1},
+ {0, 10, 1, 1, 0, 6, 2, 2},
+ {0, 0, 10, 2, 0, 0, 6, 2},
+ {0, 0, 0, 10, 0, 0, 0, 6},
+ {12, 9, 7, 5, 2, 2, 2, 3},
+ {0, 0, 0, 0, 12, 9, 7, 5}},
+ {{10, 6, 4, 2, 10, 6, 4, 2}, // Original values are negative.
+ {16, 0, 0, 0, 16, 0, 0, 0},
+ {0, 16, 0, 0, 0, 16, 0, 0},
+ {0, 0, 16, 0, 0, 0, 16, 0},
+ {0, 0, 0, 16, 0, 0, 0, 16},
+ {10, 6, 4, 2, 0, 0, 0, 0},
+ {0, 0, 0, 0, 10, 6, 4, 2}},
+ {{8, 8, 8, 8, 4, 4, 4, 4}, // Original values are negative.
+ {8, 0, 0, 0, 4, 0, 0, 0},
+ {0, 8, 0, 0, 0, 4, 0, 0},
+ {0, 0, 8, 0, 0, 0, 4, 0},
+ {0, 0, 0, 8, 0, 0, 0, 4},
+ {16, 16, 16, 16, 0, 0, 0, 0},
+ {0, 0, 0, 0, 16, 16, 16, 16}},
+ {{2, 1, 1, 0, 1, 1, 1, 1}, // Original values are negative.
+ {8, 3, 2, 1, 4, 3, 2, 2},
+ {0, 8, 3, 2, 0, 4, 3, 2},
+ {0, 0, 8, 3, 0, 0, 4, 3},
+ {0, 0, 0, 8, 0, 0, 0, 4},
+ {10, 6, 4, 2, 3, 4, 4, 3},
+ {0, 0, 0, 0, 10, 6, 4, 3}},
+ {{12, 10, 9, 8, 10, 9, 8, 7}, // Original values are negative.
+ {14, 0, 0, 0, 12, 1, 0, 0},
+ {0, 14, 0, 0, 0, 12, 0, 0},
+ {0, 0, 14, 0, 0, 0, 12, 1},
+ {0, 0, 0, 14, 0, 0, 0, 12},
+ {14, 12, 11, 10, 0, 0, 1, 1},
+ {0, 0, 0, 0, 14, 12, 11, 9}}};
+
+void FilterIntraPredictor_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column,
+ FilterIntraPredictor pred, int width,
+ int height) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+
+ assert(width <= 32 && height <= 32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ uint8x8_t transposed_taps[7];
+ for (int i = 0; i < 7; ++i) {
+ transposed_taps[i] = vld1_u8(kTransposedTaps[pred][i]);
+ }
+
+ uint8_t relative_top_left = top[-1];
+ const uint8_t* relative_top = top;
+ uint8_t relative_left[2] = {left[0], left[1]};
+
+ int y = 0;
+ do {
+ uint8_t* row_dst = dst;
+ int x = 0;
+ do {
+ uint16x8_t sum = vdupq_n_u16(0);
+ const uint16x8_t subtrahend =
+ vmull_u8(transposed_taps[0], vdup_n_u8(relative_top_left));
+ for (int i = 1; i < 5; ++i) {
+ sum = vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_top[i - 1]));
+ }
+ for (int i = 5; i < 7; ++i) {
+ sum =
+ vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_left[i - 5]));
+ }
+
+ const int16x8_t sum_signed =
+ vreinterpretq_s16_u16(vsubq_u16(sum, subtrahend));
+ const int16x8_t sum_shifted = vrshrq_n_s16(sum_signed, 4);
+
+ uint8x8_t sum_saturated = vqmovun_s16(sum_shifted);
+
+ StoreLo4(row_dst, sum_saturated);
+ StoreHi4(row_dst + stride, sum_saturated);
+
+      // Progress across.
+ relative_top_left = relative_top[3];
+ relative_top += 4;
+ relative_left[0] = row_dst[3];
+ relative_left[1] = row_dst[3 + stride];
+ row_dst += 4;
+ x += 4;
+ } while (x < width);
+
+ // Progress down.
+ relative_top_left = left[y + 1];
+ relative_top = dst + stride;
+ relative_left[0] = left[y + 2];
+ relative_left[1] = left[y + 3];
+
+ dst += 2 * stride;
+ y += 2;
+ } while (y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->filter_intra_predictor = FilterIntraPredictor_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+alignas(kMaxAlignment) constexpr int16_t
+ kTransposedTaps[kNumFilterIntraPredictors][7][8] = {
+ {{-6, -5, -3, -3, -4, -3, -3, -3},
+ {10, 2, 1, 1, 6, 2, 2, 1},
+ {0, 10, 1, 1, 0, 6, 2, 2},
+ {0, 0, 10, 2, 0, 0, 6, 2},
+ {0, 0, 0, 10, 0, 0, 0, 6},
+ {12, 9, 7, 5, 2, 2, 2, 3},
+ {0, 0, 0, 0, 12, 9, 7, 5}},
+ {{-10, -6, -4, -2, -10, -6, -4, -2},
+ {16, 0, 0, 0, 16, 0, 0, 0},
+ {0, 16, 0, 0, 0, 16, 0, 0},
+ {0, 0, 16, 0, 0, 0, 16, 0},
+ {0, 0, 0, 16, 0, 0, 0, 16},
+ {10, 6, 4, 2, 0, 0, 0, 0},
+ {0, 0, 0, 0, 10, 6, 4, 2}},
+ {{-8, -8, -8, -8, -4, -4, -4, -4},
+ {8, 0, 0, 0, 4, 0, 0, 0},
+ {0, 8, 0, 0, 0, 4, 0, 0},
+ {0, 0, 8, 0, 0, 0, 4, 0},
+ {0, 0, 0, 8, 0, 0, 0, 4},
+ {16, 16, 16, 16, 0, 0, 0, 0},
+ {0, 0, 0, 0, 16, 16, 16, 16}},
+ {{-2, -1, -1, -0, -1, -1, -1, -1},
+ {8, 3, 2, 1, 4, 3, 2, 2},
+ {0, 8, 3, 2, 0, 4, 3, 2},
+ {0, 0, 8, 3, 0, 0, 4, 3},
+ {0, 0, 0, 8, 0, 0, 0, 4},
+ {10, 6, 4, 2, 3, 4, 4, 3},
+ {0, 0, 0, 0, 10, 6, 4, 3}},
+ {{-12, -10, -9, -8, -10, -9, -8, -7},
+ {14, 0, 0, 0, 12, 1, 0, 0},
+ {0, 14, 0, 0, 0, 12, 0, 0},
+ {0, 0, 14, 0, 0, 0, 12, 1},
+ {0, 0, 0, 14, 0, 0, 0, 12},
+ {14, 12, 11, 10, 0, 0, 1, 1},
+ {0, 0, 0, 0, 14, 12, 11, 9}}};
+
+void FilterIntraPredictor_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column,
+ FilterIntraPredictor pred, int width,
+ int height) {
+ const auto* const top = static_cast<const uint16_t*>(top_row);
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+
+ assert(width <= 32 && height <= 32);
+
+ auto* dst = static_cast<uint16_t*>(dest);
+
+ stride >>= 1;
+
+ int16x8_t transposed_taps[7];
+ for (int i = 0; i < 7; ++i) {
+ transposed_taps[i] = vld1q_s16(kTransposedTaps[pred][i]);
+ }
+
+ uint16_t relative_top_left = top[-1];
+ const uint16_t* relative_top = top;
+ uint16_t relative_left[2] = {left[0], left[1]};
+
+ int y = 0;
+ do {
+ uint16_t* row_dst = dst;
+ int x = 0;
+ do {
+ int16x8_t sum =
+ vmulq_s16(transposed_taps[0],
+ vreinterpretq_s16_u16(vdupq_n_u16(relative_top_left)));
+ for (int i = 1; i < 5; ++i) {
+ sum =
+ vmlaq_s16(sum, transposed_taps[i],
+ vreinterpretq_s16_u16(vdupq_n_u16(relative_top[i - 1])));
+ }
+ for (int i = 5; i < 7; ++i) {
+ sum =
+ vmlaq_s16(sum, transposed_taps[i],
+ vreinterpretq_s16_u16(vdupq_n_u16(relative_left[i - 5])));
+ }
+
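+      // Round-shift the accumulated sum by 4 and clamp it to the 10-bit range
+      // [0, 1023].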
+ const int16x8_t sum_shifted = vrshrq_n_s16(sum, 4);
+ const uint16x8_t sum_saturated = vminq_u16(
+ vreinterpretq_u16_s16(vmaxq_s16(sum_shifted, vdupq_n_s16(0))),
+ vdupq_n_u16((1 << kBitdepth10) - 1));
+
+ vst1_u16(row_dst, vget_low_u16(sum_saturated));
+ vst1_u16(row_dst + stride, vget_high_u16(sum_saturated));
+
+      // Progress across.
+ relative_top_left = relative_top[3];
+ relative_top += 4;
+ relative_left[0] = row_dst[3];
+ relative_left[1] = row_dst[3 + stride];
+ row_dst += 4;
+ x += 4;
+ } while (x < width);
+
+ // Progress down.
+ relative_top_left = left[y + 1];
+ relative_top = dst + stride;
+ relative_left[0] = left[y + 2];
+ relative_left[1] = left[y + 3];
+
+ dst += 2 * stride;
+ y += 2;
+ } while (y < height);
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->filter_intra_predictor = FilterIntraPredictor_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredFilterInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredFilterInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor, see the defines below for specifics.
+// These functions are not thread-safe.
+void IntraPredFilterInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_NEON
+
+using DcSumFunc = uint32x2_t (*)(const void* ref_0, const int ref_0_size_log2,
+ const bool use_ref_1, const void* ref_1,
+ const int ref_1_size_log2);
+using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const uint32x2_t dc);
+
+// DC intra-predictors for square and rectangular blocks.
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+ DcStoreFunc storefn>
+struct DcPredFuncs_NEON {
+ DcPredFuncs_NEON() = delete;
+
+ static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+ DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::
+ DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* /*left_column*/) {
+ const uint32x2_t sum = sumfn(top_row, block_width_log2, false, nullptr, 0);
+ const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2);
+ storefn(dest, stride, dc);
+}
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+ DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::
+ DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const uint32x2_t sum =
+ sumfn(left_column, block_height_log2, false, nullptr, 0);
+ const uint32x2_t dc = vrshr_n_u32(sum, block_height_log2);
+ storefn(dest, stride, dc);
+}
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+ DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::Dc(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const uint32x2_t sum =
+ sumfn(top_row, block_width_log2, true, left_column, block_height_log2);
+ if (block_width_log2 == block_height_log2) {
+ const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2 + 1);
+ storefn(dest, stride, dc);
+ } else {
+ // TODO(johannkoenig): Compare this to mul/shift in vectors.
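+    // For example, an 8x4 block has divisor 12; the scalar code below computes
+    // dc = (sum + 6) / 12.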
+ const int divisor = (1 << block_width_log2) + (1 << block_height_log2);
+ uint32_t dc = vget_lane_u32(sum, 0);
+ dc += divisor >> 1;
+ dc /= divisor;
+ storefn(dest, stride, vdup_n_u32(dc));
+ }
+}
+
+// Sum all the elements in the vector into the low 32 bits.
+inline uint32x2_t Sum(const uint16x4_t val) {
+ const uint32x2_t sum = vpaddl_u16(val);
+ return vpadd_u32(sum, sum);
+}
+
+// Sum all the elements in the vector into the low 32 bits.
+inline uint32x2_t Sum(const uint16x8_t val) {
+ const uint32x4_t sum_0 = vpaddlq_u16(val);
+ const uint64x2_t sum_1 = vpaddlq_u32(sum_0);
+ return vadd_u32(vget_low_u32(vreinterpretq_u32_u64(sum_1)),
+ vget_high_u32(vreinterpretq_u32_u64(sum_1)));
+}
+
+} // namespace
+
+//------------------------------------------------------------------------------
+namespace low_bitdepth {
+namespace {
+
+// Pairwise add and widen the elements of |val_[01]| to uint16_t, but do not
+// sum the entire vector.
+inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1) {
+ const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+ const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+ return vaddq_u16(sum_0, sum_1);
+}
+
+// Pairwise add and widen the elements of |val_[0123]| to uint16_t, but do not
+// sum the entire vector.
+inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1,
+ const uint8x16_t val_2, const uint8x16_t val_3) {
+ const uint16x8_t sum_0 = Add(val_0, val_1);
+ const uint16x8_t sum_1 = Add(val_2, val_3);
+ return vaddq_u16(sum_0, sum_1);
+}
+
+// Load and combine 32 uint8_t values.
+inline uint16x8_t LoadAndAdd32(const uint8_t* buf) {
+ const uint8x16_t val_0 = vld1q_u8(buf);
+ const uint8x16_t val_1 = vld1q_u8(buf + 16);
+ return Add(val_0, val_1);
+}
+
+// Load and combine 64 uint8_t values.
+inline uint16x8_t LoadAndAdd64(const uint8_t* buf) {
+ const uint8x16_t val_0 = vld1q_u8(buf);
+ const uint8x16_t val_1 = vld1q_u8(buf + 16);
+ const uint8x16_t val_2 = vld1q_u8(buf + 32);
+ const uint8x16_t val_3 = vld1q_u8(buf + 48);
+ return Add(val_0, val_1, val_2, val_3);
+}
+
+// |ref_[01]| each point to 1 << |ref_[01]_size_log2| packed uint8_t values.
+// If |use_ref_1| is false then only sum |ref_0|.
+// For |ref_[01]_size_log2| == 4 this relies on |ref_[01]| being aligned to
+// uint32_t.
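+// For example, a 16x8 Dc() call arrives with |ref_0_size_log2| == 4 and
+// |ref_1_size_log2| == 3, landing in the 16x8 case below.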
+inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0,
+ const int ref_0_size_log2, const bool use_ref_1,
+ const void* LIBGAV1_RESTRICT ref_1,
+ const int ref_1_size_log2) {
+ const auto* const ref_0_u8 = static_cast<const uint8_t*>(ref_0);
+ const auto* const ref_1_u8 = static_cast<const uint8_t*>(ref_1);
+ if (ref_0_size_log2 == 2) {
+ uint8x8_t val = Load4(ref_0_u8);
+ if (use_ref_1) {
+ switch (ref_1_size_log2) {
+ case 2: { // 4x4
+ val = Load4<1>(ref_1_u8, val);
+ return Sum(vpaddl_u8(val));
+ }
+ case 3: { // 4x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ const uint16x4_t sum_0 = vpaddl_u8(val);
+ const uint16x4_t sum_1 = vpaddl_u8(val_1);
+ return Sum(vadd_u16(sum_0, sum_1));
+ }
+ case 4: { // 4x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_1), val));
+ }
+ }
+ }
+ // 4x1
+ const uint16x4_t sum = vpaddl_u8(val);
+ return vpaddl_u16(sum);
+ }
+ if (ref_0_size_log2 == 3) {
+ const uint8x8_t val_0 = vld1_u8(ref_0_u8);
+ if (use_ref_1) {
+ switch (ref_1_size_log2) {
+ case 2: { // 8x4
+ const uint8x8_t val_1 = Load4(ref_1_u8);
+ const uint16x4_t sum_0 = vpaddl_u8(val_0);
+ const uint16x4_t sum_1 = vpaddl_u8(val_1);
+ return Sum(vadd_u16(sum_0, sum_1));
+ }
+ case 3: { // 8x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ const uint16x4_t sum_0 = vpaddl_u8(val_0);
+ const uint16x4_t sum_1 = vpaddl_u8(val_1);
+ return Sum(vadd_u16(sum_0, sum_1));
+ }
+ case 4: { // 8x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_1), val_0));
+ }
+ case 5: { // 8x32
+ return Sum(vaddw_u8(LoadAndAdd32(ref_1_u8), val_0));
+ }
+ }
+ }
+ // 8x1
+ return Sum(vpaddl_u8(val_0));
+ }
+ if (ref_0_size_log2 == 4) {
+ const uint8x16_t val_0 = vld1q_u8(ref_0_u8);
+ if (use_ref_1) {
+ switch (ref_1_size_log2) {
+ case 2: { // 16x4
+ const uint8x8_t val_1 = Load4(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
+ }
+ case 3: { // 16x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
+ }
+ case 4: { // 16x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ return Sum(Add(val_0, val_1));
+ }
+ case 5: { // 16x32
+ const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 6: { // 16x64
+ const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ }
+ // 16x1
+ return Sum(vpaddlq_u8(val_0));
+ }
+ if (ref_0_size_log2 == 5) {
+ const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u8);
+ if (use_ref_1) {
+ switch (ref_1_size_log2) {
+ case 3: { // 32x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ return Sum(vaddw_u8(sum_0, val_1));
+ }
+ case 4: { // 32x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 5: { // 32x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 6: { // 32x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ }
+ // 32x1
+ return Sum(sum_0);
+ }
+
+ assert(ref_0_size_log2 == 6);
+ const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u8);
+ if (use_ref_1) {
+ switch (ref_1_size_log2) {
+ case 4: { // 64x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 5: { // 64x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 6: { // 64x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ }
+ // 64x1
+ return Sum(sum_0);
+}
+
+template <int width, int height>
+inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
+ const uint32x2_t dc) {
+ const uint8x16_t dc_dup = vdupq_lane_u8(vreinterpret_u8_u32(dc), 0);
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (width == 4) {
+ int i = height - 1;
+ do {
+ StoreLo4(dst, vget_low_u8(dc_dup));
+ dst += stride;
+ } while (--i != 0);
+ StoreLo4(dst, vget_low_u8(dc_dup));
+ } else if (width == 8) {
+ int i = height - 1;
+ do {
+ vst1_u8(dst, vget_low_u8(dc_dup));
+ dst += stride;
+ } while (--i != 0);
+ vst1_u8(dst, vget_low_u8(dc_dup));
+ } else if (width == 16) {
+ int i = height - 1;
+ do {
+ vst1q_u8(dst, dc_dup);
+ dst += stride;
+ } while (--i != 0);
+ vst1q_u8(dst, dc_dup);
+ } else if (width == 32) {
+ int i = height - 1;
+ do {
+ vst1q_u8(dst, dc_dup);
+ vst1q_u8(dst + 16, dc_dup);
+ dst += stride;
+ } while (--i != 0);
+ vst1q_u8(dst, dc_dup);
+ vst1q_u8(dst + 16, dc_dup);
+ } else {
+ assert(width == 64);
+ int i = height - 1;
+ do {
+ vst1q_u8(dst, dc_dup);
+ vst1q_u8(dst + 16, dc_dup);
+ vst1q_u8(dst + 32, dc_dup);
+ vst1q_u8(dst + 48, dc_dup);
+ dst += stride;
+ } while (--i != 0);
+ vst1q_u8(dst, dc_dup);
+ vst1q_u8(dst + 16, dc_dup);
+ vst1q_u8(dst + 32, dc_dup);
+ vst1q_u8(dst + 48, dc_dup);
+ }
+}
+
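+// The Paeth predictor computes base = left + top - top_left and selects
+// whichever of left, top, and top_left is closest to it. The distances
+// simplify to:
+//   |base - left|     == |top - top_left|            (left_dist)
+//   |base - top|      == |left - top_left|           (top_dist)
+//   |base - top_left| == |top + left - 2 * top_left| (top_left_dist)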
+template <int width, int height>
+inline void Paeth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ auto* dest_u8 = static_cast<uint8_t*>(dest);
+ const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
+ const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
+
+ const uint8x8_t top_left = vdup_n_u8(top_row_u8[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
+ uint8x8_t top;
+ if (width == 4) {
+ top = Load4(top_row_u8);
+ } else { // width == 8
+ top = vld1_u8(top_row_u8);
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left = vdup_n_u8(left_col_u8[y]);
+
+ const uint8x8_t left_dist = vabd_u8(top, top_left);
+ const uint8x8_t top_dist = vabd_u8(left, top_left);
+ const uint16x8_t top_left_dist =
+ vabdq_u16(vaddl_u8(top, left), top_left_x2);
+
+ const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
+ const uint8x8_t left_le_top_left =
+ vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
+ const uint8x8_t top_le_top_left =
+ vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
+
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint8x8_t result = vbsl_u8(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ result = vbsl_u8(left_or_top_mask, result, top_left);
+
+ if (width == 4) {
+ StoreLo4(dest_u8, result);
+ } else { // width == 8
+ vst1_u8(dest_u8, result);
+ }
+ dest_u8 += stride;
+ }
+}
+
+// Calculate X distance <= TopLeft distance and pack the resulting mask into
+// uint8x16_t. |top_left_dist| saturates to 255 when narrowed; since |x_dist|
+// is at most 255, the comparison result is unaffected.
+inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist,
+ const uint16x8_t top_left_dist_low,
+ const uint16x8_t top_left_dist_high) {
+ const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
+ vqmovn_u16(top_left_dist_high));
+ return vcleq_u8(x_dist, top_left_dist);
+}
+
+// Select the closest values and collect them.
+inline uint8x16_t SelectPaeth(const uint8x16_t top, const uint8x16_t left,
+ const uint8x16_t top_left,
+ const uint8x16_t left_le_top,
+ const uint8x16_t left_le_top_left,
+ const uint8x16_t top_le_top_left) {
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint8x16_t result = vbslq_u8(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ return vbslq_u8(left_or_top_mask, result, top_left);
+}
+
+// Generate numbered and high/low versions of top_left_dist.
+#define TOP_LEFT_DIST(num) \
+ const uint16x8_t top_left_##num##_dist_low = vabdq_u16( \
+ vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
+ const uint16x8_t top_left_##num##_dist_high = vabdq_u16( \
+ vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)
+
+// Generate numbered versions of XLeTopLeft with x = left.
+#define LEFT_LE_TOP_LEFT(num) \
+ const uint8x16_t left_le_top_left_##num = \
+ XLeTopLeft(left_##num##_dist, top_left_##num##_dist_low, \
+ top_left_##num##_dist_high)
+
+// Generate numbered versions of XLeTopLeft with x = top.
+#define TOP_LE_TOP_LEFT(num) \
+ const uint8x16_t top_le_top_left_##num = XLeTopLeft( \
+ top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
+
+template <int width, int height>
+inline void Paeth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ auto* dest_u8 = static_cast<uint8_t*>(dest);
+ const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
+ const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
+
+ const uint8x16_t top_left = vdupq_n_u8(top_row_u8[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
+ uint8x16_t top[4];
+ top[0] = vld1q_u8(top_row_u8);
+ if (width > 16) {
+ top[1] = vld1q_u8(top_row_u8 + 16);
+ if (width == 64) {
+ top[2] = vld1q_u8(top_row_u8 + 32);
+ top[3] = vld1q_u8(top_row_u8 + 48);
+ }
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x16_t left = vdupq_n_u8(left_col_u8[y]);
+
+ const uint8x16_t top_dist = vabdq_u8(left, top_left);
+
+ const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
+ TOP_LEFT_DIST(0);
+ const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
+ LEFT_LE_TOP_LEFT(0);
+ TOP_LE_TOP_LEFT(0);
+
+ const uint8x16_t result_0 =
+ SelectPaeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
+ top_le_top_left_0);
+ vst1q_u8(dest_u8, result_0);
+
+ if (width > 16) {
+ const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
+ TOP_LEFT_DIST(1);
+ const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
+ LEFT_LE_TOP_LEFT(1);
+ TOP_LE_TOP_LEFT(1);
+
+ const uint8x16_t result_1 =
+ SelectPaeth(top[1], left, top_left, left_1_le_top, left_le_top_left_1,
+ top_le_top_left_1);
+ vst1q_u8(dest_u8 + 16, result_1);
+
+ if (width == 64) {
+ const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
+ TOP_LEFT_DIST(2);
+ const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
+ LEFT_LE_TOP_LEFT(2);
+ TOP_LE_TOP_LEFT(2);
+
+ const uint8x16_t result_2 =
+ SelectPaeth(top[2], left, top_left, left_2_le_top,
+ left_le_top_left_2, top_le_top_left_2);
+ vst1q_u8(dest_u8 + 32, result_2);
+
+ const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
+ TOP_LEFT_DIST(3);
+ const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
+ LEFT_LE_TOP_LEFT(3);
+ TOP_LE_TOP_LEFT(3);
+
+ const uint8x16_t result_3 =
+ SelectPaeth(top[3], left, top_left, left_3_le_top,
+ left_le_top_left_3, top_le_top_left_3);
+ vst1q_u8(dest_u8 + 48, result_3);
+ }
+ }
+
+ dest_u8 += stride;
+ }
+}
+
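+// In the aliases below, the first two template arguments of DcPredFuncs_NEON
+// are log2(width) and log2(height) of the block.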
+struct DcDefs {
+ DcDefs() = delete;
+
+ using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
+ using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
+ using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
+ using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
+ using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
+ using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
+ using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
+ using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
+ using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
+ using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
+ using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
+ using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
+ using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
+ using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
+ using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
+ using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
+ using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
+ using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
+ using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
+};
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ // 4x4
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DcDefs::_4x4::DcTop;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DcDefs::_4x4::DcLeft;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DcDefs::_4x4::Dc;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<4, 4>;
+
+ // 4x8
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DcDefs::_4x8::DcTop;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DcDefs::_4x8::DcLeft;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DcDefs::_4x8::Dc;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<4, 8>;
+
+ // 4x16
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DcDefs::_4x16::DcTop;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DcDefs::_4x16::DcLeft;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DcDefs::_4x16::Dc;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<4, 16>;
+
+ // 8x4
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DcDefs::_8x4::DcTop;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DcDefs::_8x4::DcLeft;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DcDefs::_8x4::Dc;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<8, 4>;
+
+ // 8x8
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DcDefs::_8x8::DcTop;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DcDefs::_8x8::DcLeft;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DcDefs::_8x8::Dc;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<8, 8>;
+
+ // 8x16
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DcDefs::_8x16::DcTop;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DcDefs::_8x16::DcLeft;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DcDefs::_8x16::Dc;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<8, 16>;
+
+ // 8x32
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DcDefs::_8x32::DcTop;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DcDefs::_8x32::DcLeft;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DcDefs::_8x32::Dc;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ Paeth4Or8xN_NEON<8, 32>;
+
+ // 16x4
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DcDefs::_16x4::DcTop;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DcDefs::_16x4::DcLeft;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DcDefs::_16x4::Dc;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 4>;
+
+ // 16x8
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DcDefs::_16x8::DcTop;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DcDefs::_16x8::DcLeft;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DcDefs::_16x8::Dc;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 8>;
+
+ // 16x16
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DcDefs::_16x16::DcTop;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DcDefs::_16x16::DcLeft;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DcDefs::_16x16::Dc;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 16>;
+
+ // 16x32
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DcDefs::_16x32::DcTop;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DcDefs::_16x32::DcLeft;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DcDefs::_16x32::Dc;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 32>;
+
+ // 16x64
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DcDefs::_16x64::DcTop;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DcDefs::_16x64::DcLeft;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DcDefs::_16x64::Dc;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<16, 64>;
+
+ // 32x8
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DcDefs::_32x8::DcTop;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DcDefs::_32x8::DcLeft;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DcDefs::_32x8::Dc;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<32, 8>;
+
+ // 32x16
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DcDefs::_32x16::DcTop;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DcDefs::_32x16::DcLeft;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DcDefs::_32x16::Dc;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<32, 16>;
+
+ // 32x32
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DcDefs::_32x32::DcTop;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DcDefs::_32x32::DcLeft;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DcDefs::_32x32::Dc;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<32, 32>;
+
+ // 32x64
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DcDefs::_32x64::DcTop;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DcDefs::_32x64::DcLeft;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DcDefs::_32x64::Dc;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<32, 64>;
+
+ // 64x16
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DcDefs::_64x16::DcTop;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DcDefs::_64x16::DcLeft;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DcDefs::_64x16::Dc;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<64, 16>;
+
+ // 64x32
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DcDefs::_64x32::DcTop;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DcDefs::_64x32::DcLeft;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DcDefs::_64x32::Dc;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<64, 32>;
+
+ // 64x64
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DcDefs::_64x64::DcTop;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DcDefs::_64x64::DcLeft;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DcDefs::_64x64::Dc;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ Paeth16PlusxN_NEON<64, 64>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
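+// Overflow note: with 10-bit pixels (max value 1023), each uint16_t lane in
+// the partial sums below accumulates at most 16 values (16 * 1023 = 16368),
+// so the 16-bit accumulators cannot overflow.
+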
+// Add the given vectors lane-wise; the reduction across lanes is deferred to
+// Sum().
+inline uint16x8_t Add(const uint16x8_t val_0, const uint16x8_t val_1,
+ const uint16x8_t val_2, const uint16x8_t val_3) {
+ const uint16x8_t sum_0 = vaddq_u16(val_0, val_1);
+ const uint16x8_t sum_1 = vaddq_u16(val_2, val_3);
+ return vaddq_u16(sum_0, sum_1);
+}
+
+// Load and combine 16 uint16_t values.
+inline uint16x8_t LoadAndAdd16(const uint16_t* buf) {
+ const uint16x8_t val_0 = vld1q_u16(buf);
+ const uint16x8_t val_1 = vld1q_u16(buf + 8);
+ return vaddq_u16(val_0, val_1);
+}
+
+// Load and combine 32 uint16_t values.
+inline uint16x8_t LoadAndAdd32(const uint16_t* buf) {
+ const uint16x8_t val_0 = vld1q_u16(buf);
+ const uint16x8_t val_1 = vld1q_u16(buf + 8);
+ const uint16x8_t val_2 = vld1q_u16(buf + 16);
+ const uint16x8_t val_3 = vld1q_u16(buf + 24);
+ return Add(val_0, val_1, val_2, val_3);
+}
+
+// Load and combine 64 uint16_t values.
+inline uint16x8_t LoadAndAdd64(const uint16_t* buf) {
+ const uint16x8_t val_0 = vld1q_u16(buf);
+ const uint16x8_t val_1 = vld1q_u16(buf + 8);
+ const uint16x8_t val_2 = vld1q_u16(buf + 16);
+ const uint16x8_t val_3 = vld1q_u16(buf + 24);
+ const uint16x8_t val_4 = vld1q_u16(buf + 32);
+ const uint16x8_t val_5 = vld1q_u16(buf + 40);
+ const uint16x8_t val_6 = vld1q_u16(buf + 48);
+ const uint16x8_t val_7 = vld1q_u16(buf + 56);
+ const uint16x8_t sum_0 = Add(val_0, val_1, val_2, val_3);
+ const uint16x8_t sum_1 = Add(val_4, val_5, val_6, val_7);
+ return vaddq_u16(sum_0, sum_1);
+}
+
+// |ref_[01]| each point to 1 << |ref_[01]_size_log2| packed uint16_t values.
+// If |use_ref_1| is false, only |ref_0| is summed.
+inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0,
+ const int ref_0_size_log2, const bool use_ref_1,
+ const void* LIBGAV1_RESTRICT ref_1,
+ const int ref_1_size_log2) {
+ const auto* ref_0_u16 = static_cast<const uint16_t*>(ref_0);
+ const auto* ref_1_u16 = static_cast<const uint16_t*>(ref_1);
+ if (ref_0_size_log2 == 2) {
+ const uint16x4_t val_0 = vld1_u16(ref_0_u16);
+ if (use_ref_1) {
+ switch (ref_1_size_log2) {
+ case 2: { // 4x4
+ const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+ return Sum(vadd_u16(val_0, val_1));
+ }
+ case 3: { // 4x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+ const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
+ return Sum(vaddq_u16(sum_0, val_1));
+ }
+ case 4: { // 4x16
+ const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ }
+ // 4x1
+ return Sum(val_0);
+ }
+ if (ref_0_size_log2 == 3) {
+ const uint16x8_t val_0 = vld1q_u16(ref_0_u16);
+ if (use_ref_1) {
+ switch (ref_1_size_log2) {
+ case 2: { // 8x4
+ const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+ const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
+ return Sum(vaddq_u16(val_0, sum_1));
+ }
+ case 3: { // 8x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+ return Sum(vaddq_u16(val_0, val_1));
+ }
+ case 4: { // 8x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(val_0, sum_1));
+ }
+ case 5: { // 8x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(val_0, sum_1));
+ }
+ }
+ }
+ // 8x1
+ return Sum(val_0);
+ }
+ if (ref_0_size_log2 == 4) {
+ const uint16x8_t sum_0 = LoadAndAdd16(ref_0_u16);
+ if (use_ref_1) {
+ switch (ref_1_size_log2) {
+ case 2: { // 16x4
+ const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+ const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 3: { // 16x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, val_1));
+ }
+ case 4: { // 16x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 5: { // 16x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 6: { // 16x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ }
+ // 16x1
+ return Sum(sum_0);
+ }
+ if (ref_0_size_log2 == 5) {
+ const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u16);
+ if (use_ref_1) {
+ switch (ref_1_size_log2) {
+ case 3: { // 32x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, val_1));
+ }
+ case 4: { // 32x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 5: { // 32x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 6: { // 32x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ }
+ // 32x1
+ return Sum(sum_0);
+ }
+
+ assert(ref_0_size_log2 == 6);
+ const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u16);
+ if (use_ref_1) {
+ switch (ref_1_size_log2) {
+ case 4: { // 64x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 5: { // 64x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 6: { // 64x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ }
+ }
+ // 64x1
+ return Sum(sum_0);
+}
+
+template <int width, int height>
+inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
+ const uint32x2_t dc) {
+ auto* dest_u16 = static_cast<uint16_t*>(dest);
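+  // |stride| is in bytes; convert it to uint16_t elements.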
+ ptrdiff_t stride_u16 = stride >> 1;
+ const uint16x8_t dc_dup = vdupq_lane_u16(vreinterpret_u16_u32(dc), 0);
+ if (width == 4) {
+ int i = height - 1;
+ do {
+ vst1_u16(dest_u16, vget_low_u16(dc_dup));
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1_u16(dest_u16, vget_low_u16(dc_dup));
+ } else if (width == 8) {
+ int i = height - 1;
+ do {
+ vst1q_u16(dest_u16, dc_dup);
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1q_u16(dest_u16, dc_dup);
+ } else if (width == 16) {
+ int i = height - 1;
+ do {
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ } else if (width == 32) {
+ int i = height - 1;
+ do {
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ vst1q_u16(dest_u16 + 16, dc_dup);
+ vst1q_u16(dest_u16 + 24, dc_dup);
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ vst1q_u16(dest_u16 + 16, dc_dup);
+ vst1q_u16(dest_u16 + 24, dc_dup);
+ } else {
+ assert(width == 64);
+ int i = height - 1;
+ do {
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ vst1q_u16(dest_u16 + 16, dc_dup);
+ vst1q_u16(dest_u16 + 24, dc_dup);
+ vst1q_u16(dest_u16 + 32, dc_dup);
+ vst1q_u16(dest_u16 + 40, dc_dup);
+ vst1q_u16(dest_u16 + 48, dc_dup);
+ vst1q_u16(dest_u16 + 56, dc_dup);
+ dest_u16 += stride_u16;
+ } while (--i != 0);
+ vst1q_u16(dest_u16, dc_dup);
+ vst1q_u16(dest_u16 + 8, dc_dup);
+ vst1q_u16(dest_u16 + 16, dc_dup);
+ vst1q_u16(dest_u16 + 24, dc_dup);
+ vst1q_u16(dest_u16 + 32, dc_dup);
+ vst1q_u16(dest_u16 + 40, dc_dup);
+ vst1q_u16(dest_u16 + 48, dc_dup);
+ vst1q_u16(dest_u16 + 56, dc_dup);
+ }
+}
+
+struct DcDefs {
+ DcDefs() = delete;
+
+ using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
+ using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
+ using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
+ using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
+ using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
+ using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
+ using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
+ using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
+ using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
+ using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
+ using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
+ using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
+ using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
+ using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
+ using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
+ using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
+ using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
+ using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
+ using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
+};
+
+// IntraPredFuncs_NEON::Horizontal -- duplicate each left column value across
+// its row.
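+// The 16- and 32-wide variants unroll two rows per iteration; every block
+// height used with them is even, so the loop bound is safe.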
+
+template <int block_height>
+void Horizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x4_t row = vld1_dup_u16(left + y);
+ vst1_u16(dst16, row);
+ dst += stride;
+ } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x8_t row = vld1q_dup_u16(left + y);
+ vst1q_u16(dst16, row);
+ dst += stride;
+ } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ const uint16x8_t row0 = vld1q_dup_u16(left + y);
+ const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row0);
+ vst1q_u16(dst16 + 8, row0);
+ dst += stride;
+ dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row1);
+ vst1q_u16(dst16 + 8, row1);
+ dst += stride;
+ y += 2;
+ } while (y < block_height);
+}
+
+template <int block_height>
+void Horizontal32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ const uint16x8_t row0 = vld1q_dup_u16(left + y);
+ const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row0);
+ vst1q_u16(dst16 + 8, row0);
+ vst1q_u16(dst16 + 16, row0);
+ vst1q_u16(dst16 + 24, row0);
+ dst += stride;
+ dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row1);
+ vst1q_u16(dst16 + 8, row1);
+ vst1q_u16(dst16 + 16, row1);
+ vst1q_u16(dst16 + 24, row1);
+ dst += stride;
+ y += 2;
+ } while (y < block_height);
+}
+
+// IntraPredFuncs_NEON::Vertical -- copy top row to all rows
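+// Pixels here are uint16_t, but the copies operate on raw bytes, so a block
+// of width N moves 2 * N bytes per row.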
+
+template <int block_height>
+void Vertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x8_t row = vld1_u8(top);
+ int y = block_height;
+ do {
+ vst1_u8(dst, row);
+ dst += stride;
+ } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row = vld1q_u8(top);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row);
+ dst += stride;
+ } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int block_height>
+void Vertical32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ const uint8x16_t row2 = vld1q_u8(top + 32);
+ const uint8x16_t row3 = vld1q_u8(top + 48);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int block_height>
+void Vertical64xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ const uint8x16_t row2 = vld1q_u8(top + 32);
+ const uint8x16_t row3 = vld1q_u8(top + 48);
+ const uint8x16_t row4 = vld1q_u8(top + 64);
+ const uint8x16_t row5 = vld1q_u8(top + 80);
+ const uint8x16_t row6 = vld1q_u8(top + 96);
+ const uint8x16_t row7 = vld1q_u8(top + 112);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ vst1q_u8(dst + 64, row4);
+ vst1q_u8(dst + 80, row5);
+ vst1q_u8(dst + 96, row6);
+ vst1q_u8(dst + 112, row7);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ vst1q_u8(dst + 64, row4);
+ vst1q_u8(dst + 80, row5);
+ vst1q_u8(dst + 96, row6);
+ vst1q_u8(dst + 112, row7);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
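+// The same Paeth distance identities noted above Paeth4Or8xN_NEON apply here,
+// evaluated directly on uint16_t lanes.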
+template <int height>
+inline void Paeth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_ptr,
+ const void* LIBGAV1_RESTRICT const left_ptr) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+ const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
+
+ const uint16x4_t top_left = vdup_n_u16(top_row[-1]);
+ const uint16x4_t top_left_x2 = vshl_n_u16(top_left, 1);
+ const uint16x4_t top = vld1_u16(top_row);
+
+ for (int y = 0; y < height; ++y) {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x4_t left = vdup_n_u16(left_col[y]);
+
+ const uint16x4_t left_dist = vabd_u16(top, top_left);
+ const uint16x4_t top_dist = vabd_u16(left, top_left);
+ const uint16x4_t top_left_dist = vabd_u16(vadd_u16(top, left), top_left_x2);
+
+ const uint16x4_t left_le_top = vcle_u16(left_dist, top_dist);
+ const uint16x4_t left_le_top_left = vcle_u16(left_dist, top_left_dist);
+ const uint16x4_t top_le_top_left = vcle_u16(top_dist, top_left_dist);
+
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint16x4_t left_mask = vand_u16(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint16x4_t result = vbsl_u16(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint16x4_t left_or_top_mask = vorr_u16(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ result = vbsl_u16(left_or_top_mask, result, top_left);
+
+ vst1_u16(dst16, result);
+ dst += stride;
+ }
+}
+
+template <int height>
+inline void Paeth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_ptr,
+ const void* LIBGAV1_RESTRICT const left_ptr) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+ const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
+
+ const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+ const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1);
+ const uint16x8_t top = vld1q_u16(top_row);
+
+ for (int y = 0; y < height; ++y) {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x8_t left = vdupq_n_u16(left_col[y]);
+
+ const uint16x8_t left_dist = vabdq_u16(top, top_left);
+ const uint16x8_t top_dist = vabdq_u16(left, top_left);
+ const uint16x8_t top_left_dist =
+ vabdq_u16(vaddq_u16(top, left), top_left_x2);
+
+ const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
+ const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
+ const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
+
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint16x8_t result = vbslq_u16(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ result = vbslq_u16(left_or_top_mask, result, top_left);
+
+ vst1q_u16(dst16, result);
+ dst += stride;
+ }
+}
+
+// For 16xH and above.
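+// The top row is preloaded into width / 8 vectors and reused for every row.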
+template <int width, int height>
+inline void PaethWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_ptr,
+ const void* LIBGAV1_RESTRICT const left_ptr) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+ const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
+
+ const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+ const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1);
+
+ uint16x8_t top[width >> 3];
+ for (int i = 0; i < width >> 3; ++i) {
+ top[i] = vld1q_u16(top_row + (i << 3));
+ }
+
+ for (int y = 0; y < height; ++y) {
+ auto* dst_x = reinterpret_cast<uint16_t*>(dst);
+ const uint16x8_t left = vdupq_n_u16(left_col[y]);
+ const uint16x8_t top_dist = vabdq_u16(left, top_left);
+
+ for (int i = 0; i < (width >> 3); ++i) {
+ const uint16x8_t left_dist = vabdq_u16(top[i], top_left);
+ const uint16x8_t top_left_dist =
+ vabdq_u16(vaddq_u16(top[i], left), top_left_x2);
+
+ const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
+ const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
+ const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
+
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint16x8_t result = vbslq_u16(left_mask, left, top[i]);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ result = vbslq_u16(left_or_top_mask, result, top_left);
+
+ vst1q_u16(dst_x, result);
+ dst_x += 8;
+ }
+ dst += stride;
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DcDefs::_4x4::DcTop;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DcDefs::_4x4::DcLeft;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DcDefs::_4x4::Dc;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+ Vertical4xH_NEON<4>;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ Paeth4xH_NEON<4>;
+
+ // 4x8
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DcDefs::_4x8::DcTop;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DcDefs::_4x8::DcLeft;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DcDefs::_4x8::Dc;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ Horizontal4xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+ Vertical4xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ Paeth4xH_NEON<8>;
+
+ // 4x16
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DcDefs::_4x16::DcTop;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DcDefs::_4x16::DcLeft;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DcDefs::_4x16::Dc;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ Horizontal4xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+ Vertical4xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ Paeth4xH_NEON<16>;
+
+ // 8x4
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DcDefs::_8x4::DcTop;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DcDefs::_8x4::DcLeft;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DcDefs::_8x4::Dc;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+ Vertical8xH_NEON<4>;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ Paeth8xH_NEON<4>;
+
+ // 8x8
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DcDefs::_8x8::DcTop;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DcDefs::_8x8::DcLeft;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DcDefs::_8x8::Dc;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ Horizontal8xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+ Vertical8xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ Paeth8xH_NEON<8>;
+
+ // 8x16
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DcDefs::_8x16::DcTop;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DcDefs::_8x16::DcLeft;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DcDefs::_8x16::Dc;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+ Vertical8xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ Paeth8xH_NEON<16>;
+
+ // 8x32
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DcDefs::_8x32::DcTop;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DcDefs::_8x32::DcLeft;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DcDefs::_8x32::Dc;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ Horizontal8xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+ Vertical8xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ Paeth8xH_NEON<32>;
+
+ // 16x4
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DcDefs::_16x4::DcTop;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DcDefs::_16x4::DcLeft;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DcDefs::_16x4::Dc;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+ Vertical16xH_NEON<4>;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ PaethWxH_NEON<16, 4>;
+
+ // 16x8
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DcDefs::_16x8::DcTop;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DcDefs::_16x8::DcLeft;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DcDefs::_16x8::Dc;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ Horizontal16xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+ Vertical16xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ PaethWxH_NEON<16, 8>;
+
+ // 16x16
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DcDefs::_16x16::DcTop;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DcDefs::_16x16::DcLeft;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DcDefs::_16x16::Dc;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+ Vertical16xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ PaethWxH_NEON<16, 16>;
+
+ // 16x32
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DcDefs::_16x32::DcTop;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DcDefs::_16x32::DcLeft;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DcDefs::_16x32::Dc;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+ Vertical16xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ PaethWxH_NEON<16, 32>;
+
+ // 16x64
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DcDefs::_16x64::DcTop;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DcDefs::_16x64::DcLeft;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DcDefs::_16x64::Dc;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+ Vertical16xH_NEON<64>;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ PaethWxH_NEON<16, 64>;
+
+ // 32x8
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DcDefs::_32x8::DcTop;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DcDefs::_32x8::DcLeft;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DcDefs::_32x8::Dc;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+ Vertical32xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ PaethWxH_NEON<32, 8>;
+
+ // 32x16
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DcDefs::_32x16::DcTop;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DcDefs::_32x16::DcLeft;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DcDefs::_32x16::Dc;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+ Vertical32xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ PaethWxH_NEON<32, 16>;
+
+ // 32x32
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DcDefs::_32x32::DcTop;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DcDefs::_32x32::DcLeft;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DcDefs::_32x32::Dc;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+ Vertical32xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ PaethWxH_NEON<32, 32>;
+
+ // 32x64
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DcDefs::_32x64::DcTop;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DcDefs::_32x64::DcLeft;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DcDefs::_32x64::Dc;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ Horizontal32xH_NEON<64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+ Vertical32xH_NEON<64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ PaethWxH_NEON<32, 64>;
+
+ // 64x16
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DcDefs::_64x16::DcTop;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DcDefs::_64x16::DcLeft;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DcDefs::_64x16::Dc;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+ Vertical64xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ PaethWxH_NEON<64, 16>;
+
+ // 64x32
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DcDefs::_64x32::DcTop;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DcDefs::_64x32::DcLeft;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DcDefs::_64x32::Dc;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+ Vertical64xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ PaethWxH_NEON<64, 32>;
+
+ // 64x64
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DcDefs::_64x64::DcTop;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DcDefs::_64x64::DcLeft;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DcDefs::_64x64::Dc;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+ Vertical64xH_NEON<64>;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ PaethWxH_NEON<64, 64>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors.
+// See the defines below for specifics. These functions are not thread-safe.
+void IntraPredInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
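+// Each define marks the corresponding predictor as implemented in NEON so the
+// dsp tables do not fall back to the C version for that function.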
+// 4x4
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x64
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x64
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 64x16
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 64x32
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 64x64
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 10 bit
+// 4x4
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 16x64
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 32x64
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 64x16
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 64x32
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
+
+// 64x64
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_smooth.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Note these constants are duplicated from intrapred.cc to allow the compiler
+// to have visibility of the values. This helps reduce loads and aids the
+// creation of the inverse weights.
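+// The table packs the weights for each block dimension back to back: 4
+// entries for dimension 4, then 8 for dimension 8, and so on. The run for
+// dimension d starts at offset d - 4 because the preceding runs sum to d - 4
+// (e.g. 4 + 8 = 16 - 4), which is why the lookups below add width - 4 or
+// height - 4 to the base pointer.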
+constexpr uint8_t kSmoothWeights[] = {
+#include "src/dsp/smooth_weights.inc"
+};
+
+// 256 - v = vneg_s8(v)
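+// The weights are nonzero and at most 255, so 256 - v fits back into a uint8
+// and equals the negation mod 256. For example, v = 149 (0x95) negates to
+// 107 (0x6B) = 256 - 149.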
+inline uint8x8_t NegateS8(const uint8x8_t v) {
+ return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
+}
+
+template <int height>
+void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ constexpr int width = 4;
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t top_right = top[width - 1];
+ const uint8_t bottom_left = left[height - 1];
+ const uint8_t* const weights_y = kSmoothWeights + height - 4;
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t top_v = Load4(top);
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+ const uint8x8_t weights_x_v = Load4(kSmoothWeights + width - 4);
+ const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left_v = vdup_n_u8(left[y]);
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_bl, weights_y_v, top_v);
+ const uint16x8_t weighted_left_tr =
+ vmlal_u8(weighted_tr, weights_x_v, left_v);
+ // Maximum value of each parameter: 0xFF00
+ const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+ const uint8x8_t result = vrshrn_n_u16(avg, kSmoothWeightScale);
+
+ StoreLo4(dst, result);
+ dst += stride;
+ }
+}
+
+inline uint8x8_t CalculatePred(const uint16x8_t weighted_top_bl,
+ const uint16x8_t weighted_left_tr) {
+ // Maximum value of each parameter: 0xFF00
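+  // (0xFF00 = 255 * 256: each operand is a 256-scale blend of two pixels.)
+  // vhaddq_u16 halves the sum without intermediate overflow, and the rounding
+  // narrow by kSmoothWeightScale (8) completes the net shift of 9 that brings
+  // the four weighted terms back to pixel range.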
+ const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+ return vrshrn_n_u16(avg, kSmoothWeightScale);
+}
+
+inline uint8x8_t CalculateWeightsAndPred(
+ const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
+ const uint8x8_t bottom_left, const uint8x8_t weights_x,
+ const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
+ const uint16x8_t weighted_top = vmull_u8(weights_y, top);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
+ const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
+ return CalculatePred(weighted_top_bl, weighted_left_tr);
+}
+
+template <int height>
+void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ constexpr int width = 8;
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t top_right = top[width - 1];
+ const uint8_t bottom_left = left[height - 1];
+ const uint8_t* const weights_y = kSmoothWeights + height - 4;
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t top_v = vld1_u8(top);
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+ const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4);
+ const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left_v = vdup_n_u8(left[y]);
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
+ const uint8x8_t result =
+ CalculateWeightsAndPred(top_v, left_v, weighted_tr, bottom_left_v,
+ weights_x_v, scaled_weights_y, weights_y_v);
+
+ vst1_u8(dst, result);
+ dst += stride;
+ }
+}
+
+inline uint8x16_t CalculateWeightsAndPred(
+ const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
+ const uint8x8_t weights_y, const uint8x16_t weights_x,
+ const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
+ const uint16x8_t weighted_top_bl_low =
+ vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+ const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+ const uint16x8_t weighted_left_tr_low =
+ vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+ const uint8x8_t result_low =
+ CalculatePred(weighted_top_bl_low, weighted_left_tr_low);
+
+ const uint16x8_t weighted_top_bl_high =
+ vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
+ const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
+ const uint16x8_t weighted_left_tr_high =
+ vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
+ const uint8x8_t result_high =
+ CalculatePred(weighted_top_bl_high, weighted_left_tr_high);
+
+ return vcombine_u8(result_low, result_high);
+}
+
+// 256 - v = vneg_s8(v)
+inline uint8x16_t NegateS8(const uint8x16_t v) {
+ return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
+}
+
+template <int width, int height>
+void Smooth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t top_right = top[width - 1];
+ const uint8_t bottom_left = left[height - 1];
+ const uint8_t* const weights_y = kSmoothWeights + height - 4;
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ uint8x16_t top_v[4];
+ top_v[0] = vld1q_u8(top);
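+  // |width| is a template parameter, so these branches (and the matching ones
+  // below) are resolved at compile time.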
+ if (width > 16) {
+ top_v[1] = vld1q_u8(top + 16);
+ if (width == 64) {
+ top_v[2] = vld1q_u8(top + 32);
+ top_v[3] = vld1q_u8(top + 48);
+ }
+ }
+
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
+ uint8x16_t weights_x_v[4];
+ weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4);
+ if (width > 16) {
+ weights_x_v[1] = vld1q_u8(kSmoothWeights + width + 16 - 4);
+ if (width == 64) {
+ weights_x_v[2] = vld1q_u8(kSmoothWeights + width + 32 - 4);
+ weights_x_v[3] = vld1q_u8(kSmoothWeights + width + 48 - 4);
+ }
+ }
+
+ uint8x16_t scaled_weights_x[4];
+ scaled_weights_x[0] = NegateS8(weights_x_v[0]);
+ if (width > 16) {
+ scaled_weights_x[1] = NegateS8(weights_x_v[1]);
+ if (width == 64) {
+ scaled_weights_x[2] = NegateS8(weights_x_v[2]);
+ scaled_weights_x[3] = NegateS8(weights_x_v[3]);
+ }
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left_v = vdup_n_u8(left[y]);
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+
+ vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v,
+ weights_y_v, weights_x_v[0],
+ scaled_weights_x[0], weighted_bl));
+
+ if (width > 16) {
+ vst1q_u8(dst + 16, CalculateWeightsAndPred(
+ top_v[1], left_v, top_right_v, weights_y_v,
+ weights_x_v[1], scaled_weights_x[1], weighted_bl));
+ if (width == 64) {
+ vst1q_u8(dst + 32,
+ CalculateWeightsAndPred(top_v[2], left_v, top_right_v,
+ weights_y_v, weights_x_v[2],
+ scaled_weights_x[2], weighted_bl));
+ vst1q_u8(dst + 48,
+ CalculateWeightsAndPred(top_v[3], left_v, top_right_v,
+ weights_y_v, weights_x_v[3],
+ scaled_weights_x[3], weighted_bl));
+ }
+ }
+
+ dst += stride;
+ }
+}
+
+template <int width, int height>
+void SmoothVertical4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t bottom_left = left[height - 1];
+ const uint8_t* const weights_y = kSmoothWeights + height - 4;
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ uint8x8_t top_v;
+ if (width == 4) {
+ top_v = Load4(top);
+ } else { // width == 8
+ top_v = vld1_u8(top);
+ }
+
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
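+  // Vertical smoothing blends each top pixel with bottom_left using only the
+  // y weight: pred = (w_y * top + (256 - w_y) * bottom_left + 128) >> 8.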
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
+
+ const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v);
+ const uint8x8_t pred = vrshrn_n_u16(weighted_top_bl, kSmoothWeightScale);
+
+ if (width == 4) {
+ StoreLo4(dst, pred);
+ } else { // width == 8
+ vst1_u8(dst, pred);
+ }
+ dst += stride;
+ }
+}
+
+inline uint8x16_t CalculateVerticalWeightsAndPred(
+ const uint8x16_t top, const uint8x8_t weights_y,
+ const uint16x8_t weighted_bl) {
+ const uint16x8_t pred_low =
+ vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+ const uint16x8_t pred_high =
+ vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
+ const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
+ const uint8x8_t pred_scaled_high =
+ vrshrn_n_u16(pred_high, kSmoothWeightScale);
+ return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+template <int width, int height>
+void SmoothVertical16PlusxN_NEON(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t bottom_left = left[height - 1];
+ const uint8_t* const weights_y = kSmoothWeights + height - 4;
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ uint8x16_t top_v[4];
+ top_v[0] = vld1q_u8(top);
+ if (width > 16) {
+ top_v[1] = vld1q_u8(top + 16);
+ if (width == 64) {
+ top_v[2] = vld1q_u8(top + 32);
+ top_v[3] = vld1q_u8(top + 48);
+ }
+ }
+
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+
+ const uint8x16_t pred_0 =
+ CalculateVerticalWeightsAndPred(top_v[0], weights_y_v, weighted_bl);
+ vst1q_u8(dst, pred_0);
+
+ if (width > 16) {
+ const uint8x16_t pred_1 =
+ CalculateVerticalWeightsAndPred(top_v[1], weights_y_v, weighted_bl);
+ vst1q_u8(dst + 16, pred_1);
+
+ if (width == 64) {
+ const uint8x16_t pred_2 =
+ CalculateVerticalWeightsAndPred(top_v[2], weights_y_v, weighted_bl);
+ vst1q_u8(dst + 32, pred_2);
+
+ const uint8x16_t pred_3 =
+ CalculateVerticalWeightsAndPred(top_v[3], weights_y_v, weighted_bl);
+ vst1q_u8(dst + 48, pred_3);
+ }
+ }
+
+ dst += stride;
+ }
+}
+
+template <int width, int height>
+void SmoothHorizontal4Or8xN_NEON(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t top_right = top[width - 1];
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+  // Over-reads four weights for 4xN (the 8-byte load spills into the entries
+  // for width 8) but remains within the array.
+ const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4);
+ const uint8x8_t scaled_weights_x = NegateS8(weights_x);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+
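+  // Horizontal smoothing blends each left pixel with top_right using only
+  // the x weights: pred = (w_x * left + (256 - w_x) * top_right + 128) >> 8.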
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left_v = vdup_n_u8(left[y]);
+ const uint16x8_t weighted_left_tr =
+ vmlal_u8(weighted_tr, weights_x, left_v);
+ const uint8x8_t pred = vrshrn_n_u16(weighted_left_tr, kSmoothWeightScale);
+
+ if (width == 4) {
+ StoreLo4(dst, pred);
+ } else { // width == 8
+ vst1_u8(dst, pred);
+ }
+ dst += stride;
+ }
+}
+
+inline uint8x16_t CalculateHorizontalWeightsAndPred(
+ const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
+ const uint8x16_t scaled_weights_x) {
+ const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+ const uint16x8_t weighted_left_tr_low =
+ vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+ const uint8x8_t pred_scaled_low =
+ vrshrn_n_u16(weighted_left_tr_low, kSmoothWeightScale);
+
+ const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
+ const uint16x8_t weighted_left_tr_high =
+ vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
+ const uint8x8_t pred_scaled_high =
+ vrshrn_n_u16(weighted_left_tr_high, kSmoothWeightScale);
+
+ return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+template <int width, int height>
+void SmoothHorizontal16PlusxN_NEON(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const uint8_t top_right = top[width - 1];
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+
+ uint8x16_t weights_x[4];
+ weights_x[0] = vld1q_u8(kSmoothWeights + width - 4);
+ if (width > 16) {
+ weights_x[1] = vld1q_u8(kSmoothWeights + width + 16 - 4);
+ if (width == 64) {
+ weights_x[2] = vld1q_u8(kSmoothWeights + width + 32 - 4);
+ weights_x[3] = vld1q_u8(kSmoothWeights + width + 48 - 4);
+ }
+ }
+
+ uint8x16_t scaled_weights_x[4];
+ scaled_weights_x[0] = NegateS8(weights_x[0]);
+ if (width > 16) {
+ scaled_weights_x[1] = NegateS8(weights_x[1]);
+ if (width == 64) {
+ scaled_weights_x[2] = NegateS8(weights_x[2]);
+ scaled_weights_x[3] = NegateS8(weights_x[3]);
+ }
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint8x8_t left_v = vdup_n_u8(left[y]);
+
+ const uint8x16_t pred_0 = CalculateHorizontalWeightsAndPred(
+ left_v, top_right_v, weights_x[0], scaled_weights_x[0]);
+ vst1q_u8(dst, pred_0);
+
+ if (width > 16) {
+ const uint8x16_t pred_1 = CalculateHorizontalWeightsAndPred(
+ left_v, top_right_v, weights_x[1], scaled_weights_x[1]);
+ vst1q_u8(dst + 16, pred_1);
+
+ if (width == 64) {
+ const uint8x16_t pred_2 = CalculateHorizontalWeightsAndPred(
+ left_v, top_right_v, weights_x[2], scaled_weights_x[2]);
+ vst1q_u8(dst + 32, pred_2);
+
+ const uint8x16_t pred_3 = CalculateHorizontalWeightsAndPred(
+ left_v, top_right_v, weights_x[3], scaled_weights_x[3]);
+ vst1q_u8(dst + 48, pred_3);
+ }
+ }
+ dst += stride;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ // 4x4
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ Smooth4xN_NEON<4>;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<4, 4>;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<4, 4>;
+
+ // 4x8
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ Smooth4xN_NEON<8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<4, 8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<4, 8>;
+
+ // 4x16
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ Smooth4xN_NEON<16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<4, 16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<4, 16>;
+
+ // 8x4
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ Smooth8xN_NEON<4>;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<8, 4>;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<8, 4>;
+
+ // 8x8
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ Smooth8xN_NEON<8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<8, 8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<8, 8>;
+
+ // 8x16
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ Smooth8xN_NEON<16>;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<8, 16>;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<8, 16>;
+
+ // 8x32
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ Smooth8xN_NEON<32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical4Or8xN_NEON<8, 32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4Or8xN_NEON<8, 32>;
+
+ // 16x4
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 4>;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 4>;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 4>;
+
+ // 16x8
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 8>;
+
+ // 16x16
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 16>;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 16>;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 16>;
+
+ // 16x32
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 32>;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 32>;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 32>;
+
+ // 16x64
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<16, 64>;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<16, 64>;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<16, 64>;
+
+ // 32x8
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<32, 8>;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<32, 8>;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<32, 8>;
+
+ // 32x16
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<32, 16>;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<32, 16>;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<32, 16>;
+
+ // 32x32
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<32, 32>;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<32, 32>;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<32, 32>;
+
+ // 32x64
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<32, 64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<32, 64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<32, 64>;
+
+ // 64x16
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<64, 16>;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<64, 16>;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<64, 16>;
+
+ // 64x32
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<64, 32>;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<64, 32>;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<64, 32>;
+
+ // 64x64
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ Smooth16PlusxN_NEON<64, 64>;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical16PlusxN_NEON<64, 64>;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16PlusxN_NEON<64, 64>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Note these constants are duplicated from intrapred.cc to allow the compiler
+// to have visibility of the values. This helps reduce loads and aids the
+// creation of the inverse weights.
+constexpr uint16_t kSmoothWeights[] = {
+#include "src/dsp/smooth_weights.inc"
+};
+
+// 256 - v = vneg_s8(v)
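+// Negating the bytes of a uint16 lane only works because the weights are
+// nonzero and at most 255: the high byte of v is zero (and stays zero under
+// vneg_s8), while the low byte becomes (256 - v) & 0xFF, which is exactly
+// 256 - v for v in [1, 255].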
+inline uint16x4_t NegateS8(const uint16x4_t v) {
+ return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
+}
+
+template <int height>
+void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint16_t*>(top_row);
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ const uint16_t top_right = top[3];
+ const uint16_t bottom_left = left[height - 1];
+ const uint16_t* const weights_y = kSmoothWeights + height - 4;
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint16x4_t top_v = vld1_u16(top);
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+ const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights);
+ const uint16x4_t scaled_weights_x = NegateS8(weights_x_v);
+ const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
+
+ for (int y = 0; y < height; ++y) {
+ // Each variable in the running summation is named for the last item to be
+ // accumulated.
+ const uint32x4_t weighted_top =
+ vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
+ const uint32x4_t weighted_left =
+ vmlal_n_u16(weighted_top, weights_x_v, left[y]);
+ const uint32x4_t weighted_bl =
+ vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);
+
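+    // All four weighted terms are accumulated into one 32-bit sum, so the
+    // rounding shift is kSmoothWeightScale + 1 (a divide by 512) rather than
+    // the halving-add-plus-8-bit-shift used in the 8bpp path.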
+ const uint16x4_t pred = vrshrn_n_u32(weighted_bl, kSmoothWeightScale + 1);
+ vst1_u16(reinterpret_cast<uint16_t*>(dst), pred);
+ dst += stride;
+ }
+}
+
+// Common code between 8xH and [16|32|64]xH.
+inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst,
+ const uint32x4_t weighted_corners_low,
+ const uint32x4_t weighted_corners_high,
+ const uint16x4x2_t top_vals,
+ const uint16x4x2_t weights_x, const uint16_t left_y,
+ const uint16_t weight_y) {
+ // Each variable in the running summation is named for the last item to be
+ // accumulated.
+ const uint32x4_t weighted_top_low =
+ vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
+ const uint32x4_t weighted_edges_low =
+ vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);
+
+ const uint16x4_t pred_low =
+ vrshrn_n_u32(weighted_edges_low, kSmoothWeightScale + 1);
+ vst1_u16(dst, pred_low);
+
+ const uint32x4_t weighted_top_high =
+ vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
+ const uint32x4_t weighted_edges_high =
+ vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);
+
+ const uint16x4_t pred_high =
+ vrshrn_n_u32(weighted_edges_high, kSmoothWeightScale + 1);
+ vst1_u16(dst + 4, pred_high);
+}
+
+template <int height>
+void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint16_t*>(top_row);
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ const uint16_t top_right = top[7];
+ const uint16_t bottom_left = left[height - 1];
+ const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint16x4x2_t top_vals = {vld1_u16(top), vld1_u16(top + 4)};
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+ const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
+ vld1_u16(kSmoothWeights + 8)};
+ const uint32x4_t weighted_tr_low =
+ vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
+ const uint32x4_t weighted_tr_high =
+ vmull_n_u16(NegateS8(weights_x.val[1]), top_right);
+
+ for (int y = 0; y < height; ++y) {
+ const uint32x4_t weighted_bl =
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+ const uint32x4_t weighted_corners_low =
+ vaddq_u32(weighted_bl, weighted_tr_low);
+ const uint32x4_t weighted_corners_high =
+ vaddq_u32(weighted_bl, weighted_tr_high);
+ CalculatePred8(reinterpret_cast<uint16_t*>(dst), weighted_corners_low,
+ weighted_corners_high, top_vals, weights_x, left[y],
+ weights_y[y]);
+ dst += stride;
+ }
+}
+
+// For width 16 and above.
+template <int width, int height>
+void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint16_t*>(top_row);
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ const uint16_t top_right = top[width - 1];
+ const uint16_t bottom_left = left[height - 1];
+ const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ // Precompute weighted values that don't vary with |y|.
+ uint32x4_t weighted_tr_low[width >> 3];
+ uint32x4_t weighted_tr_high[width >> 3];
+ for (int i = 0; i < width >> 3; ++i) {
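+    // Each iteration covers 8 pixels, split into two 4-lane halves.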
+ const int x = i << 3;
+ const uint16x4_t weights_x_low = vld1_u16(kSmoothWeights + width - 4 + x);
+ weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low), top_right);
+ const uint16x4_t weights_x_high = vld1_u16(kSmoothWeights + width + x);
+ weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high), top_right);
+ }
+
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+ for (int y = 0; y < height; ++y) {
+ const uint32x4_t weighted_bl =
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+ auto* dst_x = reinterpret_cast<uint16_t*>(dst);
+ for (int i = 0; i < width >> 3; ++i) {
+ const int x = i << 3;
+ const uint16x4x2_t top_vals = {vld1_u16(top + x), vld1_u16(top + x + 4)};
+ const uint32x4_t weighted_corners_low =
+ vaddq_u32(weighted_bl, weighted_tr_low[i]);
+ const uint32x4_t weighted_corners_high =
+ vaddq_u32(weighted_bl, weighted_tr_high[i]);
+ // Accumulate weighted edge values and store.
+ const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + width - 4 + x),
+ vld1_u16(kSmoothWeights + width + x)};
+ CalculatePred8(dst_x, weighted_corners_low, weighted_corners_high,
+ top_vals, weights_x, left[y], weights_y[y]);
+ dst_x += 8;
+ }
+ dst += stride;
+ }
+}
+
+template <int height>
+void SmoothVertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint16_t*>(top_row);
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ const uint16_t bottom_left = left[height - 1];
+ const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint16x4_t top_v = vld1_u16(top);
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+
+ for (int y = 0; y < height; ++y) {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint32x4_t weighted_bl =
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+ const uint32x4_t weighted_top =
+ vmlal_n_u16(weighted_bl, top_v, weights_y[y]);
+ vst1_u16(dst16, vrshrn_n_u32(weighted_top, kSmoothWeightScale));
+
+ dst += stride;
+ }
+}
+
+template <int height>
+void SmoothVertical8xH_NEON(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint16_t*>(top_row);
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ const uint16_t bottom_left = left[height - 1];
+ const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint16x4_t top_low = vld1_u16(top);
+ const uint16x4_t top_high = vld1_u16(top + 4);
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+
+ for (int y = 0; y < height; ++y) {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint32x4_t weighted_bl =
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+
+ const uint32x4_t weighted_top_low =
+ vmlal_n_u16(weighted_bl, top_low, weights_y[y]);
+ vst1_u16(dst16, vrshrn_n_u32(weighted_top_low, kSmoothWeightScale));
+
+ const uint32x4_t weighted_top_high =
+ vmlal_n_u16(weighted_bl, top_high, weights_y[y]);
+ vst1_u16(dst16 + 4, vrshrn_n_u32(weighted_top_high, kSmoothWeightScale));
+ dst += stride;
+ }
+}
+
+// For width 16 and above.
+template <int width, int height>
+void SmoothVerticalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint16_t*>(top_row);
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ const uint16_t bottom_left = left[height - 1];
+ const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ uint16x4x2_t top_vals[width >> 3];
+ for (int i = 0; i < width >> 3; ++i) {
+ const int x = i << 3;
+ top_vals[i] = {vld1_u16(top + x), vld1_u16(top + x + 4)};
+ }
+
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+ for (int y = 0; y < height; ++y) {
+ const uint32x4_t weighted_bl =
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+
+ auto* dst_x = reinterpret_cast<uint16_t*>(dst);
+ for (int i = 0; i < width >> 3; ++i) {
+ const uint32x4_t weighted_top_low =
+ vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]);
+ vst1_u16(dst_x, vrshrn_n_u32(weighted_top_low, kSmoothWeightScale));
+
+ const uint32x4_t weighted_top_high =
+ vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]);
+ vst1_u16(dst_x + 4, vrshrn_n_u32(weighted_top_high, kSmoothWeightScale));
+ dst_x += 8;
+ }
+ dst += stride;
+ }
+}
+
+template <int height>
+void SmoothHorizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint16_t*>(top_row);
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ const uint16_t top_right = top[3];
+
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint16x4_t weights_x = vld1_u16(kSmoothWeights);
+ const uint16x4_t scaled_weights_x = NegateS8(weights_x);
+
+ const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
+ for (int y = 0; y < height; ++y) {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint32x4_t weighted_left =
+ vmlal_n_u16(weighted_tr, weights_x, left[y]);
+ vst1_u16(dst16, vrshrn_n_u32(weighted_left, kSmoothWeightScale));
+ dst += stride;
+ }
+}
+
+template <int height>
+void SmoothHorizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint16_t*>(top_row);
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ const uint16_t top_right = top[7];
+
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
+ vld1_u16(kSmoothWeights + 8)};
+
+ const uint32x4_t weighted_tr_low =
+ vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
+ const uint32x4_t weighted_tr_high =
+ vmull_n_u16(NegateS8(weights_x.val[1]), top_right);
+
+ for (int y = 0; y < height; ++y) {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16_t left_y = left[y];
+ const uint32x4_t weighted_left_low =
+ vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
+ vst1_u16(dst16, vrshrn_n_u32(weighted_left_low, kSmoothWeightScale));
+
+ const uint32x4_t weighted_left_high =
+ vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
+ vst1_u16(dst16 + 4, vrshrn_n_u32(weighted_left_high, kSmoothWeightScale));
+ dst += stride;
+ }
+}
+
+// For width 16 and above.
+template <int width, int height>
+void SmoothHorizontalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint16_t*>(top_row);
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ const uint16_t top_right = top[width - 1];
+
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ uint16x4_t weights_x_low[width >> 3];
+ uint16x4_t weights_x_high[width >> 3];
+ uint32x4_t weighted_tr_low[width >> 3];
+ uint32x4_t weighted_tr_high[width >> 3];
+ for (int i = 0; i < width >> 3; ++i) {
+ const int x = i << 3;
+ weights_x_low[i] = vld1_u16(kSmoothWeights + width - 4 + x);
+ weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low[i]), top_right);
+ weights_x_high[i] = vld1_u16(kSmoothWeights + width + x);
+ weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high[i]), top_right);
+ }
+
+ for (int y = 0; y < height; ++y) {
+ auto* dst_x = reinterpret_cast<uint16_t*>(dst);
+ const uint16_t left_y = left[y];
+ for (int i = 0; i < width >> 3; ++i) {
+ const uint32x4_t weighted_left_low =
+ vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y);
+ vst1_u16(dst_x, vrshrn_n_u32(weighted_left_low, kSmoothWeightScale));
+
+ const uint32x4_t weighted_left_high =
+ vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y);
+ vst1_u16(dst_x + 4, vrshrn_n_u32(weighted_left_high, kSmoothWeightScale));
+ dst_x += 8;
+ }
+ dst += stride;
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ // 4x4
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ Smooth4xH_NEON<4>;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical4xH_NEON<4>;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4xH_NEON<4>;
+
+ // 4x8
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ Smooth4xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical4xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4xH_NEON<8>;
+
+ // 4x16
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ Smooth4xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical4xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4xH_NEON<16>;
+
+ // 8x4
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ Smooth8xH_NEON<4>;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical8xH_NEON<4>;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8xH_NEON<4>;
+
+ // 8x8
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ Smooth8xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical8xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8xH_NEON<8>;
+
+ // 8x16
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ Smooth8xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical8xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8xH_NEON<16>;
+
+ // 8x32
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ Smooth8xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical8xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8xH_NEON<32>;
+
+ // 16x4
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ SmoothWxH_NEON<16, 4>;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ SmoothVerticalWxH_NEON<16, 4>;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontalWxH_NEON<16, 4>;
+
+ // 16x8
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ SmoothWxH_NEON<16, 8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ SmoothVerticalWxH_NEON<16, 8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontalWxH_NEON<16, 8>;
+
+ // 16x16
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ SmoothWxH_NEON<16, 16>;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ SmoothVerticalWxH_NEON<16, 16>;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontalWxH_NEON<16, 16>;
+
+ // 16x32
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ SmoothWxH_NEON<16, 32>;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ SmoothVerticalWxH_NEON<16, 32>;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontalWxH_NEON<16, 32>;
+
+ // 16x64
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ SmoothWxH_NEON<16, 64>;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ SmoothVerticalWxH_NEON<16, 64>;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontalWxH_NEON<16, 64>;
+
+ // 32x8
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ SmoothWxH_NEON<32, 8>;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ SmoothVerticalWxH_NEON<32, 8>;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontalWxH_NEON<32, 8>;
+
+ // 32x16
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ SmoothWxH_NEON<32, 16>;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ SmoothVerticalWxH_NEON<32, 16>;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontalWxH_NEON<32, 16>;
+
+ // 32x32
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ SmoothWxH_NEON<32, 32>;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ SmoothVerticalWxH_NEON<32, 32>;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontalWxH_NEON<32, 32>;
+
+ // 32x64
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ SmoothWxH_NEON<32, 64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ SmoothVerticalWxH_NEON<32, 64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontalWxH_NEON<32, 64>;
+
+ // 64x16
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ SmoothWxH_NEON<64, 16>;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ SmoothVerticalWxH_NEON<64, 16>;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontalWxH_NEON<64, 16>;
+
+ // 64x32
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ SmoothWxH_NEON<64, 32>;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ SmoothVerticalWxH_NEON<64, 32>;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontalWxH_NEON<64, 32>;
+
+ // 64x64
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ SmoothWxH_NEON<64, 64>;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ SmoothVerticalWxH_NEON<64, 64>;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontalWxH_NEON<64, 64>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredSmoothInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredSmoothInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+// 10bpp
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+//------------------------------------------------------------------------------
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4],
+ int32x4_t out[4]) {
+ // in:
+ // 00 01 02 03
+ // 10 11 12 13
+ // 20 21 22 23
+ // 30 31 32 33
+
+ // 00 10 02 12 a.val[0]
+ // 01 11 03 13 a.val[1]
+ // 20 30 22 32 b.val[0]
+ // 21 31 23 33 b.val[1]
+ const int32x4x2_t a = vtrnq_s32(in[0], in[1]);
+ const int32x4x2_t b = vtrnq_s32(in[2], in[3]);
+ out[0] = vextq_s32(vextq_s32(a.val[0], a.val[0], 2), b.val[0], 2);
+ out[1] = vextq_s32(vextq_s32(a.val[1], a.val[1], 2), b.val[1], 2);
+ out[2] = vextq_s32(a.val[0], vextq_s32(b.val[0], b.val[0], 2), 2);
+ out[3] = vextq_s32(a.val[1], vextq_s32(b.val[1], b.val[1], 2), 2);
+ // out:
+ // 00 10 20 30
+ // 01 11 21 31
+ // 02 12 22 32
+ // 03 13 23 33
+}
+
+//------------------------------------------------------------------------------
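+// Writes |store_count| vectors of 4 lanes to consecutive rows of |dst|,
+// starting at column |idx|.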
+template <int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* LIBGAV1_RESTRICT dst,
+ int32_t stride, int32_t idx,
+ const int32x4_t* const s) {
+ assert(store_count % 4 == 0);
+ for (int i = 0; i < store_count; i += 4) {
+ vst1q_s32(&dst[i * stride + idx], s[i]);
+ vst1q_s32(&dst[(i + 1) * stride + idx], s[i + 1]);
+ vst1q_s32(&dst[(i + 2) * stride + idx], s[i + 2]);
+ vst1q_s32(&dst[(i + 3) * stride + idx], s[i + 3]);
+ }
+}
+
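+// Reads |load_count| vectors of 4 lanes from consecutive rows of |src|,
+// starting at column |idx|.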
+template <int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* LIBGAV1_RESTRICT src,
+ int32_t stride, int32_t idx, int32x4_t* x) {
+ assert(load_count % 4 == 0);
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = vld1q_s32(&src[i * stride + idx]);
+ x[i + 1] = vld1q_s32(&src[(i + 1) * stride + idx]);
+ x[i + 2] = vld1q_s32(&src[(i + 2) * stride + idx]);
+ x[i + 3] = vld1q_s32(&src[(i + 3) * stride + idx]);
+ }
+}
+
+// Butterfly rotate 4 values.
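+// Computes, with rounding,
+//   x = (a * cos128(angle) - b * sin128(angle)) >> 12
+//   y = (a * sin128(angle) + b * cos128(angle)) >> 12
+// and writes x to *a and y to *b, swapped when |flip| is set.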
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int32x4_t* a, int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ const int32x4_t acc_x = vmulq_n_s32(*a, cos128);
+ const int32x4_t acc_y = vmulq_n_s32(*a, sin128);
+ // The max range for the input is 18 bits. The cos128/sin128 values are 13
+ // bits, which leaves 1 bit of headroom for the add/subtract. For 10bpp, x/y
+ // will fit in a 32-bit lane.
+ const int32x4_t x0 = vmlsq_n_s32(acc_x, *b, sin128);
+ const int32x4_t y0 = vmlaq_n_s32(acc_y, *b, cos128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
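+// Same rotation for the case where *a is known to be zero on entry; the
+// multiplies involving *a are dropped.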
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int32x4_t* a,
+ int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ assert(sin128 <= 0xfff);
+ const int32x4_t x0 = vmulq_n_s32(*b, -sin128);
+ const int32x4_t y0 = vmulq_n_s32(*b, cos128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
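+// Same rotation for the case where *b is known to be zero on entry; the
+// multiplies involving *b are dropped.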
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int32x4_t* a,
+ int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ const int32x4_t x0 = vmulq_n_s32(*a, cos128);
+ const int32x4_t y0 = vmulq_n_s32(*a, sin128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
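+// Saturating butterfly: *a = a + b, *b = a - b, with the operands reversed
+// when |flip| is set.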
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+ bool flip) {
+ int32x4_t x, y;
+ if (flip) {
+ y = vqaddq_s32(*b, *a);
+ x = vqsubq_s32(*b, *a);
+ } else {
+ x = vqaddq_s32(*a, *b);
+ y = vqsubq_s32(*a, *b);
+ }
+ *a = x;
+ *b = y;
+}
+
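+// As above, but clamps both outputs to [min, max] to keep intermediate
+// values within the row/column range.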
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+ bool flip, const int32x4_t min,
+ const int32x4_t max) {
+ int32x4_t x, y;
+ if (flip) {
+ y = vqaddq_s32(*b, *a);
+ x = vqsubq_s32(*b, *a);
+ } else {
+ x = vqaddq_s32(*a, *b);
+ y = vqsubq_s32(*a, *b);
+ }
+ *a = vmaxq_s32(vminq_s32(x, max), min);
+ *b = vmaxq_s32(vminq_s32(y, max), min);
+}
+
+using ButterflyRotationFunc = void (*)(int32x4_t* a, int32x4_t* b, int angle,
+ bool flip);
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
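+// Fast path for a row transform where only the DC coefficient is nonzero
+// (|adjusted_tx_height| == 1): dct reduces to scaling dst[0] by
+// cos128(32) / 4096, applying the row shift, and broadcasting the result
+// across the row.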
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t s0 = vbslq_s32(v_mask, v_src_round, v_src);
+ const int32_t cos128 = Cos128(32);
+ const int32x4_t xy = vqrdmulhq_n_s32(s0, cos128 << (31 - 12));
+ // vqrshlq_s32 will shift right if the shift value is negative.
+ const int32x4_t xy_shifted = vqrshlq_s32(xy, vdupq_n_s32(-row_shift));
+ // Clamp result to signed 16 bits.
+ const int32x4_t result = vmovl_s16(vqmovn_s32(xy_shifted));
+ if (width == 4) {
+ vst1q_s32(dst, result);
+ } else {
+ for (int i = 0; i < width; i += 4) {
+ vst1q_s32(dst, result);
+ dst += 4;
+ }
+ }
+ return true;
+}
+
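+// Column counterpart of DctDcOnly: scales the first row by cos128(32) / 4096
+// and copies it to the remaining |height| - 1 rows.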
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32_t cos128 = Cos128(32);
+
+ // Calculate dc values for the first row.
+ if (width == 4) {
+ const int32x4_t v_src = vld1q_s32(dst);
+ const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
+ vst1q_s32(dst, xy);
+ } else {
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&dst[i]);
+ const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
+ vst1q_s32(&dst[i], xy);
+ i += 4;
+ } while (i < width);
+ }
+
+ // Copy the first row to the rest of the block.
+ for (int y = 1; y < height; ++y) {
+ memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+ }
+ return true;
+}
+
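+// The sparse stage numbers in the Dct*Stages helpers below appear to follow
+// the stage numbering of the full dct64 flow graph (see Dct64_NEON), of
+// which the smaller transforms reuse subsets.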
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t min,
+ const int32x4_t max,
+ const bool is_last_stage) {
+ // stage 12.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+ ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+ } else {
+ butterfly_rotation(&s[0], &s[1], 32, true);
+ butterfly_rotation(&s[2], &s[3], 48, false);
+ }
+
+ // stage 17.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[3], false);
+ HadamardRotation(&s[1], &s[2], false);
+ } else {
+ HadamardRotation(&s[0], &s[3], false, min, max);
+ HadamardRotation(&s[1], &s[2], false, min, max);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ // When |is_row| is true, use the row range; otherwise use the column range.
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[4], x[4];
+
+ if (is_row) {
+ assert(step == 4);
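+ // vld4q_s32 de-interleaves the 4x4 block, so the load also performs the
+ // row transpose.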
+ int32x4x4_t y = vld4q_s32(dst);
+ for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+ } else {
+ LoadSrc<4>(dst, step, 0, x);
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 2, 1, 3
+ s[0] = x[0];
+ s[1] = x[2];
+ s[2] = x[1];
+ s[3] = x[3];
+
+ Dct4Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
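+ // vqmovn_s32 saturates each lane to int16 and vmovl_s16 widens it back,
+ // clamping the shifted row outputs to the 16-bit range.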
+ for (auto& i : s) {
+ i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
+ }
+ int32x4x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = s[i];
+ vst4q_s32(dst, y);
+ } else {
+ StoreDst<4>(dst, step, 0, s);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t min,
+ const int32x4_t max,
+ const bool is_last_stage) {
+ // stage 8.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+ ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+ } else {
+ butterfly_rotation(&s[4], &s[7], 56, false);
+ butterfly_rotation(&s[5], &s[6], 24, false);
+ }
+
+ // stage 13.
+ HadamardRotation(&s[4], &s[5], false, min, max);
+ HadamardRotation(&s[6], &s[7], true, min, max);
+
+ // stage 18.
+ butterfly_rotation(&s[6], &s[5], 32, true);
+
+ // stage 22.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[7], false);
+ HadamardRotation(&s[1], &s[6], false);
+ HadamardRotation(&s[2], &s[5], false);
+ HadamardRotation(&s[3], &s[4], false);
+ } else {
+ HadamardRotation(&s[0], &s[7], false, min, max);
+ HadamardRotation(&s[1], &s[6], false, min, max);
+ HadamardRotation(&s[2], &s[5], false, min, max);
+ HadamardRotation(&s[3], &s[4], false, min, max);
+ }
+}
+
+// Process dct8 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[8], x[8];
+
+ if (is_row) {
+ LoadSrc<4>(dst, step, 0, &x[0]);
+ LoadSrc<4>(dst, step, 4, &x[4]);
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ } else {
+ LoadSrc<8>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+ s[0] = x[0];
+ s[1] = x[4];
+ s[2] = x[2];
+ s[3] = x[6];
+ s[4] = x[1];
+ s[5] = x[5];
+ s[6] = x[3];
+ s[7] = x[7];
+
+ Dct4Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/false);
+ Dct8Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (auto& i : s) {
+ i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
+ }
+ Transpose4x4(&s[0], &s[0]);
+ Transpose4x4(&s[4], &s[4]);
+ StoreDst<4>(dst, step, 0, &s[0]);
+ StoreDst<4>(dst, step, 4, &s[4]);
+ } else {
+ StoreDst<8>(dst, step, 0, &s[0]);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t min,
+ const int32x4_t max,
+ const bool is_last_stage) {
+ // stage 5.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+ ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+ ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+ ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+ } else {
+ butterfly_rotation(&s[8], &s[15], 60, false);
+ butterfly_rotation(&s[9], &s[14], 28, false);
+ butterfly_rotation(&s[10], &s[13], 44, false);
+ butterfly_rotation(&s[11], &s[12], 12, false);
+ }
+
+ // stage 9.
+ HadamardRotation(&s[8], &s[9], false, min, max);
+ HadamardRotation(&s[10], &s[11], true, min, max);
+ HadamardRotation(&s[12], &s[13], false, min, max);
+ HadamardRotation(&s[14], &s[15], true, min, max);
+
+ // stage 14.
+ butterfly_rotation(&s[14], &s[9], 48, true);
+ butterfly_rotation(&s[13], &s[10], 112, true);
+
+ // stage 19.
+ HadamardRotation(&s[8], &s[11], false, min, max);
+ HadamardRotation(&s[9], &s[10], false, min, max);
+ HadamardRotation(&s[12], &s[15], true, min, max);
+ HadamardRotation(&s[13], &s[14], true, min, max);
+
+ // stage 23.
+ butterfly_rotation(&s[13], &s[10], 32, true);
+ butterfly_rotation(&s[12], &s[11], 32, true);
+
+ // stage 26.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[15], false);
+ HadamardRotation(&s[1], &s[14], false);
+ HadamardRotation(&s[2], &s[13], false);
+ HadamardRotation(&s[3], &s[12], false);
+ HadamardRotation(&s[4], &s[11], false);
+ HadamardRotation(&s[5], &s[10], false);
+ HadamardRotation(&s[6], &s[9], false);
+ HadamardRotation(&s[7], &s[8], false);
+ } else {
+ HadamardRotation(&s[0], &s[15], false, min, max);
+ HadamardRotation(&s[1], &s[14], false, min, max);
+ HadamardRotation(&s[2], &s[13], false, min, max);
+ HadamardRotation(&s[3], &s[12], false, min, max);
+ HadamardRotation(&s[4], &s[11], false, min, max);
+ HadamardRotation(&s[5], &s[10], false, min, max);
+ HadamardRotation(&s[6], &s[9], false, min, max);
+ HadamardRotation(&s[7], &s[8], false, min, max);
+ }
+}
+
+// Process dct16 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[16], x[16];
+
+ if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<16>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ s[0] = x[0];
+ s[1] = x[8];
+ s[2] = x[4];
+ s[3] = x[12];
+ s[4] = x[2];
+ s[5] = x[10];
+ s[6] = x[6];
+ s[7] = x[14];
+ s[8] = x[1];
+ s[9] = x[9];
+ s[10] = x[5];
+ s[11] = x[13];
+ s[12] = x[3];
+ s[13] = x[11];
+ s[14] = x[7];
+ s[15] = x[15];
+
+ Dct4Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/false);
+ Dct8Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/false);
+ Dct16Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (auto& i : s) {
+ i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
+ }
+ for (int idx = 0; idx < 16; idx += 8) {
+ Transpose4x4(&s[idx], &s[idx]);
+ Transpose4x4(&s[idx + 4], &s[idx + 4]);
+ StoreDst<4>(dst, step, idx, &s[idx]);
+ StoreDst<4>(dst, step, idx + 4, &s[idx + 4]);
+ }
+ } else {
+ StoreDst<16>(dst, step, 0, &s[0]);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t min,
+ const int32x4_t max,
+ const bool is_last_stage) {
+ // stage 3
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+ ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+ ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+ ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+ ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+ ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+ ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+ ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+ } else {
+ butterfly_rotation(&s[16], &s[31], 62, false);
+ butterfly_rotation(&s[17], &s[30], 30, false);
+ butterfly_rotation(&s[18], &s[29], 46, false);
+ butterfly_rotation(&s[19], &s[28], 14, false);
+ butterfly_rotation(&s[20], &s[27], 54, false);
+ butterfly_rotation(&s[21], &s[26], 22, false);
+ butterfly_rotation(&s[22], &s[25], 38, false);
+ butterfly_rotation(&s[23], &s[24], 6, false);
+ }
+
+ // stage 6.
+ HadamardRotation(&s[16], &s[17], false, min, max);
+ HadamardRotation(&s[18], &s[19], true, min, max);
+ HadamardRotation(&s[20], &s[21], false, min, max);
+ HadamardRotation(&s[22], &s[23], true, min, max);
+ HadamardRotation(&s[24], &s[25], false, min, max);
+ HadamardRotation(&s[26], &s[27], true, min, max);
+ HadamardRotation(&s[28], &s[29], false, min, max);
+ HadamardRotation(&s[30], &s[31], true, min, max);
+
+ // stage 10.
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+ butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+ butterfly_rotation(&s[26], &s[21], 24, true);
+ butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+ // stage 15.
+ HadamardRotation(&s[16], &s[19], false, min, max);
+ HadamardRotation(&s[17], &s[18], false, min, max);
+ HadamardRotation(&s[20], &s[23], true, min, max);
+ HadamardRotation(&s[21], &s[22], true, min, max);
+ HadamardRotation(&s[24], &s[27], false, min, max);
+ HadamardRotation(&s[25], &s[26], false, min, max);
+ HadamardRotation(&s[28], &s[31], true, min, max);
+ HadamardRotation(&s[29], &s[30], true, min, max);
+
+ // stage 20.
+ butterfly_rotation(&s[29], &s[18], 48, true);
+ butterfly_rotation(&s[28], &s[19], 48, true);
+ butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+ butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+ // stage 24.
+ HadamardRotation(&s[16], &s[23], false, min, max);
+ HadamardRotation(&s[17], &s[22], false, min, max);
+ HadamardRotation(&s[18], &s[21], false, min, max);
+ HadamardRotation(&s[19], &s[20], false, min, max);
+ HadamardRotation(&s[24], &s[31], true, min, max);
+ HadamardRotation(&s[25], &s[30], true, min, max);
+ HadamardRotation(&s[26], &s[29], true, min, max);
+ HadamardRotation(&s[27], &s[28], true, min, max);
+
+ // stage 27.
+ butterfly_rotation(&s[27], &s[20], 32, true);
+ butterfly_rotation(&s[26], &s[21], 32, true);
+ butterfly_rotation(&s[25], &s[22], 32, true);
+ butterfly_rotation(&s[24], &s[23], 32, true);
+
+ // stage 29.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[31], false);
+ HadamardRotation(&s[1], &s[30], false);
+ HadamardRotation(&s[2], &s[29], false);
+ HadamardRotation(&s[3], &s[28], false);
+ HadamardRotation(&s[4], &s[27], false);
+ HadamardRotation(&s[5], &s[26], false);
+ HadamardRotation(&s[6], &s[25], false);
+ HadamardRotation(&s[7], &s[24], false);
+ HadamardRotation(&s[8], &s[23], false);
+ HadamardRotation(&s[9], &s[22], false);
+ HadamardRotation(&s[10], &s[21], false);
+ HadamardRotation(&s[11], &s[20], false);
+ HadamardRotation(&s[12], &s[19], false);
+ HadamardRotation(&s[13], &s[18], false);
+ HadamardRotation(&s[14], &s[17], false);
+ HadamardRotation(&s[15], &s[16], false);
+ } else {
+ HadamardRotation(&s[0], &s[31], false, min, max);
+ HadamardRotation(&s[1], &s[30], false, min, max);
+ HadamardRotation(&s[2], &s[29], false, min, max);
+ HadamardRotation(&s[3], &s[28], false, min, max);
+ HadamardRotation(&s[4], &s[27], false, min, max);
+ HadamardRotation(&s[5], &s[26], false, min, max);
+ HadamardRotation(&s[6], &s[25], false, min, max);
+ HadamardRotation(&s[7], &s[24], false, min, max);
+ HadamardRotation(&s[8], &s[23], false, min, max);
+ HadamardRotation(&s[9], &s[22], false, min, max);
+ HadamardRotation(&s[10], &s[21], false, min, max);
+ HadamardRotation(&s[11], &s[20], false, min, max);
+ HadamardRotation(&s[12], &s[19], false, min, max);
+ HadamardRotation(&s[13], &s[18], false, min, max);
+ HadamardRotation(&s[14], &s[17], false, min, max);
+ HadamardRotation(&s[15], &s[16], false, min, max);
+ }
+}
+
+// Process dct32 rows or columns, depending on the |is_row| flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
+ const bool is_row, int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[32], x[32];
+
+ if (is_row) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<32>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ s[0] = x[0];
+ s[1] = x[16];
+ s[2] = x[8];
+ s[3] = x[24];
+ s[4] = x[4];
+ s[5] = x[20];
+ s[6] = x[12];
+ s[7] = x[28];
+ s[8] = x[2];
+ s[9] = x[18];
+ s[10] = x[10];
+ s[11] = x[26];
+ s[12] = x[6];
+ s[13] = x[22];
+ s[14] = x[14];
+ s[15] = x[30];
+
+ // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ s[16] = x[1];
+ s[17] = x[17];
+ s[18] = x[9];
+ s[19] = x[25];
+ s[20] = x[5];
+ s[21] = x[21];
+ s[22] = x[13];
+ s[23] = x[29];
+ s[24] = x[3];
+ s[25] = x[19];
+ s[26] = x[11];
+ s[27] = x[27];
+ s[28] = x[7];
+ s[29] = x[23];
+ s[30] = x[15];
+ s[31] = x[31];
+
+ Dct4Stages<ButterflyRotation_4>(s, min, max, /*is_last_stage=*/false);
+ Dct8Stages<ButterflyRotation_4>(s, min, max, /*is_last_stage=*/false);
+ Dct16Stages<ButterflyRotation_4>(s, min, max, /*is_last_stage=*/false);
+ Dct32Stages<ButterflyRotation_4>(s, min, max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int idx = 0; idx < 32; idx += 8) {
+ int32x4_t output[8];
+ Transpose4x4(&s[idx], &output[0]);
+ Transpose4x4(&s[idx + 4], &output[4]);
+ for (auto& o : output) {
+ o = vmovl_s16(vqmovn_s32(vqrshlq_s32(o, v_row_shift)));
+ }
+ StoreDst<4>(dst, step, idx, &output[0]);
+ StoreDst<4>(dst, step, idx + 4, &output[4]);
+ }
+ } else {
+ StoreDst<32>(dst, step, 0, &s[0]);
+ }
+}
+
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[64], x[32];
+
+ if (is_row) {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ // The last 32 values of every column are always zero if the |tx_height| is
+ // 64.
+ LoadSrc<32>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ s[0] = x[0];
+ s[2] = x[16];
+ s[4] = x[8];
+ s[6] = x[24];
+ s[8] = x[4];
+ s[10] = x[20];
+ s[12] = x[12];
+ s[14] = x[28];
+
+ // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ s[16] = x[2];
+ s[18] = x[18];
+ s[20] = x[10];
+ s[22] = x[26];
+ s[24] = x[6];
+ s[26] = x[22];
+ s[28] = x[14];
+ s[30] = x[30];
+
+ // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ s[32] = x[1];
+ s[34] = x[17];
+ s[36] = x[9];
+ s[38] = x[25];
+ s[40] = x[5];
+ s[42] = x[21];
+ s[44] = x[13];
+ s[46] = x[29];
+
+ // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+ s[48] = x[3];
+ s[50] = x[19];
+ s[52] = x[11];
+ s[54] = x[27];
+ s[56] = x[7];
+ s[58] = x[23];
+ s[60] = x[15];
+ s[62] = x[31];
+
+ Dct4Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, min, max, /*is_last_stage=*/false);
+ Dct8Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, min, max, /*is_last_stage=*/false);
+ Dct16Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, min, max, /*is_last_stage=*/false);
+ Dct32Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, min, max, /*is_last_stage=*/false);
+
+ //-- start dct 64 stages
+ // stage 2.
+ ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+ ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+ ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+ ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+ ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+ ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+ ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+ ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+ ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+ ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+ ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+ ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+ ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+ ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+ ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+ ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+ // stage 4.
+ HadamardRotation(&s[32], &s[33], false, min, max);
+ HadamardRotation(&s[34], &s[35], true, min, max);
+ HadamardRotation(&s[36], &s[37], false, min, max);
+ HadamardRotation(&s[38], &s[39], true, min, max);
+ HadamardRotation(&s[40], &s[41], false, min, max);
+ HadamardRotation(&s[42], &s[43], true, min, max);
+ HadamardRotation(&s[44], &s[45], false, min, max);
+ HadamardRotation(&s[46], &s[47], true, min, max);
+ HadamardRotation(&s[48], &s[49], false, min, max);
+ HadamardRotation(&s[50], &s[51], true, min, max);
+ HadamardRotation(&s[52], &s[53], false, min, max);
+ HadamardRotation(&s[54], &s[55], true, min, max);
+ HadamardRotation(&s[56], &s[57], false, min, max);
+ HadamardRotation(&s[58], &s[59], true, min, max);
+ HadamardRotation(&s[60], &s[61], false, min, max);
+ HadamardRotation(&s[62], &s[63], true, min, max);
+
+ // stage 7.
+ ButterflyRotation_4(&s[62], &s[33], 60 - 0, true);
+ ButterflyRotation_4(&s[61], &s[34], 60 - 0 + 64, true);
+ ButterflyRotation_4(&s[58], &s[37], 60 - 32, true);
+ ButterflyRotation_4(&s[57], &s[38], 60 - 32 + 64, true);
+ ButterflyRotation_4(&s[54], &s[41], 60 - 16, true);
+ ButterflyRotation_4(&s[53], &s[42], 60 - 16 + 64, true);
+ ButterflyRotation_4(&s[50], &s[45], 60 - 48, true);
+ ButterflyRotation_4(&s[49], &s[46], 60 - 48 + 64, true);
+
+ // stage 11.
+ HadamardRotation(&s[32], &s[35], false, min, max);
+ HadamardRotation(&s[33], &s[34], false, min, max);
+ HadamardRotation(&s[36], &s[39], true, min, max);
+ HadamardRotation(&s[37], &s[38], true, min, max);
+ HadamardRotation(&s[40], &s[43], false, min, max);
+ HadamardRotation(&s[41], &s[42], false, min, max);
+ HadamardRotation(&s[44], &s[47], true, min, max);
+ HadamardRotation(&s[45], &s[46], true, min, max);
+ HadamardRotation(&s[48], &s[51], false, min, max);
+ HadamardRotation(&s[49], &s[50], false, min, max);
+ HadamardRotation(&s[52], &s[55], true, min, max);
+ HadamardRotation(&s[53], &s[54], true, min, max);
+ HadamardRotation(&s[56], &s[59], false, min, max);
+ HadamardRotation(&s[57], &s[58], false, min, max);
+ HadamardRotation(&s[60], &s[63], true, min, max);
+ HadamardRotation(&s[61], &s[62], true, min, max);
+
+ // stage 16.
+ ButterflyRotation_4(&s[61], &s[34], 56, true);
+ ButterflyRotation_4(&s[60], &s[35], 56, true);
+ ButterflyRotation_4(&s[59], &s[36], 56 + 64, true);
+ ButterflyRotation_4(&s[58], &s[37], 56 + 64, true);
+ ButterflyRotation_4(&s[53], &s[42], 56 - 32, true);
+ ButterflyRotation_4(&s[52], &s[43], 56 - 32, true);
+ ButterflyRotation_4(&s[51], &s[44], 56 - 32 + 64, true);
+ ButterflyRotation_4(&s[50], &s[45], 56 - 32 + 64, true);
+
+ // stage 21.
+ HadamardRotation(&s[32], &s[39], false, min, max);
+ HadamardRotation(&s[33], &s[38], false, min, max);
+ HadamardRotation(&s[34], &s[37], false, min, max);
+ HadamardRotation(&s[35], &s[36], false, min, max);
+ HadamardRotation(&s[40], &s[47], true, min, max);
+ HadamardRotation(&s[41], &s[46], true, min, max);
+ HadamardRotation(&s[42], &s[45], true, min, max);
+ HadamardRotation(&s[43], &s[44], true, min, max);
+ HadamardRotation(&s[48], &s[55], false, min, max);
+ HadamardRotation(&s[49], &s[54], false, min, max);
+ HadamardRotation(&s[50], &s[53], false, min, max);
+ HadamardRotation(&s[51], &s[52], false, min, max);
+ HadamardRotation(&s[56], &s[63], true, min, max);
+ HadamardRotation(&s[57], &s[62], true, min, max);
+ HadamardRotation(&s[58], &s[61], true, min, max);
+ HadamardRotation(&s[59], &s[60], true, min, max);
+
+ // stage 25.
+ ButterflyRotation_4(&s[59], &s[36], 48, true);
+ ButterflyRotation_4(&s[58], &s[37], 48, true);
+ ButterflyRotation_4(&s[57], &s[38], 48, true);
+ ButterflyRotation_4(&s[56], &s[39], 48, true);
+ ButterflyRotation_4(&s[55], &s[40], 112, true);
+ ButterflyRotation_4(&s[54], &s[41], 112, true);
+ ButterflyRotation_4(&s[53], &s[42], 112, true);
+ ButterflyRotation_4(&s[52], &s[43], 112, true);
+
+ // stage 28.
+ HadamardRotation(&s[32], &s[47], false, min, max);
+ HadamardRotation(&s[33], &s[46], false, min, max);
+ HadamardRotation(&s[34], &s[45], false, min, max);
+ HadamardRotation(&s[35], &s[44], false, min, max);
+ HadamardRotation(&s[36], &s[43], false, min, max);
+ HadamardRotation(&s[37], &s[42], false, min, max);
+ HadamardRotation(&s[38], &s[41], false, min, max);
+ HadamardRotation(&s[39], &s[40], false, min, max);
+ HadamardRotation(&s[48], &s[63], true, min, max);
+ HadamardRotation(&s[49], &s[62], true, min, max);
+ HadamardRotation(&s[50], &s[61], true, min, max);
+ HadamardRotation(&s[51], &s[60], true, min, max);
+ HadamardRotation(&s[52], &s[59], true, min, max);
+ HadamardRotation(&s[53], &s[58], true, min, max);
+ HadamardRotation(&s[54], &s[57], true, min, max);
+ HadamardRotation(&s[55], &s[56], true, min, max);
+
+ // stage 30.
+ ButterflyRotation_4(&s[55], &s[40], 32, true);
+ ButterflyRotation_4(&s[54], &s[41], 32, true);
+ ButterflyRotation_4(&s[53], &s[42], 32, true);
+ ButterflyRotation_4(&s[52], &s[43], 32, true);
+ ButterflyRotation_4(&s[51], &s[44], 32, true);
+ ButterflyRotation_4(&s[50], &s[45], 32, true);
+ ButterflyRotation_4(&s[49], &s[46], 32, true);
+ ButterflyRotation_4(&s[48], &s[47], 32, true);
+
+ // stage 31.
+ for (int i = 0; i < 32; i += 4) {
+ HadamardRotation(&s[i], &s[63 - i], false, min, max);
+ HadamardRotation(&s[i + 1], &s[63 - i - 1], false, min, max);
+ HadamardRotation(&s[i + 2], &s[63 - i - 2], false, min, max);
+ HadamardRotation(&s[i + 3], &s[63 - i - 3], false, min, max);
+ }
+ //-- end dct 64 stages
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int idx = 0; idx < 64; idx += 8) {
+ int32x4_t output[8];
+ Transpose4x4(&s[idx], &output[0]);
+ Transpose4x4(&s[idx + 4], &output[4]);
+ for (auto& o : output) {
+ o = vmovl_s16(vqmovn_s32(vqrshlq_s32(o, v_row_shift)));
+ }
+ StoreDst<4>(dst, step, idx, &output[0]);
+ StoreDst<4>(dst, step, idx + 4, &output[4]);
+ }
+ } else {
+ StoreDst<64>(dst, step, 0, &s[0]);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
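+// kAdst4Multiplier holds the AV1 sinpi constants: sinpi(1) = 1321,
+// sinpi(2) = 2482, sinpi(3) = 3344, sinpi(4) = 3803.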
+LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+ int32x4_t x[4];
+
+ if (is_row) {
+ assert(step == 4);
+ int32x4x4_t y = vld4q_s32(dst);
+ for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+ } else {
+ LoadSrc<4>(dst, step, 0, x);
+ }
+
+ // stage 1.
+ s[5] = vmulq_n_s32(x[3], kAdst4Multiplier[1]);
+ s[6] = vmulq_n_s32(x[3], kAdst4Multiplier[3]);
+
+ // stage 2.
+ const int32x4_t a7 = vsubq_s32(x[0], x[2]);
+ const int32x4_t b7 = vaddq_s32(a7, x[3]);
+
+ // stage 3.
+ s[0] = vmulq_n_s32(x[0], kAdst4Multiplier[0]);
+ s[1] = vmulq_n_s32(x[0], kAdst4Multiplier[1]);
+ // s[0] = s[0] + s[3]
+ s[0] = vmlaq_n_s32(s[0], x[2], kAdst4Multiplier[3]);
+ // s[1] = s[1] - s[4]
+ s[1] = vmlsq_n_s32(s[1], x[2], kAdst4Multiplier[0]);
+
+ s[3] = vmulq_n_s32(x[1], kAdst4Multiplier[2]);
+ s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
+
+ // stage 4.
+ s[0] = vaddq_s32(s[0], s[5]);
+ s[1] = vsubq_s32(s[1], s[6]);
+
+ // stages 5 and 6.
+ const int32x4_t x0 = vaddq_s32(s[0], s[3]);
+ const int32x4_t x1 = vaddq_s32(s[1], s[3]);
+ const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
+ const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
+ x[0] = vrshrq_n_s32(x0, 12);
+ x[1] = vrshrq_n_s32(x1, 12);
+ x[2] = vrshrq_n_s32(s[2], 12);
+ x[3] = vrshrq_n_s32(x3, 12);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ x[0] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[0], v_row_shift)));
+ x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift)));
+ x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift)));
+ x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift)));
+ int32x4x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = x[i];
+ vst4q_s32(dst, y);
+ } else {
+ StoreDst<4>(dst, step, 0, x);
+ }
+}
+
+alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
+ 2482};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[2];
+
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src0_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src0_round, v_src0);
+ const int32x4_t kAdst4DcOnlyMultipliers = vld1q_s32(kAdst4DcOnlyMultiplier);
+ s[1] = vdupq_n_s32(0);
+
+ // s0*k0 s0*k1 s0*k2 s0*k1
+ s[0] = vmulq_s32(kAdst4DcOnlyMultipliers, v_src);
+ // 0 0 0 s0*k0
+ s[1] = vextq_s32(s[1], s[0], 1);
+
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int32x4_t dst_0 = vrshrq_n_s32(x3, 12);
+
+ // vqrshlq_s32 will shift right if the shift value is negative.
+ vst1q_s32(dst,
+ vmovl_s16(vqmovn_s32(vqrshlq_s32(dst_0, vdupq_n_s32(-row_shift)))));
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[4];
+
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&dst[i]);
+
+ s[0] = vmulq_n_s32(v_src, kAdst4Multiplier[0]);
+ s[1] = vmulq_n_s32(v_src, kAdst4Multiplier[1]);
+ s[2] = vmulq_n_s32(v_src, kAdst4Multiplier[2]);
+
+ const int32x4_t x0 = s[0];
+ const int32x4_t x1 = s[1];
+ const int32x4_t x2 = s[2];
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int32x4_t dst_0 = vrshrq_n_s32(x0, 12);
+ const int32x4_t dst_1 = vrshrq_n_s32(x1, 12);
+ const int32x4_t dst_2 = vrshrq_n_s32(x2, 12);
+ const int32x4_t dst_3 = vrshrq_n_s32(x3, 12);
+
+ vst1q_s32(&dst[i], dst_0);
+ vst1q_s32(&dst[i + width * 1], dst_1);
+ vst1q_s32(&dst[i + width * 2], dst_2);
+ vst1q_s32(&dst[i + width * 3], dst_3);
+
+ i += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[8], x[8];
+
+ if (is_row) {
+ LoadSrc<4>(dst, step, 0, &x[0]);
+ LoadSrc<4>(dst, step, 4, &x[4]);
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ } else {
+ LoadSrc<8>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ s[0] = x[7];
+ s[1] = x[0];
+ s[2] = x[5];
+ s[3] = x[2];
+ s[4] = x[3];
+ s[5] = x[4];
+ s[6] = x[1];
+ s[7] = x[6];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+ butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+ butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[4], false, min, max);
+ HadamardRotation(&s[1], &s[5], false, min, max);
+ HadamardRotation(&s[2], &s[6], false, min, max);
+ HadamardRotation(&s[3], &s[7], false, min, max);
+
+ // stage 4.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[2], false, min, max);
+ HadamardRotation(&s[4], &s[6], false, min, max);
+ HadamardRotation(&s[1], &s[3], false, min, max);
+ HadamardRotation(&s[5], &s[7], false, min, max);
+
+ // stage 6.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (auto& i : x) {
+ i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
+ }
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ StoreDst<4>(dst, step, 0, &x[0]);
+ StoreDst<4>(dst, step, 4, &x[4]);
+ } else {
+ StoreDst<8>(dst, step, 0, &x[0]);
+ }
+}
+
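+// DC-only fast path for adst8 rows: with only s[1] nonzero on entry, the
+// full butterfly network collapses to the copies and rotations below.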
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+ // stage 1.
+ s[1] = vbslq_s32(v_mask, v_src_round, v_src);
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int32x4_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ for (int i = 0; i < 8; ++i) {
+ // vqrshlq_s32 will shift right if the shift value is negative.
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
+ vst1q_lane_s32(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int32x4_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ for (int j = 0; j < 8; ++j) {
+ vst1q_s32(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[16], x[16];
+
+ if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<16>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ s[0] = x[15];
+ s[1] = x[0];
+ s[2] = x[13];
+ s[3] = x[2];
+ s[4] = x[11];
+ s[5] = x[4];
+ s[6] = x[9];
+ s[7] = x[6];
+ s[8] = x[7];
+ s[9] = x[8];
+ s[10] = x[5];
+ s[11] = x[10];
+ s[12] = x[3];
+ s[13] = x[12];
+ s[14] = x[1];
+ s[15] = x[14];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+ butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+ butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+ butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+ butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+ butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+ butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[8], false, min, max);
+ HadamardRotation(&s[1], &s[9], false, min, max);
+ HadamardRotation(&s[2], &s[10], false, min, max);
+ HadamardRotation(&s[3], &s[11], false, min, max);
+ HadamardRotation(&s[4], &s[12], false, min, max);
+ HadamardRotation(&s[5], &s[13], false, min, max);
+ HadamardRotation(&s[6], &s[14], false, min, max);
+ HadamardRotation(&s[7], &s[15], false, min, max);
+
+ // stage 4.
+ butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+ butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+ butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[4], false, min, max);
+ HadamardRotation(&s[8], &s[12], false, min, max);
+ HadamardRotation(&s[1], &s[5], false, min, max);
+ HadamardRotation(&s[9], &s[13], false, min, max);
+ HadamardRotation(&s[2], &s[6], false, min, max);
+ HadamardRotation(&s[10], &s[14], false, min, max);
+ HadamardRotation(&s[3], &s[7], false, min, max);
+ HadamardRotation(&s[11], &s[15], false, min, max);
+
+ // stage 6.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+ // stage 7.
+ HadamardRotation(&s[0], &s[2], false, min, max);
+ HadamardRotation(&s[4], &s[6], false, min, max);
+ HadamardRotation(&s[8], &s[10], false, min, max);
+ HadamardRotation(&s[12], &s[14], false, min, max);
+ HadamardRotation(&s[1], &s[3], false, min, max);
+ HadamardRotation(&s[5], &s[7], false, min, max);
+ HadamardRotation(&s[9], &s[11], false, min, max);
+ HadamardRotation(&s[13], &s[15], false, min, max);
+
+ // stage 8.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[10], &s[11], 32, true);
+ butterfly_rotation(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s32(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s32(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s32(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s32(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s32(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s32(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s32(s[1]);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (auto& i : x) {
+ i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
+ }
+ for (int idx = 0; idx < 16; idx += 8) {
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ StoreDst<4>(dst, step, idx, &x[idx]);
+ StoreDst<4>(dst, step, idx + 4, &x[idx + 4]);
+ }
+ } else {
+ StoreDst<16>(dst, step, 0, &x[0]);
+ }
+}
+
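+// Shared tail of the adst16 DC-only paths: with only s[1] nonzero on entry,
+// stages 2-9 collapse to the copies and rotations below.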
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int32x4_t* s, int32x4_t* x) {
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+ // stage 3.
+ s[8] = s[0];
+ s[9] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+ // stage 5.
+ s[4] = s[0];
+ s[12] = s[8];
+ s[5] = s[1];
+ s[13] = s[9];
+
+ // stage 6.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+ ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+ // stage 7.
+ s[2] = s[0];
+ s[6] = s[4];
+ s[10] = s[8];
+ s[14] = s[12];
+ s[3] = s[1];
+ s[7] = s[5];
+ s[11] = s[9];
+ s[15] = s[13];
+
+ // stage 8.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+ ButterflyRotation_4(&s[10], &s[11], 32, true);
+ ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s32(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s32(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s32(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s32(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s32(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s32(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s32(s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[16];
+ int32x4_t x[16];
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+ // stage 1.
+ s[1] = vbslq_s32(v_mask, v_src_round, v_src);
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int i = 0; i < 16; ++i) {
+ // vqrshlq_s32 will shift right if the shift value is negative.
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
+ vst1q_lane_s32(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+ int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int i = 0;
+ do {
+ int32x4_t s[16];
+ int32x4_t x[16];
+ const int32x4_t v_src = vld1q_s32(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int j = 0; j < 16; ++j) {
+ vst1q_s32(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step, int shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
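+ // For |shift| in {0, 1}, this constant folds the rounding bias of the
+ // >> 12 multiplier shift and of the extra |shift| into a single addend.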
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_s32(v_dual_round, v_src, v_multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(shift_lo)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int shift = tx_height < 16 ? 0 : 1;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+ const int32x4_t v_src_mult_lo = vmlaq_s32(v_dual_round, v_src, v_multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
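+// Adds the identity-transform residual in |source| to the frame buffer,
+// applying the column rounding/shift and clamping to the 10-bit pixel range.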
+template <int identity_size>
+LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height,
+ const int32_t* LIBGAV1_RESTRICT source) {
+ static_assert(identity_size == 4 || identity_size == 8 ||
+ identity_size == 16 || identity_size == 32,
+ "Invalid identity_size.");
+ const int stride = frame.columns();
+ uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11);
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+
+ if (identity_size < 32) {
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ int32x4x2_t v_src, v_dst_i, a, b;
+ v_src.val[0] = vld1q_s32(&source[i * 4]);
+ v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]);
+ if (identity_size == 4) {
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ } else if (identity_size == 8) {
+ v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+ v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+ a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+ a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+ } else { // identity_size == 16
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ }
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst);
+ frame_data.val[1] = vld1_u16(dst + stride);
+ b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ dst += stride << 1;
+ i += 2;
+ } while (i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ int32x4x2_t v_src, v_dst_i, a, b;
+ v_src.val[0] = vld1q_s32(&source[row + j]);
+ v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+ if (identity_size == 4) {
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ } else if (identity_size == 8) {
+ v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+ v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+ a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+ a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+ } else { // identity_size == 16
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ }
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst + j);
+ frame_data.val[1] = vld1_u16(dst + j + 4);
+ b.val[0] =
+ vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] =
+ vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + j + 4,
+ vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const int32x4_t v_dst_i = vld1q_s32(&source[row + j]);
+ const uint16x4_t frame_data = vld1_u16(dst + j);
+ const int32x4_t a = vrshrq_n_s32(v_dst_i, 2);
+ const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data));
+ const uint16x4_t d = vmin_u16(vqmovun_s32(b), v_max_bitdepth);
+ vst1_u16(dst + j, d);
+ j += 4;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height,
+ const int32_t* LIBGAV1_RESTRICT source) {
+ const int stride = frame.columns();
+ uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+ const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11);
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&source[i * 4]);
+ const int32x4_t v_dst_row =
+ vshrq_n_s32(vmlaq_n_s32(v_round, v_src, kIdentity4Multiplier), 12);
+ const int32x4_t v_dst_col =
+ vmlaq_n_s32(v_round, v_dst_row, kIdentity4Multiplier);
+ const uint16x4_t frame_data = vld1_u16(dst);
+ const int32x4_t a = vrshrq_n_s32(v_dst_col, 4 + 12);
+ const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data));
+ vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ int32x4x2_t v_src, v_src_round, v_dst_row, v_dst_col, a, b;
+ v_src.val[0] = vld1q_s32(&source[row + j]);
+ v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+ v_src_round.val[0] = vshrq_n_s32(
+ vmlaq_n_s32(v_round, v_src.val[0], kTransformRowMultiplier), 12);
+ v_src_round.val[1] = vshrq_n_s32(
+ vmlaq_n_s32(v_round, v_src.val[1], kTransformRowMultiplier), 12);
+ v_dst_row.val[0] = vqaddq_s32(v_src_round.val[0], v_src_round.val[0]);
+ v_dst_row.val[1] = vqaddq_s32(v_src_round.val[1], v_src_round.val[1]);
+ v_dst_col.val[0] =
+ vmlaq_n_s32(v_round, v_dst_row.val[0], kIdentity4Multiplier);
+ v_dst_col.val[1] =
+ vmlaq_n_s32(v_round, v_dst_row.val[1], kIdentity4Multiplier);
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst + j);
+ frame_data.val[1] = vld1_u16(dst + j + 4);
+ a.val[0] = vrshrq_n_s32(v_dst_col.val[0], 4 + 12);
+ a.val[1] = vrshrq_n_s32(v_dst_col.val[1], 4 + 12);
+ b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int32_t*>(dest);
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height equal to 32 can be simplified from
+ // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
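+ // A worked example, with A = 5: ((5 * 2) + 2) >> 2 == 12 >> 2 == 3 and
+ // (5 + 1) >> 1 == 3. The identity holds for any integer A because
+ // 2 * A + 2 == 2 * (A + 1), so ((2 * (A + 1)) >> 2) == ((A + 1) >> 1).
+ // The vqmovn_s32/vmovl_s16 pair below clamps each result to a signed
+ // 16-bit range, as required for 10bpp.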
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+ const int32x4_t a_lo = vrshrq_n_s32(v_src_lo, 1);
+ const int32x4_t a_hi = vrshrq_n_s32(v_src_hi, 1);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(a_lo)));
+ vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(a_hi)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int32_t*>(dest);
+
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+ const int32x4_t v_srcx2_lo = vqaddq_s32(v_src_lo, v_src_lo);
+ const int32x4_t v_srcx2_hi = vqaddq_s32(v_src_hi, v_src_hi);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(v_srcx2_lo)));
+ vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(v_srcx2_hi)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int32x4_t v_srcx2 = vaddq_s32(v_src, v_src);
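+ // vqrshlq_s32 will shift right (with rounding) if the shift value is
+ // negative.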
+ const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
+ int shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ int32x4x2_t v_src;
+ v_src.val[0] = vld1q_s32(&dst[i * step + j * 8]);
+ v_src.val[1] = vld1q_s32(&dst[i * step + j * 8 + 4]);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ const int32x4_t v_src_mult_hi =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+ vst1q_s32(&dst[i * step + j * 8], vmovl_s16(vqmovn_s32(shift_lo)));
+ vst1q_s32(&dst[i * step + j * 8 + 4], vmovl_s16(vqmovn_s32(shift_hi)));
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_n_s32(v_dual_round, v_src, kIdentity16Multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, vdupq_n_s32(-(12 + shift)));
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest,
+ const int32_t step) {
+ auto* const dst = static_cast<int32_t*>(dest);
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+ // (((A * 4) + 1) >> 1) to (A * 2).
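+ // For example, with A = 7: ((7 * 4) + 1) >> 1 == 29 >> 1 == 14 == 7 * 2.
+ // Since A * 4 is always even, the + 1 is shifted out.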
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 32; j += 4) {
+ const int32x4_t v_src = vld1q_s32(&dst[i * step + j]);
+ const int32x4_t v_dst_i = vqaddq_s32(v_src, v_src);
+ vst1q_s32(&dst[i * step + j], v_dst_i);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+ int adjusted_tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x2_t v_src0 = vdup_n_s32(dst[0]);
+ const int32x2_t v_src =
+ vqrdmulh_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+ // (((A * 4) + 1) >> 1) to (A * 2).
+ const int32x2_t v_dst_0 = vqadd_s32(v_src, v_src);
+ vst1_lane_s32(dst, v_dst_0, 0);
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+// Process 4 wht4 rows and columns.
+LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint16_t* LIBGAV1_RESTRICT dst,
+ const int dst_stride,
+ const void* LIBGAV1_RESTRICT source,
+ const int adjusted_tx_height) {
+ const auto* const src = static_cast<const int32_t*>(source);
+ int32x4_t s[4];
+
+ if (adjusted_tx_height == 1) {
+ // Special case: only src[0] is nonzero.
+ // src[0] 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ //
+ // After the row and column transforms are applied, we have:
+ // f h h h
+ // g i i i
+ // g i i i
+ // g i i i
+ // where f, g, h, i are computed as follows.
+ int32_t f = (src[0] >> 2) - (src[0] >> 3);
+ const int32_t g = f >> 1;
+ f = f - (f >> 1);
+ const int32_t h = (src[0] >> 3) - (src[0] >> 4);
+ const int32_t i = (src[0] >> 4);
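+ // Derivation sketch: one 1d wht4 pass maps a dc-only vector [a 0 0 0] to
+ // [a - (a >> 1), a >> 1, a >> 1, a >> 1]. The row pass uses a = src[0] >> 2,
+ // and the column pass repeats the same mapping down each column, producing
+ // f and g from a, and h and i from a >> 1.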
+ s[0] = vdupq_n_s32(h);
+ s[0] = vsetq_lane_s32(f, s[0], 0);
+ s[1] = vdupq_n_s32(i);
+ s[1] = vsetq_lane_s32(g, s[1], 0);
+ s[2] = s[3] = s[1];
+ } else {
+ // Load the 4x4 source in transposed form.
+ int32x4x4_t columns = vld4q_s32(src);
+
+ // Shift right and permute the columns for the WHT.
+ s[0] = vshrq_n_s32(columns.val[0], 2);
+ s[2] = vshrq_n_s32(columns.val[1], 2);
+ s[3] = vshrq_n_s32(columns.val[2], 2);
+ s[1] = vshrq_n_s32(columns.val[3], 2);
+
+ // Row transforms.
+ s[0] = vaddq_s32(s[0], s[2]);
+ s[3] = vsubq_s32(s[3], s[1]);
+ int32x4_t e = vhsubq_s32(s[0], s[3]); // e = (s[0] - s[3]) >> 1
+ s[1] = vsubq_s32(e, s[1]);
+ s[2] = vsubq_s32(e, s[2]);
+ s[0] = vsubq_s32(s[0], s[1]);
+ s[3] = vaddq_s32(s[3], s[2]);
+
+ int32x4_t x[4];
+ Transpose4x4(s, x);
+
+ s[0] = x[0];
+ s[2] = x[1];
+ s[3] = x[2];
+ s[1] = x[3];
+
+ // Column transforms.
+ s[0] = vaddq_s32(s[0], s[2]);
+ s[3] = vsubq_s32(s[3], s[1]);
+ e = vhsubq_s32(s[0], s[3]); // e = (s[0] - s[3]) >> 1
+ s[1] = vsubq_s32(e, s[1]);
+ s[2] = vsubq_s32(e, s[2]);
+ s[0] = vsubq_s32(s[0], s[1]);
+ s[3] = vaddq_s32(s[3], s[2]);
+ }
+
+ // Store to frame.
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+ for (int row = 0; row < 4; ++row) {
+ const uint16x4_t frame_data = vld1_u16(dst);
+ const int32x4_t b = vaddw_s16(s[row], vreinterpret_s16_u16(frame_data));
+ vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth));
+ dst += dst_stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int32_t* source, int tx_width) {
+ if (tx_width >= 16) {
+ int i = 0;
+ do {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ const int32x4_t c = vld1q_s32(&source[i + 8]);
+ const int32x4_t d = vld1q_s32(&source[i + 12]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ const int32x4_t c_rev = vrev64q_s32(c);
+ const int32x4_t d_rev = vrev64q_s32(d);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(d_rev, d_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(c_rev, c_rev, 2));
+ vst1q_s32(&source[i + 8], vextq_s32(b_rev, b_rev, 2));
+ vst1q_s32(&source[i + 12], vextq_s32(a_rev, a_rev, 2));
+ i += 16;
+ } while (i < tx_width * tx_height);
+ } else if (tx_width == 8) {
+ for (int i = 0; i < 8 * tx_height; i += 8) {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(b_rev, b_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(a_rev, a_rev, 2));
+ }
+ } else {
+ // Process two rows per iteration.
+ for (int i = 0; i < 4 * tx_height; i += 8) {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(a_rev, a_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(b_rev, b_rev, 2));
+ }
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int32_t* source, int num_rows) {
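+ // Rounding idiom: vqrdmulhq_n_s32(x, m << (31 - 12)) computes, up to
+ // saturation, (x * m + (1 << 11)) >> 12. The instruction evaluates
+ // (2 * x * b + (1 << 31)) >> 32, so passing b = m << 19 folds the rounded
+ // 12-bit fixed-point shift into a single multiply.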
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const int32x4_t a_lo = vld1q_s32(&source[i]);
+ const int32x4_t a_hi = vld1q_s32(&source[i + 4]);
+ const int32x4_t b_lo =
+ vqrdmulhq_n_s32(a_lo, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t b_hi =
+ vqrdmulhq_n_s32(a_hi, kTransformRowMultiplier << (31 - 12));
+ vst1q_s32(&source[i], b_lo);
+ vst1q_s32(&source[i + 4], b_hi);
+ i += 8;
+ } while (i < tx_width * num_rows);
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows,
+ int row_shift) {
+ // vqrshlq_s32 will shift right if shift value is negative.
+ row_shift = -row_shift;
+
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const int32x4_t residual0 = vld1q_s32(&source[i]);
+ const int32x4_t residual1 = vld1q_s32(&source[i + 4]);
+ vst1q_s32(&source[i], vqrshlq_s32(residual0, vdupq_n_s32(row_shift)));
+ vst1q_s32(&source[i + 4], vqrshlq_s32(residual1, vdupq_n_s32(row_shift)));
+ i += 8;
+ } while (i < tx_width * num_rows);
+}
+
+template <int tx_height, bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int32_t* LIBGAV1_RESTRICT source,
+ TransformType tx_type) {
+ const bool flip_rows =
+ enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+ const int stride = frame.columns();
+ uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+ if (tx_width == 4) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+ const int32x4_t residual = vld1q_s32(&source[row]);
+ const uint16x4_t frame_data = vld1_u16(dst);
+ const int32x4_t a = vrshrq_n_s32(residual, 4);
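+      // Adding the signed residual as u32 relies on wraparound arithmetic:
+      // reinterpreted back to s32, b holds the true signed sum, and
+      // vqmovun_s32 clamps negative results to 0.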
+ const uint32x4_t b = vaddw_u16(vreinterpretq_u32_s32(a), frame_data);
+ const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+ vst1_u16(dst, vmin_u16(d, vdup_n_u16((1 << kBitdepth10) - 1)));
+ dst += stride;
+ }
+ } else {
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+ int j = 0;
+ do {
+ const int x = start_x + j;
+ const int32x4_t residual = vld1q_s32(&source[row + j]);
+ const int32x4_t residual_hi = vld1q_s32(&source[row + j + 4]);
+ const uint16x8_t frame_data = vld1q_u16(frame[y] + x);
+ const int32x4_t a = vrshrq_n_s32(residual, 4);
+ const int32x4_t a_hi = vrshrq_n_s32(residual_hi, 4);
+ const uint32x4_t b =
+ vaddw_u16(vreinterpretq_u32_s32(a), vget_low_u16(frame_data));
+ const uint32x4_t b_hi =
+ vaddw_u16(vreinterpretq_u32_s32(a_hi), vget_high_u16(frame_data));
+ const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+ const uint16x4_t d_hi = vqmovun_s32(vreinterpretq_s32_u32(b_hi));
+ vst1q_u16(frame[y] + x, vminq_u16(vcombine_u16(d, d_hi),
+ vdupq_n_u16((1 << kBitdepth10) - 1)));
+ j += 8;
+ } while (j < tx_width);
+ }
+ }
+}
+
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ const int row_shift = static_cast<int>(tx_height == 16);
+
+ if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d dct4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_4>(data, /*step=*/4, /*is_row=*/true,
+ row_shift);
+ data += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+      Dct4_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d dct8 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_4>(data, /*step=*/8, /*is_row=*/true,
+ row_shift);
+ data += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct16 rows in parallel per iteration.
+ Dct16_NEON<ButterflyRotation_4>(data, 16, /*is_row=*/true, row_shift);
+ data += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct16 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<32>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct32 rows in parallel per iteration.
+ Dct32_NEON(data, 32, /*is_row=*/true, row_shift);
+ data += 128;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<32>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct32 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<64>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct64 rows in parallel per iteration.
+ Dct64_NEON(data, 64, /*is_row=*/true, row_shift);
+ data += 128 * 2;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<64>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct64 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const int row_shift = static_cast<int>(tx_height == 16);
+ const bool should_round = (tx_height == 8);
+
+ if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst4_NEON(data, /*step=*/4, /*is_row=*/true, row_shift);
+ data += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst4_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst8 rows in parallel per iteration.
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_4>(data, /*step=*/8,
+ /*transpose=*/true, row_shift);
+ data += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_4>(data, tx_width, /*transpose=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ do {
+ // Process 4 1d adst16 rows in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_4>(src, 16, /*is_row=*/true, row_shift);
+ src += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ int i = tx_width;
+ auto* data = src;
+ do {
+ // Process 4 1d adst16 columns in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Identity4TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize4x4) {
+ return;
+ }
+
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+
+ if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ const int shift = tx_height > 8 ? 1 : 0;
+ int i = adjusted_tx_height;
+ do {
+ Identity4_NEON(src, /*step=*/4, shift);
+ src += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity4TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ // Special case: Process row calculations during column transform call.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+ Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+ return;
+ }
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize8x4) {
+ return;
+ }
+
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 16 can be simplified
+ // from (((A * 2) + 1) >> 1) to A. For 10bpp, A must be clamped to a signed
+ // 16-bit value.
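+ // For example, with A = 5: ((5 * 2) + 1) >> 1 == 11 >> 1 == 5 == A, since
+ // A * 2 is always even and the + 1 is shifted out.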
+ if ((tx_height & 0x18) != 0) {  // tx_height is 8 or 16.
+ for (int i = 0; i < tx_height; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&src[i * 8]);
+ const int32x4_t v_src_hi = vld1q_s32(&src[(i * 8) + 4]);
+ vst1q_s32(&src[i * 8], vmovl_s16(vqmovn_s32(v_src_lo)));
+ vst1q_s32(&src[(i * 8) + 4], vmovl_s16(vqmovn_s32(v_src_hi)));
+ }
+ return;
+ }
+ if (tx_height == 32) {
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row32_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+ return;
+ }
+
+ assert(tx_size == kTransformSize8x4);
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row4_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity8TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+ int i = adjusted_tx_height;
+ do {
+ Identity16Row_NEON(src, /*step=*/16, row_shift);
+ src += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ const int tx_height = kTransformHeight[tx_size];
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 32 can be simplified
+ // from (((A * 4) + 2) >> 2) to A.
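+ // For example, with A = 9: ((9 * 4) + 2) >> 2 == 38 >> 2 == 9 == A.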
+ if ((tx_height & 0x28) != 0) {  // tx_height is 8 or 32.
+ return;
+ }
+
+ // Process kTransformSize32x16. The src is always rounded before the identity
+ // transform and shifted by 1 afterwards.
+ auto* src = static_cast<int32_t*>(src_buffer);
+ if (Identity32DcOnly(src, adjusted_tx_height)) {
+ return;
+ }
+
+ assert(tx_size == kTransformSize32x16);
+ ApplyRounding<32>(src, adjusted_tx_height);
+ int i = adjusted_tx_height;
+ do {
+ Identity32Row16_NEON(src, /*step=*/32);
+ src += 128;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity32TransformLoopColumn_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_NEON(TransformType tx_type, TransformSize tx_size,
+ int /*adjusted_tx_height*/, void* /*src_buffer*/,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+ // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+
+ // Process 4 1d wht4 rows and columns in parallel.
+ const auto* src = static_cast<int32_t*>(src_buffer);
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ uint16_t* dst = frame[start_y] + start_x;
+ const int dst_stride = frame.columns();
+ Wht4_NEON(dst, dst_stride, src, adjusted_tx_height);
+}
+
+//------------------------------------------------------------------------------
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ // Maximum transform size for Dct is 64.
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+ Dct4TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+ Dct4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+ Dct8TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+ Dct8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+ Dct16TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+ Dct16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+ Dct32TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+ Dct32TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+ Dct64TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+ Dct64TransformLoopColumn_NEON;
+
+ // Maximum transform size for Adst is 16.
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+ Adst4TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+ Adst4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+ Adst8TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+ Adst8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+ Adst16TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+ Adst16TransformLoopColumn_NEON;
+
+ // Maximum transform size for Identity transform is 32.
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+ Identity4TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+ Identity4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+ Identity8TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+ Identity8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+ Identity16TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+ Identity16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+ Identity32TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+ Identity32TransformLoopColumn_NEON;
+
+ // Maximum transform size for Wht is 4.
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+ Wht4TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+ Wht4TransformLoopColumn_NEON;
+}
+
+} // namespace
+
+void InverseTransformInit10bpp_NEON() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON || LIBGAV1_MAX_BITDEPTH < 10
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit10bpp_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+//------------------------------------------------------------------------------
+
+// Note this is only used in the final stage of Dct32/64 and Adst16, as the
+// in-place version causes additional stack usage with clang.
+LIBGAV1_ALWAYS_INLINE void Transpose8x8(const int16x8_t in[8],
+ int16x8_t out[8]) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+ const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+ const int16x8x2_t b2 = vtrnq_s16(in[4], in[5]);
+ const int16x8x2_t b3 = vtrnq_s16(in[6], in[7]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+ const int16x8x2_t d0 = VtrnqS64(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = VtrnqS64(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = VtrnqS64(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = VtrnqS64(c1.val[1], c3.val[1]);
+
+ out[0] = d0.val[0];
+ out[1] = d1.val[0];
+ out[2] = d2.val[0];
+ out[3] = d3.val[0];
+ out[4] = d0.val[1];
+ out[5] = d1.val[1];
+ out[6] = d2.val[1];
+ out[7] = d3.val[1];
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4(const uint16x8_t in[8],
+ uint16x8_t out[4]) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+ // b2.val[0]: 40 50 42 52
+ // b2.val[1]: 41 51 43 53
+ // b3.val[0]: 60 70 62 72
+ // b3.val[1]: 61 71 63 73
+
+ uint16x4x2_t b0 = vtrn_u16(vget_low_u16(in[0]), vget_low_u16(in[1]));
+ uint16x4x2_t b1 = vtrn_u16(vget_low_u16(in[2]), vget_low_u16(in[3]));
+ uint16x4x2_t b2 = vtrn_u16(vget_low_u16(in[4]), vget_low_u16(in[5]));
+ uint16x4x2_t b3 = vtrn_u16(vget_low_u16(in[6]), vget_low_u16(in[7]));
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+ // c2.val[0]: 40 50 60 70
+ // c2.val[1]: 42 52 62 72
+ // c3.val[0]: 41 51 61 71
+ // c3.val[1]: 43 53 63 73
+
+ uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+ vreinterpret_u32_u16(b1.val[0]));
+ uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
+ vreinterpret_u32_u16(b1.val[1]));
+ uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]),
+ vreinterpret_u32_u16(b3.val[0]));
+ uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]),
+ vreinterpret_u32_u16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // o0: 00 10 20 30 40 50 60 70
+ // o1: 01 11 21 31 41 51 61 71
+ // o2: 02 12 22 32 42 52 62 72
+ // o3: 03 13 23 33 43 53 63 73
+
+ out[0] = vcombine_u16(vreinterpret_u16_u32(c0.val[0]),
+ vreinterpret_u16_u32(c2.val[0]));
+ out[1] = vcombine_u16(vreinterpret_u16_u32(c1.val[0]),
+ vreinterpret_u16_u32(c3.val[0]));
+ out[2] = vcombine_u16(vreinterpret_u16_u32(c0.val[1]),
+ vreinterpret_u16_u32(c2.val[1]));
+ out[3] = vcombine_u16(vreinterpret_u16_u32(c1.val[1]),
+ vreinterpret_u16_u32(c3.val[1]));
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4(const int16x8_t in[8],
+ int16x8_t out[4]) {
+ Transpose4x8To8x4(reinterpret_cast<const uint16x8_t*>(in),
+ reinterpret_cast<uint16x8_t*>(out));
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x4To4x8(const int16x8_t in[4],
+ int16x8_t out[8]) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+ const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+
+ // The upper 8 bytes are don't cares.
+ // out[0]: 00 10 20 30 04 14 24 34
+ // out[1]: 01 11 21 31 05 15 25 35
+ // out[2]: 02 12 22 32 06 16 26 36
+ // out[3]: 03 13 23 33 07 17 27 37
+ // out[4]: 04 14 24 34 04 14 24 34
+ // out[5]: 05 15 25 35 05 15 25 35
+ // out[6]: 06 16 26 36 06 16 26 36
+ // out[7]: 07 17 27 37 07 17 27 37
+ out[0] = vreinterpretq_s16_s32(c0.val[0]);
+ out[1] = vreinterpretq_s16_s32(c1.val[0]);
+ out[2] = vreinterpretq_s16_s32(c0.val[1]);
+ out[3] = vreinterpretq_s16_s32(c1.val[1]);
+ out[4] = vreinterpretq_s16_s32(
+ vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[0])));
+ out[5] = vreinterpretq_s16_s32(
+ vcombine_s32(vget_high_s32(c1.val[0]), vget_high_s32(c1.val[0])));
+ out[6] = vreinterpretq_s16_s32(
+ vcombine_s32(vget_high_s32(c0.val[1]), vget_high_s32(c0.val[1])));
+ out[7] = vreinterpretq_s16_s32(
+ vcombine_s32(vget_high_s32(c1.val[1]), vget_high_s32(c1.val[1])));
+}
+
+//------------------------------------------------------------------------------
+template <int store_width, int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* LIBGAV1_RESTRICT dst,
+ int32_t stride, int32_t idx,
+ const int16x8_t* const s) {
+ assert(store_count % 4 == 0);
+ assert(store_width == 8 || store_width == 16);
+ // NOTE: It is expected that the compiler will unroll these loops.
+ if (store_width == 16) {
+ for (int i = 0; i < store_count; i += 4) {
+ vst1q_s16(&dst[i * stride + idx], (s[i]));
+ vst1q_s16(&dst[(i + 1) * stride + idx], (s[i + 1]));
+ vst1q_s16(&dst[(i + 2) * stride + idx], (s[i + 2]));
+ vst1q_s16(&dst[(i + 3) * stride + idx], (s[i + 3]));
+ }
+ } else {
+ // store_width == 8
+ for (int i = 0; i < store_count; i += 4) {
+ vst1_s16(&dst[i * stride + idx], vget_low_s16(s[i]));
+ vst1_s16(&dst[(i + 1) * stride + idx], vget_low_s16(s[i + 1]));
+ vst1_s16(&dst[(i + 2) * stride + idx], vget_low_s16(s[i + 2]));
+ vst1_s16(&dst[(i + 3) * stride + idx], vget_low_s16(s[i + 3]));
+ }
+ }
+}
+
+template <int load_width, int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* LIBGAV1_RESTRICT src,
+ int32_t stride, int32_t idx, int16x8_t* x) {
+ assert(load_count % 4 == 0);
+ assert(load_width == 8 || load_width == 16);
+ // NOTE: It is expected that the compiler will unroll these loops.
+ if (load_width == 16) {
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = vld1q_s16(&src[i * stride + idx]);
+ x[i + 1] = vld1q_s16(&src[(i + 1) * stride + idx]);
+ x[i + 2] = vld1q_s16(&src[(i + 2) * stride + idx]);
+ x[i + 3] = vld1q_s16(&src[(i + 3) * stride + idx]);
+ }
+ } else {
+ // load_width == 8
+ const int64x2_t zero = vdupq_n_s64(0);
+ for (int i = 0; i < load_count; i += 4) {
+      // The src buffer is aligned to 32 bytes. Each load will always be
+      // 8-byte aligned.
+ x[i] = vreinterpretq_s16_s64(vld1q_lane_s64(
+ reinterpret_cast<const int64_t*>(&src[i * stride + idx]), zero, 0));
+ x[i + 1] = vreinterpretq_s16_s64(vld1q_lane_s64(
+ reinterpret_cast<const int64_t*>(&src[(i + 1) * stride + idx]), zero,
+ 0));
+ x[i + 2] = vreinterpretq_s16_s64(vld1q_lane_s64(
+ reinterpret_cast<const int64_t*>(&src[(i + 2) * stride + idx]), zero,
+ 0));
+ x[i + 3] = vreinterpretq_s16_s64(vld1q_lane_s64(
+ reinterpret_cast<const int64_t*>(&src[(i + 3) * stride + idx]), zero,
+ 0));
+ }
+ }
+}
+
+// Butterfly rotate 4 values.
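+// Computes x = a * cos128(angle) - b * sin128(angle) and
+// y = a * sin128(angle) + b * cos128(angle) with the 1.12 fixed-point values
+// from Cos128()/Sin128(), followed by a rounded narrowing shift by 12. Only
+// the low halves of a and b are used; the result is duplicated to both
+// halves.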
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int16x8_t* a, int16x8_t* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const int32x4_t acc_x = vmull_n_s16(vget_low_s16(*a), cos128);
+ const int32x4_t acc_y = vmull_n_s16(vget_low_s16(*a), sin128);
+ const int32x4_t x0 = vmlsl_n_s16(acc_x, vget_low_s16(*b), sin128);
+ const int32x4_t y0 = vmlal_n_s16(acc_y, vget_low_s16(*b), cos128);
+ const int16x4_t x1 = vqrshrn_n_s32(x0, 12);
+ const int16x4_t y1 = vqrshrn_n_s32(y0, 12);
+ const int16x8_t x = vcombine_s16(x1, x1);
+ const int16x8_t y = vcombine_s16(y1, y1);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+// Butterfly rotate 8 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(int16x8_t* a, int16x8_t* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const int32x4_t acc_x = vmull_n_s16(vget_low_s16(*a), cos128);
+ const int32x4_t acc_y = vmull_n_s16(vget_low_s16(*a), sin128);
+ const int32x4_t x0 = vmlsl_n_s16(acc_x, vget_low_s16(*b), sin128);
+ const int32x4_t y0 = vmlal_n_s16(acc_y, vget_low_s16(*b), cos128);
+ const int16x4_t x1 = vqrshrn_n_s32(x0, 12);
+ const int16x4_t y1 = vqrshrn_n_s32(y0, 12);
+
+ const int32x4_t acc_x_hi = vmull_n_s16(vget_high_s16(*a), cos128);
+ const int32x4_t acc_y_hi = vmull_n_s16(vget_high_s16(*a), sin128);
+ const int32x4_t x0_hi = vmlsl_n_s16(acc_x_hi, vget_high_s16(*b), sin128);
+ const int32x4_t y0_hi = vmlal_n_s16(acc_y_hi, vget_high_s16(*b), cos128);
+ const int16x4_t x1_hi = vqrshrn_n_s32(x0_hi, 12);
+ const int16x4_t y1_hi = vqrshrn_n_s32(y0_hi, 12);
+
+ const int16x8_t x = vcombine_s16(x1, x1_hi);
+ const int16x8_t y = vcombine_s16(y1, y1_hi);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int16x8_t* a,
+ int16x8_t* b,
+ const int angle,
+ const bool flip) {
+ // Clang < 14 targeting armv8.1-a+ optimizes vqrdmulhq_n_s16 and vqsubq_s16
+ // (in HadamardRotation) into vqrdmlshq_s16 resulting in an "off by one"
+ // error. This behavior was fixed in 14.0.0:
+ // https://github.com/llvm/llvm-project/commit/82973edfb72a95b442fa6d2bb404e15a4031855e
+#if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__) && \
+ defined(__clang__) && __clang_major__ < 14
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const int32x4_t x0 = vmull_n_s16(vget_low_s16(*b), -sin128);
+ const int32x4_t y0 = vmull_n_s16(vget_low_s16(*b), cos128);
+ const int16x4_t x1 = vqrshrn_n_s32(x0, 12);
+ const int16x4_t y1 = vqrshrn_n_s32(y0, 12);
+
+ const int32x4_t x0_hi = vmull_n_s16(vget_high_s16(*b), -sin128);
+ const int32x4_t y0_hi = vmull_n_s16(vget_high_s16(*b), cos128);
+ const int16x4_t x1_hi = vqrshrn_n_s32(x0_hi, 12);
+ const int16x4_t y1_hi = vqrshrn_n_s32(y0_hi, 12);
+
+ const int16x8_t x = vcombine_s16(x1, x1_hi);
+ const int16x8_t y = vcombine_s16(y1, y1_hi);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+#else
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ // For this function, the max value returned by Sin128() is 4091, which fits
+ // inside 12 bits. This leaves room for the sign bit and the 3 left shifted
+ // bits.
+ assert(sin128 <= 0xfff);
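+ // With the 3 bit shift, vqrdmulhq_n_s16(v, c << 3) computes, up to
+ // saturation, (v * c + (1 << 11)) >> 12: the instruction evaluates
+ // (2 * v * b + (1 << 15)) >> 16, and b = c << 3 turns that into the same
+ // rounded shift by 12 as the vqrshrn_n_s32(..., 12) used above.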
+ const int16x8_t x = vqrdmulhq_n_s16(*b, -sin128 << 3);
+ const int16x8_t y = vqrdmulhq_n_s16(*b, cos128 << 3);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+#endif
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int16x8_t* a,
+ int16x8_t* b,
+ const int angle,
+ const bool flip) {
+#if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__) && \
+ defined(__clang__) // ARM v8.1-A
+ // Clang optimizes vqrdmulhq_n_s16 and vqsubq_s16 (in HadamardRotation) into
+ // vqrdmlshq_s16 resulting in an "off by one" error. For now, do not use
+ // vqrdmulhq_n_s16().
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const int32x4_t x0 = vmull_n_s16(vget_low_s16(*a), cos128);
+ const int32x4_t y0 = vmull_n_s16(vget_low_s16(*a), sin128);
+ const int16x4_t x1 = vqrshrn_n_s32(x0, 12);
+ const int16x4_t y1 = vqrshrn_n_s32(y0, 12);
+
+ const int32x4_t x0_hi = vmull_n_s16(vget_high_s16(*a), cos128);
+ const int32x4_t y0_hi = vmull_n_s16(vget_high_s16(*a), sin128);
+ const int16x4_t x1_hi = vqrshrn_n_s32(x0_hi, 12);
+ const int16x4_t y1_hi = vqrshrn_n_s32(y0_hi, 12);
+
+ const int16x8_t x = vcombine_s16(x1, x1_hi);
+ const int16x8_t y = vcombine_s16(y1, y1_hi);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+#else
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const int16x8_t x = vqrdmulhq_n_s16(*a, cos128 << 3);
+ const int16x8_t y = vqrdmulhq_n_s16(*a, sin128 << 3);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+#endif
+}
+
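+// Saturating butterfly: computes a' = a + b and b' = a - b, or a' = b - a and
+// b' = b + a when flip is set.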
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int16x8_t* a, int16x8_t* b,
+ bool flip) {
+ int16x8_t x, y;
+ if (flip) {
+ y = vqaddq_s16(*b, *a);
+ x = vqsubq_s16(*b, *a);
+ } else {
+ x = vqaddq_s16(*a, *b);
+ y = vqsubq_s16(*a, *b);
+ }
+ *a = x;
+ *b = y;
+}
+
+using ButterflyRotationFunc = void (*)(int16x8_t* a, int16x8_t* b, int angle,
+ bool flip);
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
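+// When only the dc coefficient is nonzero (adjusted_tx_height == 1), the 1d
+// dct reduces to broadcasting dc * cos(pi / 4) across the row: the dc value
+// is scaled once by Cos128(32) in 1.12 fixed point and replicated.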
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x8_t v_src = vdupq_n_s16(dst[0]);
+ const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+ const int16x8_t v_src_round =
+ vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+ const int16x8_t s0 = vbslq_s16(v_mask, v_src_round, v_src);
+ const int16_t cos128 = Cos128(32);
+ const int16x8_t xy = vqrdmulhq_n_s16(s0, cos128 << 3);
+ // vqrshlq_s16 will shift right if shift value is negative.
+ const int16x8_t xy_shifted = vqrshlq_s16(xy, vdupq_n_s16(-row_shift));
+
+ if (width == 4) {
+ vst1_s16(dst, vget_low_s16(xy_shifted));
+ } else {
+ for (int i = 0; i < width; i += 8) {
+ vst1q_s16(dst, xy_shifted);
+ dst += 8;
+ }
+ }
+ return true;
+}
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16_t cos128 = Cos128(32);
+
+ // Calculate dc values for first row.
+ if (width == 4) {
+ const int16x4_t v_src = vld1_s16(dst);
+ const int16x4_t xy = vqrdmulh_n_s16(v_src, cos128 << 3);
+ vst1_s16(dst, xy);
+ } else {
+ int i = 0;
+ do {
+ const int16x8_t v_src = vld1q_s16(&dst[i]);
+ const int16x8_t xy = vqrdmulhq_n_s16(v_src, cos128 << 3);
+ vst1q_s16(&dst[i], xy);
+ i += 8;
+ } while (i < width);
+ }
+
+ // Copy first row to the rest of the block.
+ for (int y = 1; y < height; ++y) {
+ memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+ }
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(int16x8_t* s) {
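+ // The stage numbers here appear to index into the combined flow graph
+ // shared with the larger dct sizes (Dct8Stages and Dct16Stages cover the
+ // remaining stages), which is why the 4-point transform starts at stage 12.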
+ // stage 12.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+ ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+ } else {
+ butterfly_rotation(&s[0], &s[1], 32, true);
+ butterfly_rotation(&s[2], &s[3], 48, false);
+ }
+
+ // stage 17.
+ HadamardRotation(&s[0], &s[3], false);
+ HadamardRotation(&s[1], &s[2], false);
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[4], x[4];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ assert(step == 4);
+ int16x8x4_t y = vld4q_s16(dst);
+ for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+ } else {
+ LoadSrc<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ assert(step == 4);
+ int16x4x4_t y = vld4_s16(dst);
+ for (int i = 0; i < 4; ++i) x[i] = vcombine_s16(y.val[i], y.val[i]);
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 2, 1, 3
+ s[0] = x[0];
+ s[1] = x[2];
+ s[2] = x[1];
+ s[3] = x[3];
+
+ Dct4Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = s[i];
+ vst4q_s16(dst, y);
+ } else {
+ StoreDst<16, 4>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ int16x4x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = vget_low_s16(s[i]);
+ vst4_s16(dst, y);
+ } else {
+ StoreDst<8, 4>(dst, step, 0, s);
+ }
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(int16x8_t* s) {
+ // stage 8.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+ ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+ } else {
+ butterfly_rotation(&s[4], &s[7], 56, false);
+ butterfly_rotation(&s[5], &s[6], 24, false);
+ }
+
+ // stage 13.
+ HadamardRotation(&s[4], &s[5], false);
+ HadamardRotation(&s[6], &s[7], true);
+
+ // stage 18.
+ butterfly_rotation(&s[6], &s[5], 32, true);
+
+ // stage 22.
+ HadamardRotation(&s[0], &s[7], false);
+ HadamardRotation(&s[1], &s[6], false);
+ HadamardRotation(&s[2], &s[5], false);
+ HadamardRotation(&s[3], &s[4], false);
+}
+
+// Process dct8 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[8], x[8];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8(input, x);
+ } else {
+ LoadSrc<8, 8>(dst, step, 0, x);
+ }
+ } else if (transpose) {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ dsp::Transpose8x8(x);
+ } else {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+ s[0] = x[0];
+ s[1] = x[4];
+ s[2] = x[2];
+ s[3] = x[6];
+ s[4] = x[1];
+ s[5] = x[5];
+ s[6] = x[3];
+ s[7] = x[7];
+
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t output[4];
+ Transpose4x8To8x4(s, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ } else {
+ StoreDst<8, 8>(dst, step, 0, s);
+ }
+ } else if (transpose) {
+ dsp::Transpose8x8(s);
+ StoreDst<16, 8>(dst, step, 0, s);
+ } else {
+ StoreDst<16, 8>(dst, step, 0, s);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(int16x8_t* s) {
+ // stage 5.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+ ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+ ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+ ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+ } else {
+ butterfly_rotation(&s[8], &s[15], 60, false);
+ butterfly_rotation(&s[9], &s[14], 28, false);
+ butterfly_rotation(&s[10], &s[13], 44, false);
+ butterfly_rotation(&s[11], &s[12], 12, false);
+ }
+
+ // stage 9.
+ HadamardRotation(&s[8], &s[9], false);
+ HadamardRotation(&s[10], &s[11], true);
+ HadamardRotation(&s[12], &s[13], false);
+ HadamardRotation(&s[14], &s[15], true);
+
+ // stage 14.
+ butterfly_rotation(&s[14], &s[9], 48, true);
+ butterfly_rotation(&s[13], &s[10], 112, true);
+
+ // stage 19.
+ HadamardRotation(&s[8], &s[11], false);
+ HadamardRotation(&s[9], &s[10], false);
+ HadamardRotation(&s[12], &s[15], true);
+ HadamardRotation(&s[13], &s[14], true);
+
+ // stage 23.
+ butterfly_rotation(&s[13], &s[10], 32, true);
+ butterfly_rotation(&s[12], &s[11], 32, true);
+
+ // stage 26.
+ HadamardRotation(&s[0], &s[15], false);
+ HadamardRotation(&s[1], &s[14], false);
+ HadamardRotation(&s[2], &s[13], false);
+ HadamardRotation(&s[3], &s[12], false);
+ HadamardRotation(&s[4], &s[11], false);
+ HadamardRotation(&s[5], &s[10], false);
+ HadamardRotation(&s[6], &s[9], false);
+ HadamardRotation(&s[7], &s[8], false);
+}
+
+// Process dct16 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[16], x[16];
+
+ if (stage_is_rectangular) {
+ if (is_row) {
+ int16x8_t input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8(input, x);
+ LoadSrc<16, 4>(dst, step, 8, input);
+ Transpose8x4To4x8(input, &x[8]);
+ } else {
+ LoadSrc<8, 16>(dst, step, 0, x);
+ }
+ } else if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
+ }
+ } else {
+ LoadSrc<16, 16>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ s[0] = x[0];
+ s[1] = x[8];
+ s[2] = x[4];
+ s[3] = x[12];
+ s[4] = x[2];
+ s[5] = x[10];
+ s[6] = x[6];
+ s[7] = x[14];
+ s[8] = x[1];
+ s[9] = x[9];
+ s[10] = x[5];
+ s[11] = x[13];
+ s[12] = x[3];
+ s[13] = x[11];
+ s[14] = x[7];
+ s[15] = x[15];
+
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+ Dct16Stages<butterfly_rotation>(s);
+
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ for (auto& i : s) {
+ i = vqrshlq_s16(i, v_row_shift);
+ }
+ }
+
+ if (stage_is_rectangular) {
+ if (is_row) {
+ int16x8_t output[4];
+ Transpose4x8To8x4(s, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ Transpose4x8To8x4(&s[8], output);
+ StoreDst<16, 4>(dst, step, 8, output);
+ } else {
+ StoreDst<8, 16>(dst, step, 0, s);
+ }
+ } else if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ dsp::Transpose8x8(&s[idx]);
+ StoreDst<16, 8>(dst, step, idx, &s[idx]);
+ }
+ } else {
+ StoreDst<16, 16>(dst, step, 0, s);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(int16x8_t* s) {
+ // stage 3
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+ ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+ ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+ ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+ ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+ ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+ ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+ ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+ } else {
+ butterfly_rotation(&s[16], &s[31], 62, false);
+ butterfly_rotation(&s[17], &s[30], 30, false);
+ butterfly_rotation(&s[18], &s[29], 46, false);
+ butterfly_rotation(&s[19], &s[28], 14, false);
+ butterfly_rotation(&s[20], &s[27], 54, false);
+ butterfly_rotation(&s[21], &s[26], 22, false);
+ butterfly_rotation(&s[22], &s[25], 38, false);
+ butterfly_rotation(&s[23], &s[24], 6, false);
+ }
+ // stage 6.
+ HadamardRotation(&s[16], &s[17], false);
+ HadamardRotation(&s[18], &s[19], true);
+ HadamardRotation(&s[20], &s[21], false);
+ HadamardRotation(&s[22], &s[23], true);
+ HadamardRotation(&s[24], &s[25], false);
+ HadamardRotation(&s[26], &s[27], true);
+ HadamardRotation(&s[28], &s[29], false);
+ HadamardRotation(&s[30], &s[31], true);
+
+ // stage 10.
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+ butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+ butterfly_rotation(&s[26], &s[21], 24, true);
+ butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+ // stage 15.
+ HadamardRotation(&s[16], &s[19], false);
+ HadamardRotation(&s[17], &s[18], false);
+ HadamardRotation(&s[20], &s[23], true);
+ HadamardRotation(&s[21], &s[22], true);
+ HadamardRotation(&s[24], &s[27], false);
+ HadamardRotation(&s[25], &s[26], false);
+ HadamardRotation(&s[28], &s[31], true);
+ HadamardRotation(&s[29], &s[30], true);
+
+ // stage 20.
+ butterfly_rotation(&s[29], &s[18], 48, true);
+ butterfly_rotation(&s[28], &s[19], 48, true);
+ butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+ butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+ // stage 24.
+ HadamardRotation(&s[16], &s[23], false);
+ HadamardRotation(&s[17], &s[22], false);
+ HadamardRotation(&s[18], &s[21], false);
+ HadamardRotation(&s[19], &s[20], false);
+ HadamardRotation(&s[24], &s[31], true);
+ HadamardRotation(&s[25], &s[30], true);
+ HadamardRotation(&s[26], &s[29], true);
+ HadamardRotation(&s[27], &s[28], true);
+
+ // stage 27.
+ butterfly_rotation(&s[27], &s[20], 32, true);
+ butterfly_rotation(&s[26], &s[21], 32, true);
+ butterfly_rotation(&s[25], &s[22], 32, true);
+ butterfly_rotation(&s[24], &s[23], 32, true);
+
+ // stage 29.
+ HadamardRotation(&s[0], &s[31], false);
+ HadamardRotation(&s[1], &s[30], false);
+ HadamardRotation(&s[2], &s[29], false);
+ HadamardRotation(&s[3], &s[28], false);
+ HadamardRotation(&s[4], &s[27], false);
+ HadamardRotation(&s[5], &s[26], false);
+ HadamardRotation(&s[6], &s[25], false);
+ HadamardRotation(&s[7], &s[24], false);
+ HadamardRotation(&s[8], &s[23], false);
+ HadamardRotation(&s[9], &s[22], false);
+ HadamardRotation(&s[10], &s[21], false);
+ HadamardRotation(&s[11], &s[20], false);
+ HadamardRotation(&s[12], &s[19], false);
+ HadamardRotation(&s[13], &s[18], false);
+ HadamardRotation(&s[14], &s[17], false);
+ HadamardRotation(&s[15], &s[16], false);
+}
+
+// Process dct32 rows or columns, depending on the |is_row| flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
+ const bool is_row, int row_shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[32], x[32];
+
+ if (is_row) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
+ }
+ } else {
+ LoadSrc<16, 32>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ s[0] = x[0];
+ s[1] = x[16];
+ s[2] = x[8];
+ s[3] = x[24];
+ s[4] = x[4];
+ s[5] = x[20];
+ s[6] = x[12];
+ s[7] = x[28];
+ s[8] = x[2];
+ s[9] = x[18];
+ s[10] = x[10];
+ s[11] = x[26];
+ s[12] = x[6];
+ s[13] = x[22];
+ s[14] = x[14];
+ s[15] = x[30];
+
+ // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ s[16] = x[1];
+ s[17] = x[17];
+ s[18] = x[9];
+ s[19] = x[25];
+ s[20] = x[5];
+ s[21] = x[21];
+ s[22] = x[13];
+ s[23] = x[29];
+ s[24] = x[3];
+ s[25] = x[19];
+ s[26] = x[11];
+ s[27] = x[27];
+ s[28] = x[7];
+ s[29] = x[23];
+ s[30] = x[15];
+ s[31] = x[31];
+
+ Dct4Stages<ButterflyRotation_8>(s);
+ Dct8Stages<ButterflyRotation_8>(s);
+ Dct16Stages<ButterflyRotation_8>(s);
+ Dct32Stages<ButterflyRotation_8>(s);
+
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ for (int idx = 0; idx < 32; idx += 8) {
+ int16x8_t output[8];
+ Transpose8x8(&s[idx], output);
+ for (auto& o : output) {
+ o = vqrshlq_s16(o, v_row_shift);
+ }
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 32>(dst, step, 0, s);
+ }
+}
+
+// Allow the compiler to call this function instead of force-inlining it.
+// Tests show the performance is slightly faster.
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[64], x[32];
+
+ if (is_row) {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
+ }
+ } else {
+ // The last 32 values of every column are always zero if the |tx_height| is
+ // 64.
+ LoadSrc<16, 32>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ s[0] = x[0];
+ s[2] = x[16];
+ s[4] = x[8];
+ s[6] = x[24];
+ s[8] = x[4];
+ s[10] = x[20];
+ s[12] = x[12];
+ s[14] = x[28];
+
+ // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ s[16] = x[2];
+ s[18] = x[18];
+ s[20] = x[10];
+ s[22] = x[26];
+ s[24] = x[6];
+ s[26] = x[22];
+ s[28] = x[14];
+ s[30] = x[30];
+
+ // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ s[32] = x[1];
+ s[34] = x[17];
+ s[36] = x[9];
+ s[38] = x[25];
+ s[40] = x[5];
+ s[42] = x[21];
+ s[44] = x[13];
+ s[46] = x[29];
+
+ // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+ s[48] = x[3];
+ s[50] = x[19];
+ s[52] = x[11];
+ s[54] = x[27];
+ s[56] = x[7];
+ s[58] = x[23];
+ s[60] = x[15];
+ s[62] = x[31];
+
+ Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+
+ //-- start dct 64 stages
+ // stage 2.
+ ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+ ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+ ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+ ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+ ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+ ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+ ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+ ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+ ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+ ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+ ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+ ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+ ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+ ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+ ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+ ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+ // stage 4.
+ HadamardRotation(&s[32], &s[33], false);
+ HadamardRotation(&s[34], &s[35], true);
+ HadamardRotation(&s[36], &s[37], false);
+ HadamardRotation(&s[38], &s[39], true);
+ HadamardRotation(&s[40], &s[41], false);
+ HadamardRotation(&s[42], &s[43], true);
+ HadamardRotation(&s[44], &s[45], false);
+ HadamardRotation(&s[46], &s[47], true);
+ HadamardRotation(&s[48], &s[49], false);
+ HadamardRotation(&s[50], &s[51], true);
+ HadamardRotation(&s[52], &s[53], false);
+ HadamardRotation(&s[54], &s[55], true);
+ HadamardRotation(&s[56], &s[57], false);
+ HadamardRotation(&s[58], &s[59], true);
+ HadamardRotation(&s[60], &s[61], false);
+ HadamardRotation(&s[62], &s[63], true);
+
+ // stage 7.
+ ButterflyRotation_8(&s[62], &s[33], 60 - 0, true);
+ ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true);
+ ButterflyRotation_8(&s[58], &s[37], 60 - 32, true);
+ ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true);
+ ButterflyRotation_8(&s[54], &s[41], 60 - 16, true);
+ ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true);
+ ButterflyRotation_8(&s[50], &s[45], 60 - 48, true);
+ ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true);
+
+ // stage 11.
+ HadamardRotation(&s[32], &s[35], false);
+ HadamardRotation(&s[33], &s[34], false);
+ HadamardRotation(&s[36], &s[39], true);
+ HadamardRotation(&s[37], &s[38], true);
+ HadamardRotation(&s[40], &s[43], false);
+ HadamardRotation(&s[41], &s[42], false);
+ HadamardRotation(&s[44], &s[47], true);
+ HadamardRotation(&s[45], &s[46], true);
+ HadamardRotation(&s[48], &s[51], false);
+ HadamardRotation(&s[49], &s[50], false);
+ HadamardRotation(&s[52], &s[55], true);
+ HadamardRotation(&s[53], &s[54], true);
+ HadamardRotation(&s[56], &s[59], false);
+ HadamardRotation(&s[57], &s[58], false);
+ HadamardRotation(&s[60], &s[63], true);
+ HadamardRotation(&s[61], &s[62], true);
+
+ // stage 16.
+ ButterflyRotation_8(&s[61], &s[34], 56, true);
+ ButterflyRotation_8(&s[60], &s[35], 56, true);
+ ButterflyRotation_8(&s[59], &s[36], 56 + 64, true);
+ ButterflyRotation_8(&s[58], &s[37], 56 + 64, true);
+ ButterflyRotation_8(&s[53], &s[42], 56 - 32, true);
+ ButterflyRotation_8(&s[52], &s[43], 56 - 32, true);
+ ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true);
+ ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true);
+
+ // stage 21.
+ HadamardRotation(&s[32], &s[39], false);
+ HadamardRotation(&s[33], &s[38], false);
+ HadamardRotation(&s[34], &s[37], false);
+ HadamardRotation(&s[35], &s[36], false);
+ HadamardRotation(&s[40], &s[47], true);
+ HadamardRotation(&s[41], &s[46], true);
+ HadamardRotation(&s[42], &s[45], true);
+ HadamardRotation(&s[43], &s[44], true);
+ HadamardRotation(&s[48], &s[55], false);
+ HadamardRotation(&s[49], &s[54], false);
+ HadamardRotation(&s[50], &s[53], false);
+ HadamardRotation(&s[51], &s[52], false);
+ HadamardRotation(&s[56], &s[63], true);
+ HadamardRotation(&s[57], &s[62], true);
+ HadamardRotation(&s[58], &s[61], true);
+ HadamardRotation(&s[59], &s[60], true);
+
+ // stage 25.
+ ButterflyRotation_8(&s[59], &s[36], 48, true);
+ ButterflyRotation_8(&s[58], &s[37], 48, true);
+ ButterflyRotation_8(&s[57], &s[38], 48, true);
+ ButterflyRotation_8(&s[56], &s[39], 48, true);
+ ButterflyRotation_8(&s[55], &s[40], 112, true);
+ ButterflyRotation_8(&s[54], &s[41], 112, true);
+ ButterflyRotation_8(&s[53], &s[42], 112, true);
+ ButterflyRotation_8(&s[52], &s[43], 112, true);
+
+ // stage 28.
+ HadamardRotation(&s[32], &s[47], false);
+ HadamardRotation(&s[33], &s[46], false);
+ HadamardRotation(&s[34], &s[45], false);
+ HadamardRotation(&s[35], &s[44], false);
+ HadamardRotation(&s[36], &s[43], false);
+ HadamardRotation(&s[37], &s[42], false);
+ HadamardRotation(&s[38], &s[41], false);
+ HadamardRotation(&s[39], &s[40], false);
+ HadamardRotation(&s[48], &s[63], true);
+ HadamardRotation(&s[49], &s[62], true);
+ HadamardRotation(&s[50], &s[61], true);
+ HadamardRotation(&s[51], &s[60], true);
+ HadamardRotation(&s[52], &s[59], true);
+ HadamardRotation(&s[53], &s[58], true);
+ HadamardRotation(&s[54], &s[57], true);
+ HadamardRotation(&s[55], &s[56], true);
+
+ // stage 30.
+ ButterflyRotation_8(&s[55], &s[40], 32, true);
+ ButterflyRotation_8(&s[54], &s[41], 32, true);
+ ButterflyRotation_8(&s[53], &s[42], 32, true);
+ ButterflyRotation_8(&s[52], &s[43], 32, true);
+ ButterflyRotation_8(&s[51], &s[44], 32, true);
+ ButterflyRotation_8(&s[50], &s[45], 32, true);
+ ButterflyRotation_8(&s[49], &s[46], 32, true);
+ ButterflyRotation_8(&s[48], &s[47], 32, true);
+
+ // stage 31.
+ for (int i = 0; i < 32; i += 4) {
+ HadamardRotation(&s[i], &s[63 - i], false);
+ HadamardRotation(&s[i + 1], &s[63 - i - 1], false);
+ HadamardRotation(&s[i + 2], &s[63 - i - 2], false);
+ HadamardRotation(&s[i + 3], &s[63 - i - 3], false);
+ }
+ //-- end dct 64 stages
+
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ for (int idx = 0; idx < 64; idx += 8) {
+ int16x8_t output[8];
+ Transpose8x8(&s[idx], output);
+ for (auto& o : output) {
+ o = vqrshlq_s16(o, v_row_shift);
+ }
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 64>(dst, step, 0, s);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+
+LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int32x4_t s[7];
+ int16x4_t x[4];
+
+ if (transpose) {
+ assert(step == 4);
+ int16x4x4_t y = vld4_s16(dst);
+ for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+ } else {
+ x[0] = vld1_s16(dst);
+ x[1] = vld1_s16(dst + 1 * step);
+ x[2] = vld1_s16(dst + 2 * step);
+ x[3] = vld1_s16(dst + 3 * step);
+ }
+
+ // stage 1.
+ s[5] = vmull_n_s16(x[3], kAdst4Multiplier[1]);
+ s[6] = vmull_n_s16(x[3], kAdst4Multiplier[3]);
+
+ // stage 2.
+ const int32x4_t a7 = vsubl_s16(x[0], x[2]);
+ const int32x4_t b7 = vaddw_s16(a7, x[3]);
+
+ // stage 3.
+ s[0] = vmull_n_s16(x[0], kAdst4Multiplier[0]);
+ s[1] = vmull_n_s16(x[0], kAdst4Multiplier[1]);
+ // s[0] = s[0] + s[3]
+ s[0] = vmlal_n_s16(s[0], x[2], kAdst4Multiplier[3]);
+ // s[1] = s[1] - s[4]
+ s[1] = vmlsl_n_s16(s[1], x[2], kAdst4Multiplier[0]);
+
+ s[3] = vmull_n_s16(x[1], kAdst4Multiplier[2]);
+ s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
+
+ // stage 4.
+ s[0] = vaddq_s32(s[0], s[5]);
+ s[1] = vsubq_s32(s[1], s[6]);
+
+ // stages 5 and 6.
+ const int32x4_t x0 = vaddq_s32(s[0], s[3]);
+ const int32x4_t x1 = vaddq_s32(s[1], s[3]);
+ const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
+ const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
+ const int16x4_t dst_0 = vqrshrn_n_s32(x0, 12);
+ const int16x4_t dst_1 = vqrshrn_n_s32(x1, 12);
+ const int16x4_t dst_2 = vqrshrn_n_s32(s[2], 12);
+ const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
+
+ x[0] = dst_0;
+ x[1] = dst_1;
+ x[2] = dst_2;
+ x[3] = dst_3;
+
+ if (transpose) {
+ int16x4x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = x[i];
+ vst4_s16(dst, y);
+ } else {
+ vst1_s16(dst, x[0]);
+ vst1_s16(dst + 1 * step, x[1]);
+ vst1_s16(dst + 2 * step, x[2]);
+ vst1_s16(dst + 3 * step, x[3]);
+ }
+}
+
+alignas(8) constexpr int16_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
+ 2482};
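+// Note the last entry is kAdst4Multiplier[1] (2482) rather than [3]: the
+// fourth DC-only ADST output is s0 * (kAdst4Multiplier[0] +
+// kAdst4Multiplier[1]), and the missing s0 * kAdst4Multiplier[0] term is
+// supplied by the vextq_s32() in Adst4DcOnly().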
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int32x4_t s[2];
+
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+ const int16x4_t v_src_round =
+ vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+ const int16x4_t kAdst4DcOnlyMultipliers = vld1_s16(kAdst4DcOnlyMultiplier);
+ s[1] = vdupq_n_s32(0);
+
+ // s0*k0 s0*k1 s0*k2 s0*k1
+ s[0] = vmull_s16(kAdst4DcOnlyMultipliers, v_src);
+ // 0 0 0 s0*k0
+ s[1] = vextq_s32(s[1], s[0], 1);
+
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int16x4_t dst_0 = vqrshrn_n_s32(x3, 12);
+
+ // vqrshl_s16 will shift right if the shift value is negative.
+ vst1_s16(dst, vqrshl_s16(dst_0, vdup_n_s16(-row_shift)));
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int32x4_t s[4];
+
+ int i = 0;
+ do {
+ const int16x4_t v_src = vld1_s16(&dst[i]);
+
+ s[0] = vmull_n_s16(v_src, kAdst4Multiplier[0]);
+ s[1] = vmull_n_s16(v_src, kAdst4Multiplier[1]);
+ s[2] = vmull_n_s16(v_src, kAdst4Multiplier[2]);
+
+ const int32x4_t x0 = s[0];
+ const int32x4_t x1 = s[1];
+ const int32x4_t x2 = s[2];
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int16x4_t dst_0 = vqrshrn_n_s32(x0, 12);
+ const int16x4_t dst_1 = vqrshrn_n_s32(x1, 12);
+ const int16x4_t dst_2 = vqrshrn_n_s32(x2, 12);
+ const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
+
+ vst1_s16(&dst[i], dst_0);
+ vst1_s16(&dst[i + width * 1], dst_1);
+ vst1_s16(&dst[i + width * 2], dst_2);
+ vst1_s16(&dst[i + width * 3], dst_3);
+
+ i += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[8], x[8];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8(input, x);
+ } else {
+ LoadSrc<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ dsp::Transpose8x8(x);
+ } else {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ s[0] = x[7];
+ s[1] = x[0];
+ s[2] = x[5];
+ s[3] = x[2];
+ s[4] = x[3];
+ s[5] = x[4];
+ s[6] = x[1];
+ s[7] = x[6];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+ butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+ butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[4], false);
+ HadamardRotation(&s[1], &s[5], false);
+ HadamardRotation(&s[2], &s[6], false);
+ HadamardRotation(&s[3], &s[7], false);
+
+ // stage 4.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[2], false);
+ HadamardRotation(&s[4], &s[6], false);
+ HadamardRotation(&s[1], &s[3], false);
+ HadamardRotation(&s[5], &s[7], false);
+
+ // stage 6.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s16(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s16(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s16(s[1]);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ int16x8_t output[4];
+ Transpose4x8To8x4(x, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ } else {
+ StoreDst<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ dsp::Transpose8x8(x);
+ StoreDst<16, 8>(dst, step, 0, x);
+ } else {
+ StoreDst<16, 8>(dst, step, 0, x);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int16x8_t s[8];
+
+ const int16x8_t v_src = vdupq_n_s16(dst[0]);
+ const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+ const int16x8_t v_src_round =
+ vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+ // stage 1.
+ s[1] = vbslq_s16(v_mask, v_src_round, v_src);
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int16x8_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s16(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s16(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s16(s[1]);
+
+ for (int i = 0; i < 8; ++i) {
+ // vqrshlq_s16 will shift right if shift value is negative.
+ x[i] = vqrshlq_s16(x[i], vdupq_n_s16(-row_shift));
+ vst1q_lane_s16(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int16x8_t s[8];
+
+ int i = 0;
+ do {
+ const int16x8_t v_src = vld1q_s16(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int16x8_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s16(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s16(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s16(s[1]);
+
+ for (int j = 0; j < 8; ++j) {
+ vst1_s16(&dst[j * width], vget_low_s16(x[j]));
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ int16x8_t s[16], x[16];
+
+ if (stage_is_rectangular) {
+ if (is_row) {
+ int16x8_t input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8(input, x);
+ LoadSrc<16, 4>(dst, step, 8, input);
+ Transpose8x4To4x8(input, &x[8]);
+ } else {
+ LoadSrc<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
+ }
+ } else {
+ LoadSrc<16, 16>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ s[0] = x[15];
+ s[1] = x[0];
+ s[2] = x[13];
+ s[3] = x[2];
+ s[4] = x[11];
+ s[5] = x[4];
+ s[6] = x[9];
+ s[7] = x[6];
+ s[8] = x[7];
+ s[9] = x[8];
+ s[10] = x[5];
+ s[11] = x[10];
+ s[12] = x[3];
+ s[13] = x[12];
+ s[14] = x[1];
+ s[15] = x[14];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+ butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+ butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+ butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+ butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+ butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+ butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[8], false);
+ HadamardRotation(&s[1], &s[9], false);
+ HadamardRotation(&s[2], &s[10], false);
+ HadamardRotation(&s[3], &s[11], false);
+ HadamardRotation(&s[4], &s[12], false);
+ HadamardRotation(&s[5], &s[13], false);
+ HadamardRotation(&s[6], &s[14], false);
+ HadamardRotation(&s[7], &s[15], false);
+
+ // stage 4.
+ butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+ butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+ butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[4], false);
+ HadamardRotation(&s[8], &s[12], false);
+ HadamardRotation(&s[1], &s[5], false);
+ HadamardRotation(&s[9], &s[13], false);
+ HadamardRotation(&s[2], &s[6], false);
+ HadamardRotation(&s[10], &s[14], false);
+ HadamardRotation(&s[3], &s[7], false);
+ HadamardRotation(&s[11], &s[15], false);
+
+ // stage 6.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+ // stage 7.
+ HadamardRotation(&s[0], &s[2], false);
+ HadamardRotation(&s[4], &s[6], false);
+ HadamardRotation(&s[8], &s[10], false);
+ HadamardRotation(&s[12], &s[14], false);
+ HadamardRotation(&s[1], &s[3], false);
+ HadamardRotation(&s[5], &s[7], false);
+ HadamardRotation(&s[9], &s[11], false);
+ HadamardRotation(&s[13], &s[15], false);
+
+ // stage 8.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[10], &s[11], 32, true);
+ butterfly_rotation(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s16(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s16(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s16(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s16(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s16(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s16(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s16(s[1]);
+
+ if (stage_is_rectangular) {
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ int16x8_t output[4];
+ Transpose4x8To8x4(x, output);
+ for (auto& o : output) {
+ o = vqrshlq_s16(o, v_row_shift);
+ }
+ StoreDst<16, 4>(dst, step, 0, output);
+ Transpose4x8To8x4(&x[8], output);
+ for (auto& o : output) {
+ o = vqrshlq_s16(o, v_row_shift);
+ }
+ StoreDst<16, 4>(dst, step, 8, output);
+ } else {
+ StoreDst<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (is_row) {
+ const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+ for (int idx = 0; idx < 16; idx += 8) {
+ int16x8_t output[8];
+ Transpose8x8(&x[idx], output);
+ for (auto& o : output) {
+ o = vqrshlq_s16(o, v_row_shift);
+ }
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 16>(dst, step, 0, x);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int16x8_t* s, int16x8_t* x) {
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+ // stage 3.
+ s[8] = s[0];
+ s[9] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+ // stage 5.
+ s[4] = s[0];
+ s[12] = s[8];
+ s[5] = s[1];
+ s[13] = s[9];
+
+ // stage 6.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+ ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+ // stage 7.
+ s[2] = s[0];
+ s[6] = s[4];
+ s[10] = s[8];
+ s[14] = s[12];
+ s[3] = s[1];
+ s[7] = s[5];
+ s[11] = s[9];
+ s[15] = s[13];
+
+ // stage 8.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+ ButterflyRotation_4(&s[10], &s[11], 32, true);
+ ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s16(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s16(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s16(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s16(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s16(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s16(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s16(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s16(s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int16x8_t s[16];
+ int16x8_t x[16];
+
+ const int16x8_t v_src = vdupq_n_s16(dst[0]);
+ const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+ const int16x8_t v_src_round =
+ vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+ // stage 1.
+ s[1] = vbslq_s16(v_mask, v_src_round, v_src);
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int i = 0; i < 16; ++i) {
+ // vqrshlq_s16 will shift right if shift value is negative.
+ x[i] = vqrshlq_s16(x[i], vdupq_n_s16(-row_shift));
+ vst1q_lane_s16(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+ int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int i = 0;
+ do {
+ int16x8_t s[16];
+ int16x8_t x[16];
+ const int16x8_t v_src = vld1q_s16(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int j = 0; j < 16; ++j) {
+ vst1_s16(&dst[j * width], vget_low_s16(x[j]));
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+template <bool is_row_shift>
+LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ if (is_row_shift) {
+ const int shift = 1;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int16x4_t v_multiplier = vdup_n_s16(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+ for (int i = 0; i < 4; i += 2) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+ const int32x4_t v_src_mult_lo =
+ vmlal_s16(v_dual_round, vget_low_s16(v_src), v_multiplier);
+ const int32x4_t v_src_mult_hi =
+ vmlal_s16(v_dual_round, vget_high_s16(v_src), v_multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+ vst1q_s16(&dst[i * step],
+ vcombine_s16(vqmovn_s32(shift_lo), vqmovn_s32(shift_hi)));
+ }
+ } else {
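+ // The identity4 row multiplier is sqrt(2), which does not fit in a Q15
+ // vqrdmulh constant, so it is applied as x + x * (sqrt(2) - 1);
+ // kIdentity4MultiplierFraction is assumed here to be sqrt(2) - 1 in Q12,
+ // with the << 3 promoting it to Q15.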
+ for (int i = 0; i < 4; i += 2) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+ const int16x8_t a =
+ vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+ const int16x8_t b = vqaddq_s16(v_src, a);
+ vst1q_s16(&dst[i * step], b);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+ const int16x4_t v_src_round =
+ vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+ const int shift = tx_height < 16 ? 0 : 1;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int16x4_t v_multiplier = vdup_n_s16(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+ const int32x4_t v_src_mult_lo = vmlal_s16(v_dual_round, v_src, v_multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
+ return true;
+}
+
+template <int identity_size>
+LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height,
+ const int16_t* LIBGAV1_RESTRICT source) {
+ const int stride = frame.columns();
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+ if (identity_size < 32) {
+ if (tx_width == 4) {
+ uint8x8_t frame_data = vdup_n_u8(0);
+ int i = 0;
+ do {
+ const int16x4_t v_src = vld1_s16(&source[i * tx_width]);
+
+ int16x4_t v_dst_i;
+ if (identity_size == 4) {
+ const int16x4_t v_src_fraction =
+ vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+ v_dst_i = vqadd_s16(v_src, v_src_fraction);
+ } else if (identity_size == 8) {
+ v_dst_i = vqadd_s16(v_src, v_src);
+ } else { // identity_size == 16
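+ // identity16 scales by 2 * sqrt(2), computed here as
+ // 2 * x + x * (2 * sqrt(2) - 2); the extra shift (<< 4 instead of << 3)
+ // doubles the (sqrt(2) - 1) fraction.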
+ const int16x4_t v_src_mult =
+ vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 4);
+ const int16x4_t v_srcx2 = vqadd_s16(v_src, v_src);
+ v_dst_i = vqadd_s16(v_srcx2, v_src_mult);
+ }
+
+ frame_data = Load4<0>(dst, frame_data);
+ const int16x4_t a = vrshr_n_s16(v_dst_i, 4);
+ const uint16x8_t b =
+ vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ StoreLo4(dst, d);
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const int16x8_t v_src = vld1q_s16(&source[row + j]);
+
+ int16x8_t v_dst_i;
+ if (identity_size == 4) {
+ const int16x8_t v_src_fraction =
+ vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+ v_dst_i = vqaddq_s16(v_src, v_src_fraction);
+ } else if (identity_size == 8) {
+ v_dst_i = vqaddq_s16(v_src, v_src);
+ } else { // identity_size == 16
+ const int16x8_t v_src_mult =
+ vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 4);
+ const int16x8_t v_srcx2 = vqaddq_s16(v_src, v_src);
+ v_dst_i = vqaddq_s16(v_src_mult, v_srcx2);
+ }
+
+ const uint8x8_t frame_data = vld1_u8(dst + j);
+ const int16x8_t a = vrshrq_n_s16(v_dst_i, 4);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ vst1_u8(dst + j, d);
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const int16x8_t v_dst_i = vld1q_s16(&source[row + j]);
+ const uint8x8_t frame_data = vld1_u8(dst + j);
+ const int16x8_t a = vrshrq_n_s16(v_dst_i, 2);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ vst1_u8(dst + j, d);
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height,
+ const int16_t* LIBGAV1_RESTRICT source) {
+ const int stride = frame.columns();
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+ if (tx_width == 4) {
+ uint8x8_t frame_data = vdup_n_u8(0);
+ int i = 0;
+ do {
+ const int16x4_t v_src = vld1_s16(&source[i * tx_width]);
+ const int16x4_t v_src_mult =
+ vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+ const int16x4_t v_dst_row = vqadd_s16(v_src, v_src_mult);
+ const int16x4_t v_src_mult2 =
+ vqrdmulh_n_s16(v_dst_row, kIdentity4MultiplierFraction << 3);
+ const int16x4_t v_dst_col = vqadd_s16(v_dst_row, v_src_mult2);
+ frame_data = Load4<0>(dst, frame_data);
+ const int16x4_t a = vrshr_n_s16(v_dst_col, 4);
+ const uint16x8_t b =
+ vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ StoreLo4(dst, d);
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const int16x8_t v_src = vld1q_s16(&source[row + j]);
+ const int16x8_t v_src_round =
+ vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+ const int16x8_t v_dst_row = vqaddq_s16(v_src_round, v_src_round);
+ const int16x8_t v_src_mult2 =
+ vqrdmulhq_n_s16(v_dst_row, kIdentity4MultiplierFraction << 3);
+ const int16x8_t v_dst_col = vqaddq_s16(v_dst_row, v_src_mult2);
+ const uint8x8_t frame_data = vld1_u8(dst + j);
+ const int16x8_t a = vrshrq_n_s16(v_dst_col, 4);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ vst1_u8(dst + j, d);
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height equal to 32 can be simplified from
+ // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
+ for (int i = 0; i < 4; ++i) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+ const int16x8_t a = vrshrq_n_s16(v_src, 1);
+ vst1q_s16(&dst[i * step], a);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ for (int i = 0; i < 4; ++i) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+ // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+ // saturating add here is ok.
+ const int16x8_t v_srcx2 = vqaddq_s16(v_src, v_src);
+ vst1q_s16(&dst[i * step], v_srcx2);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+ const int16x4_t v_src_round =
+ vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
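+ // Double in 32 bits so the row shift can round the full value before it
+ // is saturated back to 16 bits by vqmovn_s32.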
+ const int32x4_t v_srcx2 = vaddl_s16(v_src, v_src);
+ const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
+ vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
+ int shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step + j * 8]);
+ const int32x4_t v_src_mult_lo =
+ vmlal_n_s16(v_dual_round, vget_low_s16(v_src), kIdentity16Multiplier);
+ const int32x4_t v_src_mult_hi = vmlal_n_s16(
+ v_dual_round, vget_high_s16(v_src), kIdentity16Multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+ vst1q_s16(&dst[i * step + j * 8],
+ vcombine_s16(vqmovn_s32(shift_lo), vqmovn_s32(shift_hi)));
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+ const int16x4_t v_src_round =
+ vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int16x4_t v_multiplier = vdup_n_s16(kIdentity16Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+ const int32x4_t v_src_mult_lo =
+ vmlal_s16(v_dual_round, v_src, v_multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest,
+ const int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+ // (((A * 4) + 1) >> 1) to (A * 2).
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 32; j += 8) {
+ const int16x8_t v_src = vld1q_s16(&dst[i * step + j]);
+ // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+ // saturating add here is ok.
+ const int16x8_t v_dst_i = vqaddq_s16(v_src, v_src);
+ vst1q_s16(&dst[i * step + j], v_dst_i);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+ int adjusted_tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+ const int16x4_t v_src = vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+ // (((A * 4) + 1) >> 1) to (A * 2).
+ const int16x4_t v_dst_0 = vqadd_s16(v_src, v_src);
+ vst1_lane_s16(dst, v_dst_0, 0);
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+// Transposes a 4x4 matrix and then permutes the rows of the transposed matrix
+// for the WHT. The input matrix is in two "wide" int16x8_t variables. The
+// output matrix is in four int16x4_t variables.
+//
+// Input:
+// in[0]: 00 01 02 03 10 11 12 13
+// in[1]: 20 21 22 23 30 31 32 33
+// Output:
+// out[0]: 00 10 20 30
+// out[1]: 03 13 23 33
+// out[2]: 01 11 21 31
+// out[3]: 02 12 22 32
+LIBGAV1_ALWAYS_INLINE void TransposeAndPermute4x4WideInput(
+ const int16x8_t in[2], int16x4_t out[4]) {
+ // Swap 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03 10 11 12 13
+ // in[1]: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const int32x4x2_t b0 =
+ vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[1]));
+
+ // Swap 16 bit elements. Goes from:
+ // vget_low_s32(b0.val[0]): 00 01 20 21
+ // vget_high_s32(b0.val[0]): 10 11 30 31
+ // vget_low_s32(b0.val[1]): 02 03 22 23
+ // vget_high_s32(b0.val[1]): 12 13 32 33
+ // to:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 01 11 21 31
+ // c1.val[0]: 02 12 22 32
+ // c1.val[1]: 03 13 23 33
+
+ const int16x4x2_t c0 =
+ vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[0])),
+ vreinterpret_s16_s32(vget_high_s32(b0.val[0])));
+ const int16x4x2_t c1 =
+ vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[1])),
+ vreinterpret_s16_s32(vget_high_s32(b0.val[1])));
+
+ out[0] = c0.val[0];
+ out[1] = c1.val[1];
+ out[2] = c0.val[1];
+ out[3] = c1.val[0];
+}
+
+// Process 4 wht4 rows and columns.
+LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint8_t* LIBGAV1_RESTRICT dst,
+ const int dst_stride,
+ const void* LIBGAV1_RESTRICT source,
+ const int adjusted_tx_height) {
+ const auto* const src = static_cast<const int16_t*>(source);
+ int16x4_t s[4];
+
+ if (adjusted_tx_height == 1) {
+ // Special case: only src[0] is nonzero.
+ // src[0] 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ //
+ // After the row and column transforms are applied, we have:
+ // f h h h
+ // g i i i
+ // g i i i
+ // g i i i
+ // where f, g, h, i are computed as follows.
+ int16_t f = (src[0] >> 2) - (src[0] >> 3);
+ const int16_t g = f >> 1;
+ f = f - (f >> 1);
+ const int16_t h = (src[0] >> 3) - (src[0] >> 4);
+ const int16_t i = (src[0] >> 4);
+ s[0] = vdup_n_s16(h);
+ s[0] = vset_lane_s16(f, s[0], 0);
+ s[1] = vdup_n_s16(i);
+ s[1] = vset_lane_s16(g, s[1], 0);
+ s[2] = s[3] = s[1];
+ } else {
+ // Load the 4x4 source in transposed form.
+ int16x4x4_t columns = vld4_s16(src);
+ // Shift right and permute the columns for the WHT.
+ s[0] = vshr_n_s16(columns.val[0], 2);
+ s[2] = vshr_n_s16(columns.val[1], 2);
+ s[3] = vshr_n_s16(columns.val[2], 2);
+ s[1] = vshr_n_s16(columns.val[3], 2);
+
+ // Row transforms.
+ s[0] = vadd_s16(s[0], s[2]);
+ s[3] = vsub_s16(s[3], s[1]);
+ int16x4_t e = vhsub_s16(s[0], s[3]); // e = (s[0] - s[3]) >> 1
+ s[1] = vsub_s16(e, s[1]);
+ s[2] = vsub_s16(e, s[2]);
+ s[0] = vsub_s16(s[0], s[1]);
+ s[3] = vadd_s16(s[3], s[2]);
+
+ int16x8_t x[2];
+ x[0] = vcombine_s16(s[0], s[1]);
+ x[1] = vcombine_s16(s[2], s[3]);
+ TransposeAndPermute4x4WideInput(x, s);
+
+ // Column transforms.
+ s[0] = vadd_s16(s[0], s[2]);
+ s[3] = vsub_s16(s[3], s[1]);
+ e = vhsub_s16(s[0], s[3]); // e = (s[0] - s[3]) >> 1
+ s[1] = vsub_s16(e, s[1]);
+ s[2] = vsub_s16(e, s[2]);
+ s[0] = vsub_s16(s[0], s[1]);
+ s[3] = vadd_s16(s[3], s[2]);
+ }
+
+ // Store to frame.
+ uint8x8_t frame_data = vdup_n_u8(0);
+ for (int row = 0; row < 4; row += 2) {
+ frame_data = Load4<0>(dst, frame_data);
+ frame_data = Load4<1>(dst + dst_stride, frame_data);
+ const int16x8_t residual = vcombine_s16(s[row], s[row + 1]);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(residual), frame_data);
+ frame_data = vqmovun_s16(vreinterpretq_s16_u16(b));
+ StoreLo4(dst, frame_data);
+ dst += dst_stride;
+ StoreHi4(dst, frame_data);
+ dst += dst_stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
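+// FlipColumns reverses each row of |source| in place (a left/right flip of
+// the block). Flipped transform types only occur with ADST, whose largest
+// dimension in AV1 is 16, so each row is at most 16 values wide and one
+// iteration of the widest loop reverses a full row.
+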
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
+ if (tx_width >= 16) {
+ int i = 0;
+ do {
+ const int16x8_t a = vld1q_s16(&source[i]);
+ const int16x8_t b = vld1q_s16(&source[i + 8]);
+ const int16x8_t c = vrev64q_s16(a);
+ const int16x8_t d = vrev64q_s16(b);
+ vst1q_s16(&source[i], vcombine_s16(vget_high_s16(d), vget_low_s16(d)));
+ vst1q_s16(&source[i + 8],
+ vcombine_s16(vget_high_s16(c), vget_low_s16(c)));
+ i += 16;
+ } while (i < tx_width * tx_height);
+ } else if (tx_width == 8) {
+ for (int i = 0; i < 8 * tx_height; i += 8) {
+ const int16x8_t a = vld1q_s16(&source[i]);
+ const int16x8_t b = vrev64q_s16(a);
+ vst1q_s16(&source[i], vcombine_s16(vget_high_s16(b), vget_low_s16(b)));
+ }
+ } else {
+ // Process two rows per iteration.
+ for (int i = 0; i < 4 * tx_height; i += 8) {
+ const int16x8_t a = vld1q_s16(&source[i]);
+ vst1q_s16(&source[i], vrev64q_s16(a));
+ }
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
+ if (tx_width == 4) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const int16x8_t a = vld1q_s16(&source[i]);
+ const int16x8_t b = vqrdmulhq_n_s16(a, kTransformRowMultiplier << 3);
+ vst1q_s16(&source[i], b);
+ i += 8;
+ } while (i < tx_width * num_rows);
+ } else {
+ int i = 0;
+ do {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ const int non_zero_width = (tx_width < 64) ? tx_width : 32;
+ int j = 0;
+ do {
+ const int16x8_t a = vld1q_s16(&source[i * tx_width + j]);
+ const int16x8_t b = vqrdmulhq_n_s16(a, kTransformRowMultiplier << 3);
+ vst1q_s16(&source[i * tx_width + j], b);
+ j += 8;
+ } while (j < non_zero_width);
+ } while (++i < num_rows);
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
+ int row_shift) {
+ // vqrshlq_s16 will shift right if shift value is negative.
+ row_shift = -row_shift;
+
+ if (tx_width == 4) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const int16x8_t residual = vld1q_s16(&source[i]);
+ vst1q_s16(&source[i], vqrshlq_s16(residual, vdupq_n_s16(row_shift)));
+ i += 8;
+ } while (i < tx_width * num_rows);
+ } else {
+ int i = 0;
+ do {
+ for (int j = 0; j < tx_width; j += 8) {
+ const int16x8_t residual = vld1q_s16(&source[i * tx_width + j]);
+ const int16x8_t residual_shifted =
+ vqrshlq_s16(residual, vdupq_n_s16(row_shift));
+ vst1q_s16(&source[i * tx_width + j], residual_shifted);
+ }
+ } while (++i < num_rows);
+ }
+}
+
+template <int tx_height, bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int16_t* LIBGAV1_RESTRICT source,
+ TransformType tx_type) {
+ const bool flip_rows =
+ enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+ const int stride = frame.columns();
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
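+ // The final inverse-transform shift for 8-bit content is 4, so each
+ // residual below is rounded and shifted right by 4 before being added to
+ // the frame data and saturated back to 8 bits.
+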
+ // Enable for 4x4, 4x8, 4x16
+ if (tx_height < 32 && tx_width == 4) {
+ uint8x8_t frame_data = vdup_n_u8(0);
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+ const int16x4_t residual = vld1_s16(&source[row]);
+ frame_data = Load4<0>(dst, frame_data);
+ const int16x4_t a = vrshr_n_s16(residual, 4);
+ const uint16x8_t b =
+ vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ StoreLo4(dst, d);
+ dst += stride;
+ }
+ // Enable for 8x4, 8x8, 8x16, 8x32
+ } else if (tx_height < 64 && tx_width == 8) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
+ const int16x8_t residual = vld1q_s16(&source[row]);
+ const uint8x8_t frame_data = vld1_u8(dst);
+ const int16x8_t a = vrshrq_n_s16(residual, 4);
+ const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+ vst1_u8(dst, d);
+ dst += stride;
+ }
+ // Remaining widths >= 16.
+ } else {
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+ int j = 0;
+ do {
+ const int x = start_x + j;
+ const int16x8_t residual = vld1q_s16(&source[row + j]);
+ const int16x8_t residual_hi = vld1q_s16(&source[row + j + 8]);
+ const uint8x16_t frame_data = vld1q_u8(frame[y] + x);
+ const int16x8_t a = vrshrq_n_s16(residual, 4);
+ const int16x8_t a_hi = vrshrq_n_s16(residual_hi, 4);
+ const uint16x8_t b =
+ vaddw_u8(vreinterpretq_u16_s16(a), vget_low_u8(frame_data));
+ const uint16x8_t b_hi =
+ vaddw_u8(vreinterpretq_u16_s16(a_hi), vget_high_u8(frame_data));
+ vst1q_u8(frame[y] + x,
+ vcombine_u8(vqmovun_s16(vreinterpretq_s16_u16(b)),
+ vqmovun_s16(vreinterpretq_s16_u16(b_hi))));
+ j += 16;
+ } while (j < tx_width);
+ }
+ }
+}
+
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ const int row_shift = static_cast<int>(tx_height == 16);
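+ // For 4-point rows, only 4x8 gets the rectangular rounding
+ // (log2(w) + log2(h) is odd) and only 4x16 gets a row shift of 1, so both
+ // flags are computed directly instead of via the kShouldRound and
+ // kTransformRowShift tables used by the larger row loops.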
+
+ if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d dct4 rows in parallel.
+ Dct4_NEON<ButterflyRotation_4, false>(src, /*step=*/4, /*transpose=*/true);
+ } else {
+ // Process 8 1d dct4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_8, true>(data, /*step=*/4,
+ /*transpose=*/true);
+ data += 32;
+ i -= 8;
+ } while (i != 0);
+ }
+ if (tx_height == 16) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct4 columns in parallel.
+ Dct4_NEON<ButterflyRotation_4, false>(src, tx_width, /*transpose=*/false);
+ } else {
+ // Process 8 1d dct4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_8, true>(data, tx_width,
+ /*transpose=*/false);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d dct8 rows in parallel.
+ Dct8_NEON<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+ } else {
+ // Process 8 1d dct8 rows in parallel per iteration.
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_8, false>(data, /*step=*/8,
+ /*transpose=*/true);
+ data += 64;
+ i -= 8;
+ } while (i != 0);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct8 columns in parallel.
+ Dct8_NEON<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ // Process 8 1d dct8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_8, false>(data, tx_width,
+ /*transpose=*/false);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d dct16 rows in parallel.
+ Dct16_NEON<ButterflyRotation_4, true>(src, 16, /*is_row=*/true, row_shift);
+ } else {
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ do {
+ // Process 8 1d dct16 rows in parallel per iteration.
+ Dct16_NEON<ButterflyRotation_8, false>(src, 16, /*is_row=*/true,
+ row_shift);
+ src += 128;
+ i -= 8;
+ } while (i != 0);
+ }
+}
+
+void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct16 columns in parallel.
+ Dct16_NEON<ButterflyRotation_4, true>(src, 4, /*is_row=*/false,
+ /*row_shift=*/0);
+ } else {
+ int i = tx_width;
+ auto* data = src;
+ do {
+ // Process 8 1d dct16 columns in parallel per iteration.
+ Dct16_NEON<ButterflyRotation_8, false>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<32>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct32 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct32_NEON(&src[i * 32], 32, /*is_row=*/true, row_shift);
+ i += 8;
+ } while (i < adjusted_tx_height);
+}
+
+void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct32 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<64>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct64 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct64_NEON(&src[i * 64], 64, /*is_row=*/true, row_shift);
+ i += 8;
+ } while (i < adjusted_tx_height);
+}
+
+void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct64 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const int row_shift = static_cast<int>(tx_height == 16);
+ const bool should_round = (tx_height == 8);
+
+ if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst4_NEON(data, /*step=*/4, /*transpose=*/true);
+ data += 16;
+ i -= 4;
+ } while (i != 0);
+
+ if (tx_height == 16) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst4_NEON(data, tx_width, /*transpose=*/false);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d adst8 rows in parallel.
+ Adst8_NEON<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+ } else {
+ // Process 8 1d adst8 rows in parallel per iteration.
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_8, false>(data, /*step=*/8,
+ /*transpose=*/true);
+ data += 64;
+ i -= 8;
+ } while (i != 0);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d adst8 columns in parallel.
+ Adst8_NEON<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ // Process 8 1d adst8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_8, false>(data, tx_width,
+ /*transpose=*/false);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d adst16 rows in parallel.
+ Adst16_NEON<ButterflyRotation_4, true>(src, 16, /*is_row=*/true, row_shift);
+ } else {
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ do {
+ // Process 8 1d adst16 rows in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_8, false>(src, 16, /*is_row=*/true,
+ row_shift);
+ src += 128;
+ i -= 8;
+ } while (i != 0);
+ }
+}
+
+void Adst16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d adst16 columns in parallel.
+ Adst16_NEON<ButterflyRotation_4, true>(src, 4, /*is_row=*/false,
+ /*row_shift=*/0);
+ } else {
+ int i = tx_width;
+ auto* data = src;
+ do {
+ // Process 8 1d adst16 columns in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_8, false>(
+ data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Identity4TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize4x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+
+ if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+ if (tx_height < 16) {
+ int i = adjusted_tx_height;
+ do {
+ Identity4_NEON<false>(src, /*step=*/4);
+ src += 16;
+ i -= 4;
+ } while (i != 0);
+ } else {
+ int i = adjusted_tx_height;
+ do {
+ Identity4_NEON<true>(src, /*step=*/4);
+ src += 16;
+ i -= 4;
+ } while (i != 0);
+ }
+}
+
+void Identity4TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ // Special case: Process row calculations during column transform call.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+ Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+ return;
+ }
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize8x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 16 can be simplified
+ // from ((A * 2) + 1) >> 1 to A.
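+ // For example, with A = 5: ((5 * 2) + 1) >> 1 == 11 >> 1 == 5.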
+ if ((tx_height & 0x18) != 0) {
+ return;
+ }
+ if (tx_height == 32) {
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row32_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+ return;
+ }
+
+ assert(tx_size == kTransformSize8x4);
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row4_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity8TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+ int i = adjusted_tx_height;
+ do {
+ Identity16Row_NEON(src, /*step=*/16, row_shift);
+ src += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ const int tx_height = kTransformHeight[tx_size];
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 32 can be simplified
+ // from ((A * 4) + 2) >> 2 to A.
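+ // For example, with A = 7: ((7 * 4) + 2) >> 2 == 30 >> 2 == 7.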
+ if ((tx_height & 0x28) != 0) {
+ return;
+ }
+
+ // Process kTransformSize32x16. The src is always rounded before the
+ // identity transform and shifted by 1 afterwards.
+ auto* src = static_cast<int16_t*>(src_buffer);
+ if (Identity32DcOnly(src, adjusted_tx_height)) {
+ return;
+ }
+
+ assert(tx_size == kTransformSize32x16);
+ ApplyRounding<32>(src, adjusted_tx_height);
+ int i = adjusted_tx_height;
+ do {
+ Identity32Row16_NEON(src, /*step=*/32);
+ src += 128;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity32TransformLoopColumn_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_NEON(TransformType tx_type, TransformSize tx_size,
+ int /*adjusted_tx_height*/, void* /*src_buffer*/,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+ // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+
+ // Process 4 1d wht4 rows and columns in parallel.
+ const auto* src = static_cast<int16_t*>(src_buffer);
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ uint8_t* dst = frame[start_y] + start_x;
+ const int dst_stride = frame.columns();
+ Wht4_NEON(dst, dst_stride, src, adjusted_tx_height);
+}
+
+//------------------------------------------------------------------------------
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ // Maximum transform size for Dct is 64.
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+ Dct4TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+ Dct4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+ Dct8TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+ Dct8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+ Dct16TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+ Dct16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+ Dct32TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+ Dct32TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+ Dct64TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+ Dct64TransformLoopColumn_NEON;
+
+ // Maximum transform size for Adst is 16.
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+ Adst4TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+ Adst4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+ Adst8TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+ Adst8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+ Adst16TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+ Adst16TransformLoopColumn_NEON;
+
+ // Maximum transform size for Identity transform is 32.
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+ Identity4TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+ Identity4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+ Identity8TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+ Identity8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+ Identity16TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+ Identity16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+ Identity32TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+ Identity32TransformLoopColumn_NEON;
+
+ // Maximum transform size for Wht is 4.
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+ Wht4TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+ Wht4TransformLoopColumn_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void InverseTransformInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms, see the defines below for specifics.
+// This function is not thread-safe.
+void InverseTransformInit_NEON();
+void InverseTransformInit10bpp_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize32_Transform1dDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize64_Transform1dDct LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dAdst LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_Transform1dSize32_Transform1dIdentity LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dWht LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) {
+ const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh));
+ return vorr_u16(vget_low_u16(a), vget_high_u16(a));
+}
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0,
+ const uint16x4_t q0, const uint16x4_t q1,
+ const uint16_t outer_thresh) {
+ const uint16x4_t abd_p0q0 = vabd_u16(p0, q0);
+ const uint16x4_t abd_p1q1 = vabd_u16(p1, q1);
+ const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1);
+ const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1);
+ const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half);
+ return vcle_u16(sum, vdup_n_u16(outer_thresh));
+}
+
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// OuterThreshold()
+inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+// OuterThreshold()
+inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p1p2_q1q2,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
+// OuterThreshold()
+inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p1p2_q1q2,
+ const uint16x8_t abd_p2p3_q2q3,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
+ const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// -----------------------------------------------------------------------------
+// FilterNMasks functions.
+
+inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
+ const uint16_t hev_thresh, const uint16x4_t outer_mask,
+ const uint16_t inner_thresh,
+ uint16x4_t* const hev_mask,
+ uint16x4_t* const needs_filter4_mask) {
+ const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ // This includes cases where NeedsFilter4() is not true and so Filter2() will
+ // not be applied.
+ const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
+
+ *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask);
+
+ // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
+ *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p0p2_q0q2) {
+ constexpr int flat_thresh = 1 << 2;
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
+ const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh));
+ return vand_u16(vget_low_u16(b), vget_high_u16(b));
+}
+
+inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, const uint16_t hev_thresh,
+ const uint16x4_t outer_mask,
+ const uint16_t inner_thresh,
+ uint16x4_t* const needs_filter6_mask,
+ uint16x4_t* const is_flat3_mask,
+ uint16x4_t* const hev_mask) {
+ const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+ *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2));
+ *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2),
+ inner_thresh, outer_mask);
+}
+
+// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0,
+ const uint16x8_t abd_pn1p0_qn1q0,
+ const uint16x8_t abd_pn2p0_qn2q0) {
+ constexpr int flat_thresh = 1 << 2;
+ const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0);
+ const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0);
+ const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh));
+ return vand_u16(vget_low_u16(c), vget_high_u16(c));
+}
+
+inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2,
+ const uint16x8_t p1q1, const uint16x8_t p0q0,
+ const uint16_t hev_thresh, const uint16x4_t outer_mask,
+ const uint16_t inner_thresh,
+ uint16x4_t* const needs_filter8_mask,
+ uint16x4_t* const is_flat4_mask,
+ uint16x4_t* const hev_mask) {
+ const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+ const uint16x4_t is_flat4 =
+ IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3));
+ *needs_filter8_mask =
+ NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3),
+ inner_thresh, outer_mask);
+ // |is_flat4_mask| is used to decide where to use the result of Filter8.
+ // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false,
+ // overriding the question of whether to use Filter8. Because Filter4 doesn't
+ // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the
+ // source value. To be correct, the mask must account for this override.
+ *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask);
+}
+
+// -----------------------------------------------------------------------------
+// FilterN functions.
+
+// Calculate Filter4() or Filter2() based on |hev_mask|.
+inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
+ const uint16x8_t p1q1, const uint16x4_t hev_mask,
+ uint16x8_t* const p1q1_result,
+ uint16x8_t* const p0q0_result) {
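+ // vextq_u16(p0q0, p1q1, 4) concatenates the upper half of |p0q0| with the
+ // lower half of |p1q1|, yielding {q0, p1}.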
+ const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);
+ // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+ // q0mp0 means "q0 minus p0".
+ const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
+ const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+ // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
+ const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/)));
+ const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1);
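+ // For 10-bit input, differences are clamped to the signed range [-512, 511].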
+ const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+ const int16x4_t p1mq1_saturated =
+ Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel);
+ const int16x4_t hev_option =
+ vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);
+
+ const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
+
+ // TODO: Investigate this sequence; it inherits some tricks from the 8bpp
+ // code, where the smallest vector covers an 8x8 block, and some of them may
+ // be unnecessary at this bitdepth.
+
+ // We cannot use a rounding shift because the clamp comes *before* the
+ // shift:
+ // a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+ // a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+ const int16x4_t plus_four =
+ Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
+ const int16x4_t plus_three =
+ Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
+ const int16x4_t a1 = vshr_n_s16(plus_four, 3);
+ const int16x4_t a2 = vshr_n_s16(plus_three, 3);
+
+ // a3 = (a1 + 1) >> 1;
+ const int16x4_t a3 = vrshr_n_s16(a1, 1);
+
+ const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));
+ const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
+
+ // Need to shift the second term or we end up with a2_ma2.
+ const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
+ const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
+ *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10);
+ *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10);
+}
+
+void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
+ int outer_thresh, int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+ auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+ auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+
+ const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0),
+ vld1_u16(dst_q0), vld1_u16(dst_q1)};
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
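+ // The threshold values are specified on an 8-bit scale; shift by
+ // bitdepth - 8 (= 2) to match the 10-bit pixel values.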
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+ const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+ Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+ &needs_filter4_mask);
+
+#if defined(__aarch64__)
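+ // vaddv_u16 sums the four mask lanes; a zero sum means no lane is set.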
+ if (vaddv_u16(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter4_mask_8 =
+ vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+ uint16x8_t f_p1q1;
+ uint16x8_t f_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
+
+ // Already integrated the Hev mask when calculating the filtered values.
+ const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+ // p1/q1 are unmodified if only Hev() is true. This works because |hev_mask|
+ // was AND'd with |needs_filter4_mask| previously.
+ const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+ const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ // Offset by 2 uint16_t values to load from first p1 position.
+ auto* dst = static_cast<uint8_t*>(dest) - 4;
+ auto* dst_p1 = reinterpret_cast<uint16_t*>(dst);
+ auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2);
+ auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3);
+
+ uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1)};
+ Transpose4x4(src);
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+ const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+ Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+ &needs_filter4_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter4_mask_8 =
+ vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+ uint16x8_t f_p1q1;
+ uint16x8_t f_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
+
+ // Already integrated the Hev mask when calculating the filtered values.
+ const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+ // p1/q1 are unmodified if only Hev() is true. This works because |hev_mask|
+ // was AND'd with |needs_filter4_mask| previously.
+ const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+ const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+ uint16x4_t output[4] = {
+ vget_low_u16(p1q1_output),
+ vget_low_u16(p0q0_output),
+ vget_high_u16(p0q0_output),
+ vget_high_u16(p1q1_output),
+ };
+ Transpose4x4(output);
+
+ vst1_u16(dst_p1, output[0]);
+ vst1_u16(dst_p0, output[1]);
+ vst1_u16(dst_q0, output[2]);
+ vst1_u16(dst_q1, output[3]);
+}
+
+inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, uint16x8_t* const p1q1_output,
+ uint16x8_t* const p0q0_output) {
+ // Sum p1 and q1 output from opposite directions.
+ // The formula is regrouped to allow 3 doubling operations to be combined.
+ //
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+ // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^^^^^^
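+ // The tap weights (3 + 2 + 2 + 1) sum to 8, hence the rounding shift by 3
+ // at the end.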
+ uint16x8_t sum = vaddq_u16(p2q2, p1q1);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^
+ sum = vaddq_u16(sum, p0q0);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^
+ sum = vshlq_n_u16(sum, 1);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^ ^^^^^^
+ // Should dual issue with the left shift.
+ const uint16x8_t q0p0 = Transpose64(p0q0);
+ const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
+ sum = vaddq_u16(sum, outer_sum);
+
+ *p1q1_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - (2 * p2) + q0 + q1
+ // q0 = q1 - (2 * q2) + p0 + p1
+ // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+ // ^^^^^^^^
+ const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
+ // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+ // ^^^^^^^^
+ sum = vsubq_u16(sum, p2q2_double);
+ const uint16x8_t q1p1 = Transpose64(p1q1);
+ sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
+
+ *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
+ int outer_thresh, int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+ auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+ auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+ auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+
+ const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1),
+ vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1), vld1_u16(dst_q2)};
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat3_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+ const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+ const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+ Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
+ // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
+ // output is not used.
+ uint16x8_t f6_p1q1, f6_p0q0;
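+ // Reinterpreting the mask as u64 lets a single scalar compare test all
+ // four 16-bit lanes.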
+ const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+ if (vget_lane_u64(need_filter6, 0) == 0) {
+ // Filter6() does not apply, but Filter4() applies to one or more values.
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+ p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ // Left side of the filter window.
+ auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t);
+ auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+ // Overread by 2 values. These overreads become the high halves of src_raw[2]
+ // and src_raw[3] after transpose.
+ uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3)};
+ Transpose4x8(src_raw);
+ // p2, p1, p0, q0, q1, q2
+ const uint16x4_t src[6] = {
+ vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]),
+ vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]),
+ vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]),
+ };
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat3_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+ const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+ const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+ Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
+ // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
+ // output is not used.
+ uint16x8_t f6_p1q1, f6_p0q0;
+ const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+ if (vget_lane_u64(need_filter6, 0) == 0) {
+ // Filter6() does not apply, but Filter4() applies to one or more values.
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+ p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ uint16x4_t output[4] = {
+ vget_low_u16(p1q1_output),
+ vget_low_u16(p0q0_output),
+ vget_high_u16(p0q0_output),
+ vget_high_u16(p1q1_output),
+ };
+ Transpose4x4(output);
+
+ // dst_n starts at p2, so adjust to p1.
+ vst1_u16(dst_0 + 1, output[0]);
+ vst1_u16(dst_1 + 1, output[1]);
+ vst1_u16(dst_2 + 1, output[2]);
+ vst1_u16(dst_3 + 1, output[3]);
+}
+
+inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
+ const uint16x8_t p1q1, const uint16x8_t p0q0,
+ uint16x8_t* const p2q2_output,
+ uint16x8_t* const p1q1_output,
+ uint16x8_t* const p0q0_output) {
+ // Sum p2 and q2 output from opposite directions.
+ // The formula is regrouped to allow 2 doubling operations to be combined.
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^^^^^^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^^^^^^^
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^
+ const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^
+ uint16x8_t sum = vshlq_n_u16(p23q23, 1);
+
+ // Add two other terms to make dual issue with the shift more likely.
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^
+ const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^^^
+ sum = vaddq_u16(sum, p01q01);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^
+ sum = vaddq_u16(sum, p3q3);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^
+ const uint16x8_t q0p0 = Transpose64(p0q0);
+ sum = vaddq_u16(sum, q0p0);
+
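+ // The tap weights (3 + 2 + 1 + 1 + 1) sum to 8, hence the rounding shift
+ // by 3.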
+ *p2q2_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p3 - p2 + p1 + q1
+ // q1 = q2 - q3 - q2 + q1 + p1
+ sum = vsubq_u16(sum, p23q23);
+ const uint16x8_t q1p1 = Transpose64(p1q1);
+ sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
+
+ *p1q1_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p3 - p1 + p0 + q2
+ // q0 = q1 - q3 - q1 + q0 + p2
+ sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
+ const uint16x8_t q2p2 = Transpose64(p2q2);
+ sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
+
+ *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
+ int outer_thresh, int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
+ auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+ auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+ auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+ auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+ const uint16x4_t src[8] = {
+ vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0),
+ vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)};
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]);
+ const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]);
+ const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]);
+ const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]);
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+ // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // Filter8() does not apply, but Filter4() applies to one or more values.
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t is_flat4_mask_8 =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+ vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+}
+
+inline uint16x8_t ReverseLowHalf(const uint16x8_t a) {
+ return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a));
+}
+
+void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t);
+ auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+ // src[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
+ // To get the desired pairs after the transpose, one half should be reversed.
+ uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3)};
+
+ // src[0] = p0q0
+ // src[1] = p1q1
+ // src[2] = p2q2
+ // src[3] = p3q3
+ LoopFilterTranspose4x8(src);
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask = OuterThreshold(
+ vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]),
+ vget_high_u16(src[1]), outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = src[0];
+ const uint16x8_t p1q1 = src[1];
+ const uint16x8_t p2q2 = src[2];
+ const uint16x8_t p3q3 = src[3];
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+ // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+ // output is not used.
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // Filter8() does not apply, but Filter4() applies to one or more values.
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t is_flat4_mask_8 =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3};
+ // After transpose, |output| will contain rows of the form:
+ // p0 p1 p2 p3 q0 q1 q2 q3
+ Transpose4x8(output);
+
+ // Reverse p values to produce original order:
+ // p3 p2 p1 p0 q0 q1 q2 q3
+ vst1q_u16(dst_0, ReverseLowHalf(output[0]));
+ vst1q_u16(dst_1, ReverseLowHalf(output[1]));
+ vst1q_u16(dst_2, ReverseLowHalf(output[2]));
+ vst1q_u16(dst_3, ReverseLowHalf(output[3]));
+}
+
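+// Computes the 14-tap outputs with a sliding sum: the full tap sum is built
+// once for p5/q5, then each later output subtracts the taps leaving the
+// window and adds the taps entering it. The total tap weight stays at 16 for
+// every output, so the same rounding shift (>> 4) applies throughout.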
+inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5,
+ const uint16x8_t p4q4, const uint16x8_t p3q3,
+ const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, uint16x8_t* const p5q5_output,
+ uint16x8_t* const p4q4_output,
+ uint16x8_t* const p3q3_output,
+ uint16x8_t* const p2q2_output,
+ uint16x8_t* const p1q1_output,
+ uint16x8_t* const p0q0_output) {
+ // Sum p5 and q5 output from opposite directions.
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
+ const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^^^^^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^^^^^^^^^^^^
+ uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
+ sum = vaddq_u16(sum, p6q6_x7);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^
+ const uint16x8_t q0p0 = Transpose64(p0q0);
+ sum = vaddq_u16(sum, q0p0);
+
+ *p5q5_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p4 and q4 output:
+ // p4 = p5 - (2 * p6) + p3 + q1
+ // q4 = q5 - (2 * q6) + q3 + p1
+ sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
+ const uint16x8_t q1p1 = Transpose64(p1q1);
+ sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
+
+ *p4q4_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p3 and q3 output:
+ // p3 = p4 - p6 - p5 + p2 + q2
+ // q3 = q4 - q6 - q5 + q2 + p2
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
+ const uint16x8_t q2p2 = Transpose64(p2q2);
+ sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
+
+ *p3q3_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p2 and q2 output:
+ // p2 = p3 - p6 - p4 + p1 + q3
+ // q2 = q3 - q6 - q4 + q1 + p3
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
+ const uint16x8_t q3p3 = Transpose64(p3q3);
+ sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
+
+ *p2q2_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p6 - p3 + p0 + q4
+ // q1 = q2 - q6 - q3 + q0 + p4
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
+ const uint16x8_t q4p4 = Transpose64(p4q4);
+ sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
+
+ *p1q1_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p6 - p2 + q0 + q5
+ // q0 = q1 - q6 - q2 + p0 + p5
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
+ const uint16x8_t q5p5 = Transpose64(p5q5);
+ sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
+
+ *p0q0_output = vrshrq_n_u16(sum, 4);
+}
+
+void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
+ int outer_thresh, int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride);
+ auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride);
+ auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride);
+ auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
+ auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+ auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+ auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+ auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+ auto* const dst_q4 = reinterpret_cast<uint16_t*>(dst + 4 * stride);
+ auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride);
+ auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride);
+
+ const uint16x4_t src[14] = {
+ vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
+ vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
+ vld1_u16(dst_q5), vld1_u16(dst_q6)};
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]);
+ const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]);
+ const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]);
+ const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]);
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+ const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
+ const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
+ const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
+ // Mask to choose between the outputs of Filter8 and Filter14.
+ // As with the derivation of |is_flat4_mask|, the question of whether to use
+ // Filter14 is only raised where |is_flat4_mask| is true.
+ const uint16x4_t is_flat4_outer_mask = vand_u16(
+ is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+ vabdq_u16(p0q0, p6q6)));
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+ p5q5_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+ // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+ // output is not used.
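+  // |is_flat4_outer_mask| then promotes Filter8 output to Filter14 output,
+  // making the vbslq_u16 chains below a three-level priority select.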
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // Filter8() and Filter14() do not apply, but Filter4() applies to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t use_filter8_mask =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+ if (vget_lane_u64(need_filter14, 0) == 0) {
+ // Filter14() does not apply, but Filter8() and Filter4() apply to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ } else {
+ // All filters may contribute values to final outputs.
+ const uint16x8_t use_filter14_mask =
+ vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+ uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+ p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+ p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+ p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+ p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+ p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+ p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+ }
+
+ vst1_u16(dst_p5, vget_low_u16(p5q5_output));
+ vst1_u16(dst_p4, vget_low_u16(p4q4_output));
+ vst1_u16(dst_p3, vget_low_u16(p3q3_output));
+ vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+ vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+ vst1_u16(dst_q3, vget_high_u16(p3q3_output));
+ vst1_u16(dst_q4, vget_high_u16(p4q4_output));
+ vst1_u16(dst_q5, vget_high_u16(p5q5_output));
+}
+
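+// Given ab = a|b and cd = c|d (64-bit halves), returns {a|c, d|b}. For
+// example, PermuteACDB64(p5q5, p1q1) yields val[0] = p5|p1 and
+// val[1] = q1|q5, the pairings Vertical14_NEON needs ahead of its final
+// transposes.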
+inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) {
+ uint16x8x2_t acdb;
+#if defined(__aarch64__)
+ // a[b] <- [c]d
+ acdb.val[0] = vreinterpretq_u16_u64(
+ vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd)));
+ // [a]b <- c[d]
+ acdb.val[1] = vreinterpretq_u16_u64(
+ vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab)));
+#else
+ // a[b] <- [c]d
+ acdb.val[0] = vreinterpretq_u16_u64(
+ vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0),
+ vreinterpretq_u64_u16(ab), 1));
+ // [a]b <- c[d]
+ acdb.val[1] = vreinterpretq_u16_u64(
+ vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1),
+ vreinterpretq_u64_u16(ab), 0));
+#endif // defined(__aarch64__)
+ return acdb;
+}
+
+void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t);
+ auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+ // Low halves: p7 p6 p5 p4
+ // High halves: p3 p2 p1 p0
+ uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3)};
+ // p7 will be the low half of src_p[0]. Not used until the end.
+ Transpose4x8(src_p);
+
+ // Low halves: q0 q1 q2 q3
+ // High halves: q4 q5 q6 q7
+ uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
+ vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)};
+ // q7 will be the high half of src_q[3]. Not used until the end.
+ Transpose4x8(src_q);
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask = OuterThreshold(
+ vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]),
+ vget_low_u16(src_q[1]), outer_thresh);
+ const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4);
+ const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4);
+ const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4);
+ const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+ const uint16x8_t p4q4 =
+ vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
+ const uint16x8_t p5q5 =
+ vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1]));
+ const uint16x8_t p6q6 =
+ vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2]));
+ const uint16x8_t p7q7 =
+ vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3]));
+ // Mask to choose between the outputs of Filter8 and Filter14.
+ // As with the derivation of |is_flat4_mask|, the question of whether to use
+ // Filter14 is only raised where |is_flat4_mask| is true.
+ const uint16x4_t is_flat4_outer_mask = vand_u16(
+ is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+ vabdq_u16(p0q0, p6q6)));
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+ p5q5_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+ // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // Filter8() and Filter14() do not apply, but Filter4() applies to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t use_filter8_mask =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+ if (vget_lane_u64(need_filter14, 0) == 0) {
+ // Filter14() does not apply, but Filter8() and Filter4() apply to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ } else {
+ // All filters may contribute values to final outputs.
+ const uint16x8_t use_filter14_mask =
+ vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+ uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+ p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+ p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+ p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+ p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+ p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+ p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+ }
+ // To get the correctly ordered rows from the transpose, we need:
+ // p7p3 p6p2 p5p1 p4p0
+ // q0q4 q1q5 q2q6 q3q7
+ const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output);
+ const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output);
+ const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output);
+ const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output);
+ uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0],
+ p5p1_q1q5.val[0], p4p0_q0q4.val[0]};
+ Transpose4x8(output_p);
+ uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1],
+ p6p2_q2q6.val[1], p7p3_q3q7.val[1]};
+ Transpose4x8(output_q);
+
+  // The PermuteACDB64() pairings above already restore the original column
+  // order, so each row can be stored directly as:
+  // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ vst1q_u16(dst_0, output_p[0]);
+ vst1q_u16(dst_0 + 8, output_q[0]);
+ vst1q_u16(dst_1, output_p[1]);
+ vst1q_u16(dst_1 + 8, output_q[1]);
+ vst1q_u16(dst_2, output_p[2]);
+ vst1q_u16(dst_2 + 8, output_q[2]);
+ vst1q_u16(dst_3, output_p[3]);
+ vst1q_u16(dst_3 + 8, output_q[3]);
+}
+
+} // namespace
+
+void LoopFilterInit10bpp_NEON() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Horizontal4_NEON;
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Horizontal6_NEON;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Horizontal8_NEON;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Horizontal14_NEON;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Vertical14_NEON;
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit10bpp_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
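+// The 8 bpp helpers below pack four p values into the low 32 bits and four q
+// values into the high 32 bits of a single uint8x8_t. RightShiftVector<32>()
+// moves the q-lane results down onto the p lanes so a single vand/vorr
+// combines both conditions; the combined mask is valid in the low 32 bits
+// and is duplicated across the vector with InterleaveLow32() before use.
+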
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) {
+ const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh));
+ return vorr_u8(a, RightShiftVector<32>(a));
+}
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
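+// vqadd_u8 saturates rather than wraps, so an out-of-range sum clamps to 255
+// and still fails the comparison against any smaller threshold.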
+inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t outer_thresh) {
+ const uint8x8x2_t a = Interleave32(p0q0, p1q1);
+ const uint8x8_t b = vabd_u8(a.val[0], a.val[1]);
+ const uint8x8_t p0q0_double = vqadd_u8(b, b);
+ const uint8x8_t p1q1_half = RightShiftVector<32>(vshr_n_u8(b, 1));
+ const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half);
+ return vcle_u8(c, vdup_n_u8(outer_thresh));
+}
+
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// OuterThreshold()
+inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1,
+ const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t inner_thresh,
+ const uint8_t outer_thresh) {
+ const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh));
+ const uint8x8_t inner_mask = vand_u8(a, RightShiftVector<32>(a));
+ const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+ return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter4Masks(const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t hev_thresh, const uint8_t outer_thresh,
+ const uint8_t inner_thresh, uint8x8_t* const hev_mask,
+ uint8x8_t* const needs_filter4_mask) {
+ // First half is |p0 - p1|, second half is |q0 - q1|.
+ const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+ // This includes cases where NeedsFilter4() is not true and so Filter2() will
+ // not be applied.
+ const uint8x8_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
+
+ *needs_filter4_mask =
+ NeedsFilter4(p0p1_q0q1, p0q0, p1q1, inner_thresh, outer_thresh);
+
+ // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
+ *hev_mask = vand_u8(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// Calculate Filter4() or Filter2() based on |hev_mask|.
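+// In packed form the updates are p0' = p0 + a2, q0' = q0 - a1 and, where
+// Hev() is false at the call site, p1' = p1 + a3, q1' = q1 - a3. The
+// interleaves below pair each correction with its negation so one widening
+// add updates both halves at once.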
+inline void Filter4(const uint8x8_t q0p1, const uint8x8_t p0q1,
+ const uint8x8_t hev_mask, uint8x8_t* const p1q1_result,
+ uint8x8_t* const p0q0_result) {
+ const int16x4_t zero = vdup_n_s16(0);
+
+ // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+ const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubl_u8(q0p1, p0q1));
+ const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+ // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
+ const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+ const int8x8_t p1mq1_saturated = vqmovn_s16(vcombine_s16(p1mq1, zero));
+ const int8x8_t hev_option =
+ vand_s8(vreinterpret_s8_u8(hev_mask), p1mq1_saturated);
+
+ const int16x4_t a =
+ vget_low_s16(vaddw_s8(vcombine_s16(q0mp0_3, zero), hev_option));
+
+  // We cannot shift with rounding because the clamp comes *before* the
+  // shifting.
+  // a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  // a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+ const int16x4_t plus_four = vadd_s16(a, vdup_n_s16(4));
+ const int16x4_t plus_three = vadd_s16(a, vdup_n_s16(3));
+ const int8x8_t a2_a1 =
+ vshr_n_s8(vqmovn_s16(vcombine_s16(plus_three, plus_four)), 3);
+
+ // a3 is in the high 4 values.
+ // a3 = (a1 + 1) >> 1;
+ const int8x8_t a3 = vrshr_n_s8(a2_a1, 1);
+
+ const int16x8_t p0q1_l = vreinterpretq_s16_u16(vmovl_u8(p0q1));
+ const int16x8_t q0p1_l = vreinterpretq_s16_u16(vmovl_u8(q0p1));
+
+ const int16x8_t p1q1_l =
+ vcombine_s16(vget_high_s16(q0p1_l), vget_high_s16(p0q1_l));
+
+ const int8x8_t a3_ma3 = InterleaveHigh32(a3, vneg_s8(a3));
+ const int16x8_t p1q1_a3 = vaddw_s8(p1q1_l, a3_ma3);
+
+ const int16x8_t p0q0_l =
+ vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l));
+ // Need to shift the second term or we end up with a2_ma2.
+ const int8x8_t a2_ma1 =
+ InterleaveLow32(a2_a1, RightShiftVector<32>(vneg_s8(a2_a1)));
+ const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1);
+
+ *p1q1_result = vqmovun_s16(p1q1_a3);
+ *p0q0_result = vqmovun_s16(p0q0_a);
+}
+
+void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t p1_v = Load4(dst - 2 * stride);
+ const uint8x8_t p0_v = Load4(dst - stride);
+ const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+ const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+
+ uint8x8_t hev_mask;
+ uint8x8_t needs_filter4_mask;
+ Filter4Masks(p0q0, p1q1, hev_thresh, outer_thresh, inner_thresh, &hev_mask,
+ &needs_filter4_mask);
+
+ // Copy the masks to the high bits for packed comparisons later.
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+ needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u8(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+
+ // Already integrated the Hev mask when calculating the filtered values.
+ const uint8x8_t p0q0_output = vbsl_u8(needs_filter4_mask, f_p0q0, p0q0);
+
+ // p1/q1 are unmodified if only Hev() is true. This works because it was and'd
+ // with |needs_filter4_mask| previously.
+ const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask);
+ const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1);
+
+ StoreLo4(dst - 2 * stride, p1q1_output);
+ StoreLo4(dst - stride, p0q0_output);
+ StoreHi4(dst, p0q0_output);
+ StoreHi4(dst + stride, p1q1_output);
+}
+
+void Vertical4_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ // Move |dst| to the left side of the filter window.
+ dst -= 2;
+
+ // |p1q0| and |p0q1| are named for the values they will contain after the
+ // transpose.
+ const uint8x8_t row0 = Load4(dst);
+ uint8x8_t p1q0 = Load4<1>(dst + stride, row0);
+ const uint8x8_t row2 = Load4(dst + 2 * stride);
+ uint8x8_t p0q1 = Load4<1>(dst + 3 * stride, row2);
+
+ Transpose4x4(&p1q0, &p0q1);
+ // Rearrange.
+ const uint8x8x2_t p1q1xq0p0 = Interleave32(p1q0, Transpose32(p0q1));
+ const uint8x8x2_t p1q1xp0q0 = {p1q1xq0p0.val[0],
+ Transpose32(p1q1xq0p0.val[1])};
+
+ uint8x8_t hev_mask;
+ uint8x8_t needs_filter4_mask;
+ Filter4Masks(p1q1xp0q0.val[1], p1q1xp0q0.val[0], hev_thresh, outer_thresh,
+ inner_thresh, &hev_mask, &needs_filter4_mask);
+
+ // Copy the masks to the high bits for packed comparisons later.
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+ needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u8(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ Filter4(Transpose32(p1q0), p0q1, hev_mask, &f_p1q1, &f_p0q0);
+
+ // Already integrated the Hev mask when calculating the filtered values.
+ const uint8x8_t p0q0_output =
+ vbsl_u8(needs_filter4_mask, f_p0q0, p1q1xp0q0.val[1]);
+
+ // p1/q1 are unmodified if only Hev() is true. This works because it was and'd
+ // with |needs_filter4_mask| previously.
+ const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask);
+ const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1xp0q0.val[0]);
+
+ // Put things back in order to reverse the transpose.
+ const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output);
+ uint8x8_t output_0 = p1p0xq1q0.val[0],
+ output_1 = Transpose32(p1p0xq1q0.val[1]);
+
+ Transpose4x4(&output_0, &output_1);
+
+ StoreLo4(dst, output_0);
+ StoreLo4(dst + stride, output_1);
+ StoreHi4(dst + 2 * stride, output_0);
+ StoreHi4(dst + 3 * stride, output_1);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 1 for 8 bit decode.
+inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1,
+ const uint8x8_t abd_p0p2_q0q2) {
+ const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2);
+ const uint8x8_t b = vcle_u8(a, vdup_n_u8(1));
+ return vand_u8(b, RightShiftVector<32>(b));
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+// OuterThreshold()
+inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1,
+ const uint8x8_t abd_p1p2_q1q2,
+ const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t inner_thresh,
+ const uint8_t outer_thresh) {
+ const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh));
+ const uint8x8_t inner_mask = vand_u8(b, RightShiftVector<32>(b));
+ const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+ return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter6Masks(const uint8x8_t p2q2, const uint8x8_t p1q1,
+ const uint8x8_t p0q0, const uint8_t hev_thresh,
+ const uint8_t outer_thresh, const uint8_t inner_thresh,
+ uint8x8_t* const needs_filter6_mask,
+ uint8x8_t* const is_flat3_mask,
+ uint8x8_t* const hev_mask) {
+ const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+ *hev_mask = Hev(p0p1_q0q1, hev_thresh);
+ *is_flat3_mask = IsFlat3(p0p1_q0q1, vabd_u8(p0q0, p2q2));
+ *needs_filter6_mask = NeedsFilter6(p0p1_q0q1, vabd_u8(p1q1, p2q2), p0q0, p1q1,
+ inner_thresh, outer_thresh);
+}
+
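+// As with the longer filters, the tap sum is reused between outputs: the
+// p1/q1 sum has weight 8 (3 + 2 + 2 + 1), and converting it to the p0/q0 sum
+// subtracts 2 * p2 while adding q0 + q1 (mirrored for the q half), keeping
+// the weight at 8 for the shared >> 3 rounding shift.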
+inline void Filter6(const uint8x8_t p2q2, const uint8x8_t p1q1,
+ const uint8x8_t p0q0, uint8x8_t* const p1q1_output,
+ uint8x8_t* const p0q0_output) {
+ // Sum p1 and q1 output from opposite directions
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+ // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ const uint16x8_t p2q2_double = vaddl_u8(p2q2, p2q2);
+ uint16x8_t sum = vaddw_u8(p2q2_double, p2q2);
+
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+ // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p1q1, p1q1), sum);
+
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+ // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p0q0, p0q0), sum);
+
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^
+ // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^
+ const uint8x8_t q0p0 = Transpose32(p0q0);
+ sum = vaddw_u8(sum, q0p0);
+
+ *p1q1_output = vrshrn_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - (2 * p2) + q0 + q1
+ // q0 = q1 - (2 * q2) + p0 + p1
+ sum = vsubq_u16(sum, p2q2_double);
+ const uint8x8_t q1p1 = Transpose32(p1q1);
+ sum = vaddq_u16(vaddl_u8(q0p0, q1p1), sum);
+
+ *p0q0_output = vrshrn_n_u16(sum, 3);
+}
+
+void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t p2_v = Load4(dst - 3 * stride);
+ const uint8x8_t p1_v = Load4(dst - 2 * stride);
+ const uint8x8_t p0_v = Load4(dst - stride);
+ const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+ const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+ const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+
+ uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
+ Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter6_mask, &is_flat3_mask, &hev_mask);
+
+ needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
+ is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u8(needs_filter6_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f6_p1q1, f6_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) {
+ // Filter6() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f6_p1q1 = zero;
+ f6_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
+ StoreLo4(dst - 2 * stride, p1q1_output);
+ StoreHi4(dst + stride, p1q1_output);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
+ StoreLo4(dst - stride, p0q0_output);
+ StoreHi4(dst, p0q0_output);
+}
+
+void Vertical6_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ // Move |dst| to the left side of the filter window.
+ dst -= 3;
+
+ // |p2q1|, |p1q2|, |p0xx| and |q0xx| are named for the values they will
+ // contain after the transpose.
+ // These over-read by 2 bytes. We only need 6.
+ uint8x8_t p2q1 = vld1_u8(dst);
+ uint8x8_t p1q2 = vld1_u8(dst + stride);
+ uint8x8_t p0xx = vld1_u8(dst + 2 * stride);
+ uint8x8_t q0xx = vld1_u8(dst + 3 * stride);
+
+ Transpose8x4(&p2q1, &p1q2, &p0xx, &q0xx);
+
+ const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
+ const uint8x8_t p2q2 = p2q2xq1p1.val[0];
+ const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
+ const uint8x8_t p0q0 = InterleaveLow32(p0xx, q0xx);
+
+ uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
+ Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter6_mask, &is_flat3_mask, &hev_mask);
+
+ needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
+ is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u8(needs_filter6_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f6_p1q1, f6_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) {
+ // Filter6() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f6_p1q1 = zero;
+ f6_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
+
+  // The six-tap filter reads p2..q2 but writes only p1..q1, so shift the
+  // output window right by one pixel.
+ dst += 1;
+ // Put things back in order to reverse the transpose.
+ const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output);
+ uint8x8_t output_0 = p1p0xq1q0.val[0];
+ uint8x8_t output_1 = Transpose32(p1p0xq1q0.val[1]);
+
+ Transpose4x4(&output_0, &output_1);
+
+ StoreLo4(dst, output_0);
+ StoreLo4(dst + stride, output_1);
+ StoreHi4(dst + 2 * stride, output_0);
+ StoreHi4(dst + 3 * stride, output_1);
+}
+
+// Called with N=1 to compute |is_flat4_mask| and N=4 to compute
+// |is_flat_outer4_mask|.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 1 for 8 bit decode.
+inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0,
+ const uint8x8_t abd_p0n1_q0n1,
+ const uint8x8_t abd_p0n2_q0n2) {
+ const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1);
+ const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2);
+ const uint8x8_t c = vcle_u8(b, vdup_n_u8(1));
+ return vand_u8(c, RightShiftVector<32>(c));
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
+// OuterThreshold()
+inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1,
+ const uint8x8_t abd_p1p2_q1q2,
+ const uint8x8_t abd_p2p3_q2q3,
+ const uint8x8_t p0q0, const uint8x8_t p1q1,
+ const uint8_t inner_thresh,
+ const uint8_t outer_thresh) {
+ const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3);
+ const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh));
+ const uint8x8_t inner_mask = vand_u8(c, RightShiftVector<32>(c));
+ const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+ return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter8Masks(const uint8x8_t p3q3, const uint8x8_t p2q2,
+ const uint8x8_t p1q1, const uint8x8_t p0q0,
+ const uint8_t hev_thresh, const uint8_t outer_thresh,
+ const uint8_t inner_thresh,
+ uint8x8_t* const needs_filter8_mask,
+ uint8x8_t* const is_flat4_mask,
+ uint8x8_t* const hev_mask) {
+ const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+ *hev_mask = Hev(p0p1_q0q1, hev_thresh);
+ *is_flat4_mask = IsFlat4(p0p1_q0q1, vabd_u8(p0q0, p2q2), vabd_u8(p0q0, p3q3));
+ *needs_filter8_mask =
+ NeedsFilter8(p0p1_q0q1, vabd_u8(p1q1, p2q2), vabd_u8(p2q2, p3q3), p0q0,
+ p1q1, inner_thresh, outer_thresh);
+}
+
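+// The p2/q2 sum has weight 8 (3 + 2 + 1 + 1 + 1); each conversion below
+// swaps two unit taps out and two in, so the weight and the shared >> 3
+// rounding shift hold for all three outputs.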
+inline void Filter8(const uint8x8_t p3q3, const uint8x8_t p2q2,
+ const uint8x8_t p1q1, const uint8x8_t p0q0,
+ uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output,
+ uint8x8_t* const p0q0_output) {
+ // Sum p2 and q2 output from opposite directions.
+ // The formula is regrouped to allow 2 doubling operations to be combined.
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^^^^^^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^^^^^^^
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^
+ const uint16x8_t p23q23 = vaddl_u8(p3q3, p2q2);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^
+ uint16x8_t sum = vshlq_n_u16(p23q23, 1);
+
+ // Add two other terms to make dual issue with shift more likely.
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^
+ const uint16x8_t p01q01 = vaddl_u8(p0q0, p1q1);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^^^
+ sum = vaddq_u16(sum, p01q01);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^
+ sum = vaddw_u8(sum, p3q3);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^
+ const uint8x8_t q0p0 = Transpose32(p0q0);
+ sum = vaddw_u8(sum, q0p0);
+
+ *p2q2_output = vrshrn_n_u16(sum, 3);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p3 - p2 + p1 + q1
+  // q1 = q2 - q3 - q2 + q1 + p1
+ sum = vsubq_u16(sum, p23q23);
+ const uint8x8_t q1p1 = Transpose32(p1q1);
+ sum = vaddq_u16(sum, vaddl_u8(p1q1, q1p1));
+
+ *p1q1_output = vrshrn_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p3 - p1 + p0 + q2
+ // q0 = q1 - q3 - q1 + q0 + p2
+ sum = vsubq_u16(sum, vaddl_u8(p3q3, p1q1));
+ const uint8x8_t q2p2 = Transpose32(p2q2);
+ sum = vaddq_u16(sum, vaddl_u8(p0q0, q2p2));
+
+ *p0q0_output = vrshrn_n_u16(sum, 3);
+}
+
+void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t p3_v = Load4(dst - 4 * stride);
+ const uint8x8_t p2_v = Load4(dst - 3 * stride);
+ const uint8x8_t p1_v = Load4(dst - 2 * stride);
+ const uint8x8_t p0_v = Load4(dst - stride);
+ const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+ const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+ const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+ const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
+
+ uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+ needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+ is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+ is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u8(needs_filter8_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat4_mask) == 0) {
+ // Filter8() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f8_p2q2 = zero;
+ f8_p1q1 = zero;
+ f8_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+ const uint8x8_t p2p2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2);
+ StoreLo4(dst - 3 * stride, p2p2_output);
+ StoreHi4(dst + 2 * stride, p2p2_output);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+ StoreLo4(dst - 2 * stride, p1q1_output);
+ StoreHi4(dst + stride, p1q1_output);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+ StoreLo4(dst - stride, p0q0_output);
+ StoreHi4(dst, p0q0_output);
+}
+
+void Vertical8_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ // Move |dst| to the left side of the filter window.
+ dst -= 4;
+
+ // |p3q0|, |p2q1|, |p1q2| and |p0q3| are named for the values they will
+ // contain after the transpose.
+ uint8x8_t p3q0 = vld1_u8(dst);
+ uint8x8_t p2q1 = vld1_u8(dst + stride);
+ uint8x8_t p1q2 = vld1_u8(dst + 2 * stride);
+ uint8x8_t p0q3 = vld1_u8(dst + 3 * stride);
+
+ Transpose8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+ const uint8x8x2_t p3q3xq0p0 = Interleave32(p3q0, Transpose32(p0q3));
+ const uint8x8_t p3q3 = p3q3xq0p0.val[0];
+ const uint8x8_t p0q0 = Transpose32(p3q3xq0p0.val[1]);
+ const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
+ const uint8x8_t p2q2 = p2q2xq1p1.val[0];
+ const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
+
+ uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+ needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+ is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+ is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u8(needs_filter8_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat4_mask) == 0) {
+ // Filter8() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f8_p2q2 = zero;
+ f8_p1q1 = zero;
+ f8_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ // Always prepare and store p2/q2 because we need to transpose it anyway.
+ const uint8x8_t p2q2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2);
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+
+ // Write out p3/q3 as well. There isn't a good way to write out 6 bytes.
+ // Variable names reflect the values before transposition.
+ const uint8x8x2_t p3q0xq3p0_output =
+ Interleave32(p3q3, Transpose32(p0q0_output));
+ uint8x8_t p3q0_output = p3q0xq3p0_output.val[0];
+ uint8x8_t p0q3_output = Transpose32(p3q0xq3p0_output.val[1]);
+ const uint8x8x2_t p2q1xq2p1_output =
+ Interleave32(p2q2_output, Transpose32(p1q1_output));
+ uint8x8_t p2q1_output = p2q1xq2p1_output.val[0];
+ uint8x8_t p1q2_output = Transpose32(p2q1xq2p1_output.val[1]);
+
+ Transpose8x4(&p3q0_output, &p2q1_output, &p1q2_output, &p0q3_output);
+
+ vst1_u8(dst, p3q0_output);
+ vst1_u8(dst + stride, p2q1_output);
+ vst1_u8(dst + 2 * stride, p1q2_output);
+ vst1_u8(dst + 3 * stride, p0q3_output);
+}
+
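+// 8 bpp variant of the sliding-sum Filter14: vaddl_u8/vaddw_u8 widen the
+// 8-bit taps into a 16-bit accumulator (the maximum sum, 16 * 255, fits
+// comfortably), and vrshrn_n_u16 narrows back with rounding once per output.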
+inline void Filter14(const uint8x8_t p6q6, const uint8x8_t p5q5,
+ const uint8x8_t p4q4, const uint8x8_t p3q3,
+ const uint8x8_t p2q2, const uint8x8_t p1q1,
+ const uint8x8_t p0q0, uint8x8_t* const p5q5_output,
+ uint8x8_t* const p4q4_output, uint8x8_t* const p3q3_output,
+ uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output,
+ uint8x8_t* const p0q0_output) {
+ // Sum p5 and q5 output from opposite directions
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
+ uint16x8_t sum = vsubw_u8(vshll_n_u8(p6q6, 3), p6q6);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p5q5, p5q5), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p4q4, p4q4), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p3q3, p2q2), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddl_u8(p1q1, p0q0), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^
+ const uint8x8_t q0p0 = Transpose32(p0q0);
+ sum = vaddw_u8(sum, q0p0);
+
+ *p5q5_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p4 and q4 output:
+ // p4 = p5 - (2 * p6) + p3 + q1
+ // q4 = q5 - (2 * q6) + q3 + p1
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p6q6));
+ const uint8x8_t q1p1 = Transpose32(p1q1);
+ sum = vaddq_u16(vaddl_u8(p3q3, q1p1), sum);
+
+ *p4q4_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p3 and q3 output:
+ // p3 = p4 - p6 - p5 + p2 + q2
+ // q3 = q4 - q6 - q5 + q2 + p2
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p5q5));
+ const uint8x8_t q2p2 = Transpose32(p2q2);
+ sum = vaddq_u16(vaddl_u8(p2q2, q2p2), sum);
+
+ *p3q3_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p2 and q2 output:
+ // p2 = p3 - p6 - p4 + p1 + q3
+ // q2 = q3 - q6 - q4 + q1 + p3
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p4q4));
+ const uint8x8_t q3p3 = Transpose32(p3q3);
+ sum = vaddq_u16(vaddl_u8(p1q1, q3p3), sum);
+
+ *p2q2_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p6 - p3 + p0 + q4
+ // q1 = q2 - q6 - q3 + q0 + p4
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p3q3));
+ const uint8x8_t q4p4 = Transpose32(p4q4);
+ sum = vaddq_u16(vaddl_u8(p0q0, q4p4), sum);
+
+ *p1q1_output = vrshrn_n_u16(sum, 4);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p6 - p2 + q0 + q5
+ // q0 = q1 - q6 - q2 + p0 + p5
+ sum = vsubq_u16(sum, vaddl_u8(p6q6, p2q2));
+ const uint8x8_t q5p5 = Transpose32(p5q5);
+ sum = vaddq_u16(vaddl_u8(q0p0, q5p5), sum);
+
+ *p0q0_output = vrshrn_n_u16(sum, 4);
+}
+
+void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ const uint8x8_t p6_v = Load4(dst - 7 * stride);
+ const uint8x8_t p5_v = Load4(dst - 6 * stride);
+ const uint8x8_t p4_v = Load4(dst - 5 * stride);
+ const uint8x8_t p3_v = Load4(dst - 4 * stride);
+ const uint8x8_t p2_v = Load4(dst - 3 * stride);
+ const uint8x8_t p1_v = Load4(dst - 2 * stride);
+ const uint8x8_t p0_v = Load4(dst - stride);
+ const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+ const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+ const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+ const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
+ const uint8x8_t p4q4 = Load4<1>(dst + 4 * stride, p4_v);
+ const uint8x8_t p5q5 = Load4<1>(dst + 5 * stride, p5_v);
+ const uint8x8_t p6q6 = Load4<1>(dst + 6 * stride, p6_v);
+
+ uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+ needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+ is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+ is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u8(needs_filter8_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Decide between Filter8() and Filter14().
+ uint8x8_t is_flat_outer4_mask =
+ IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
+ is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
+ is_flat_outer4_mask =
+ InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
+
+ uint8x8_t f_p1q1;
+ uint8x8_t f_p0q0;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t f8_p1q1, f8_p0q0;
+ uint8x8_t f14_p2q2, f14_p1q1, f14_p0q0;
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat4_mask) == 0) {
+ // Filter8() and Filter14() do not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f8_p1q1 = zero;
+ f8_p0q0 = zero;
+ f14_p1q1 = zero;
+ f14_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ uint8x8_t f8_p2q2;
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat_outer4_mask) == 0) {
+ // Filter14() does not apply.
+ const uint8x8_t zero = vdup_n_u8(0);
+ f14_p2q2 = zero;
+ f14_p1q1 = zero;
+ f14_p0q0 = zero;
+ } else {
+#endif // defined(__aarch64__)
+ uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3;
+ Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+
+ const uint8x8_t p5q5_output =
+ vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5);
+ StoreLo4(dst - 6 * stride, p5q5_output);
+ StoreHi4(dst + 5 * stride, p5q5_output);
+
+ const uint8x8_t p4q4_output =
+ vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4);
+ StoreLo4(dst - 5 * stride, p4q4_output);
+ StoreHi4(dst + 4 * stride, p4q4_output);
+
+ const uint8x8_t p3q3_output =
+ vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3);
+ StoreLo4(dst - 4 * stride, p3q3_output);
+ StoreHi4(dst + 3 * stride, p3q3_output);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2);
+ StoreLo4(dst - 3 * stride, p2q2_output);
+ StoreHi4(dst + 2 * stride, p2q2_output);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ uint8x8_t p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+ StoreLo4(dst - 2 * stride, p1q1_output);
+ StoreHi4(dst + stride, p1q1_output);
+
+ uint8x8_t p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+ StoreLo4(dst - stride, p0q0_output);
+ StoreHi4(dst, p0q0_output);
+}
+
+void Vertical14_NEON(void* const dest, const ptrdiff_t stride,
+ const int outer_thresh, const int inner_thresh,
+ const int hev_thresh) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ dst -= 8;
+ // input
+ // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ const uint8x16_t x0 = vld1q_u8(dst);
+ dst += stride;
+ const uint8x16_t x1 = vld1q_u8(dst);
+ dst += stride;
+ const uint8x16_t x2 = vld1q_u8(dst);
+ dst += stride;
+ const uint8x16_t x3 = vld1q_u8(dst);
+ dst -= (stride * 3);
+
+ // re-order input
+#if defined(__aarch64__)
+ const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
+ const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
+ const uint8x16_t index_qp7toqp0 = vcombine_u8(index_qp3toqp0, index_qp7toqp4);
+
+ uint8x16_t input_0 = vqtbl1q_u8(x0, index_qp7toqp0);
+ uint8x16_t input_1 = vqtbl1q_u8(x1, index_qp7toqp0);
+ uint8x16_t input_2 = vqtbl1q_u8(x2, index_qp7toqp0);
+ uint8x16_t input_3 = vqtbl1q_u8(x3, index_qp7toqp0);
+#else
+ const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
+ const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
+
+ const uint8x8_t x0_qp3qp0 = VQTbl1U8(x0, index_qp3toqp0);
+ const uint8x8_t x1_qp3qp0 = VQTbl1U8(x1, index_qp3toqp0);
+ const uint8x8_t x2_qp3qp0 = VQTbl1U8(x2, index_qp3toqp0);
+ const uint8x8_t x3_qp3qp0 = VQTbl1U8(x3, index_qp3toqp0);
+
+ const uint8x8_t x0_qp7qp4 = VQTbl1U8(x0, index_qp7toqp4);
+ const uint8x8_t x1_qp7qp4 = VQTbl1U8(x1, index_qp7toqp4);
+ const uint8x8_t x2_qp7qp4 = VQTbl1U8(x2, index_qp7toqp4);
+ const uint8x8_t x3_qp7qp4 = VQTbl1U8(x3, index_qp7toqp4);
+
+ const uint8x16_t input_0 = vcombine_u8(x0_qp3qp0, x0_qp7qp4);
+ const uint8x16_t input_1 = vcombine_u8(x1_qp3qp0, x1_qp7qp4);
+ const uint8x16_t input_2 = vcombine_u8(x2_qp3qp0, x2_qp7qp4);
+ const uint8x16_t input_3 = vcombine_u8(x3_qp3qp0, x3_qp7qp4);
+#endif  // defined(__aarch64__)
+ // input after re-order
+ // p0 p1 p2 p3 q0 q1 q2 q3 p4 p5 p6 p7 q4 q5 q6 q7
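+  // The table indices read least significant byte first, so
+  // 0x0b0a090804050607 selects source bytes 7 6 5 4 8 9 10 11, i.e.
+  // p0 p1 p2 p3 q0 q1 q2 q3, and 0x0f0e0d0c00010203 selects bytes
+  // 3 2 1 0 12 13 14 15, i.e. p4 p5 p6 p7 q4 q5 q6 q7.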
+
+ const uint8x16x2_t in01 = vtrnq_u8(input_0, input_1);
+ const uint8x16x2_t in23 = vtrnq_u8(input_2, input_3);
+ const uint16x8x2_t in02 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[0]),
+ vreinterpretq_u16_u8(in23.val[0]));
+ const uint16x8x2_t in13 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[1]),
+ vreinterpretq_u16_u8(in23.val[1]));
+
+ const uint8x8_t p0q0 = vget_low_u8(vreinterpretq_u8_u16(in02.val[0]));
+ const uint8x8_t p1q1 = vget_low_u8(vreinterpretq_u8_u16(in13.val[0]));
+
+ const uint8x8_t p2q2 = vget_low_u8(vreinterpretq_u8_u16(in02.val[1]));
+ const uint8x8_t p3q3 = vget_low_u8(vreinterpretq_u8_u16(in13.val[1]));
+
+ const uint8x8_t p4q4 = vget_high_u8(vreinterpretq_u8_u16(in02.val[0]));
+ const uint8x8_t p5q5 = vget_high_u8(vreinterpretq_u8_u16(in13.val[0]));
+
+ const uint8x8_t p6q6 = vget_high_u8(vreinterpretq_u8_u16(in02.val[1]));
+ const uint8x8_t p7q7 = vget_high_u8(vreinterpretq_u8_u16(in13.val[1]));
+
+ uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+ &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+ needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+ is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+ is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+ hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u8(needs_filter8_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Decide between Filter8() and Filter14().
+ uint8x8_t is_flat_outer4_mask =
+ IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
+ is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
+ is_flat_outer4_mask =
+ InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
+
+ uint8x8_t f_p0q0, f_p1q1;
+ const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+ Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+ // Reset the outer values if only a Hev() mask was required.
+ f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+ uint8x8_t p1q1_output, p0q0_output;
+ uint8x8_t p5q5_output, p4q4_output, p3q3_output, p2q2_output;
+
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat4_mask) == 0) {
+ // Filter8() and Filter14() do not apply.
+ p1q1_output = p1q1;
+ p0q0_output = p0q0;
+
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ } else {
+#endif // defined(__aarch64__)
+ uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+ if (vaddv_u8(is_flat_outer4_mask) == 0) {
+ // Filter14() does not apply.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = f8_p2q2;
+ p1q1_output = f8_p1q1;
+ p0q0_output = f8_p0q0;
+ } else {
+#endif // defined(__aarch64__)
+ uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+
+ p5q5_output = vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5);
+ p4q4_output = vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4);
+ p3q3_output = vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3);
+ p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2);
+ p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1);
+ p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+ p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2);
+#if defined(__aarch64__)
+ }
+#endif // defined(__aarch64__)
+
+ p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1);
+ p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+ p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0);
+ p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+
+ const uint8x16_t p0q0_p4q4 = vcombine_u8(p0q0_output, p4q4_output);
+ const uint8x16_t p2q2_p6q6 = vcombine_u8(p2q2_output, p6q6);
+ const uint8x16_t p1q1_p5q5 = vcombine_u8(p1q1_output, p5q5_output);
+ const uint8x16_t p3q3_p7q7 = vcombine_u8(p3q3_output, p7q7);
+
+ const uint16x8x2_t out02 = vtrnq_u16(vreinterpretq_u16_u8(p0q0_p4q4),
+ vreinterpretq_u16_u8(p2q2_p6q6));
+ const uint16x8x2_t out13 = vtrnq_u16(vreinterpretq_u16_u8(p1q1_p5q5),
+ vreinterpretq_u16_u8(p3q3_p7q7));
+ const uint8x16x2_t out01 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[0]),
+ vreinterpretq_u8_u16(out13.val[0]));
+ const uint8x16x2_t out23 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[1]),
+ vreinterpretq_u8_u16(out13.val[1]));
+
+#if defined(__aarch64__)
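+ // Shuffle the filtered values from pN/qN order back to raster order before
+ // storing the four output rows.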
+ const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b);
+ const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504);
+ const uint8x16_t index_p7toq7 = vcombine_u8(index_p7top0, index_q7toq0);
+
+ const uint8x16_t output_0 = vqtbl1q_u8(out01.val[0], index_p7toq7);
+ const uint8x16_t output_1 = vqtbl1q_u8(out01.val[1], index_p7toq7);
+ const uint8x16_t output_2 = vqtbl1q_u8(out23.val[0], index_p7toq7);
+ const uint8x16_t output_3 = vqtbl1q_u8(out23.val[1], index_p7toq7);
+#else
+ const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b);
+ const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504);
+
+ const uint8x8_t x0_p7p0 = VQTbl1U8(out01.val[0], index_p7top0);
+ const uint8x8_t x1_p7p0 = VQTbl1U8(out01.val[1], index_p7top0);
+ const uint8x8_t x2_p7p0 = VQTbl1U8(out23.val[0], index_p7top0);
+ const uint8x8_t x3_p7p0 = VQTbl1U8(out23.val[1], index_p7top0);
+
+ const uint8x8_t x0_q7q0 = VQTbl1U8(out01.val[0], index_q7toq0);
+ const uint8x8_t x1_q7q0 = VQTbl1U8(out01.val[1], index_q7toq0);
+ const uint8x8_t x2_q7q0 = VQTbl1U8(out23.val[0], index_q7toq0);
+ const uint8x8_t x3_q7q0 = VQTbl1U8(out23.val[1], index_q7toq0);
+
+ const uint8x16_t output_0 = vcombine_u8(x0_p7p0, x0_q7q0);
+ const uint8x16_t output_1 = vcombine_u8(x1_p7p0, x1_q7q0);
+ const uint8x16_t output_2 = vcombine_u8(x2_p7p0, x2_q7q0);
+ const uint8x16_t output_3 = vcombine_u8(x3_p7p0, x3_q7q0);
+#endif
+
+ vst1q_u8(dst, output_0);
+ dst += stride;
+ vst1q_u8(dst, output_1);
+ dst += stride;
+ vst1q_u8(dst, output_2);
+ dst += stride;
+ vst1q_u8(dst, output_3);
+}
+
+} // namespace
+
+void LoopFilterInit_NEON() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Horizontal4_NEON;
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
+
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Horizontal6_NEON;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
+
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Horizontal8_NEON;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
+
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Horizontal14_NEON;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Vertical14_NEON;
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters, see the defines below for specifics. This
+// function is not thread-safe.
+void LoopFilterInit_NEON();
+void LoopFilterInit10bpp_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical \
+ LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// Wiener
+
+// Make a local copy of the filter coefficients so the compiler knows they
+// cannot alias other buffers; the 'const' keyword alone is not enough. In
+// practice no copy is emitted, since there are enough registers in this case.
+inline void PopulateWienerCoefficients(
+ const RestorationUnitInfo& restoration_info, const int direction,
+ int16_t filter[4]) {
+ for (int i = 0; i < 4; ++i) {
+ filter[i] = restoration_info.wiener_info.filter[direction][i];
+ }
+}
+
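+// The Wiener filter taps are symmetric, so a mirrored pair of samples shares
+// one tap; summing the pair first halves the number of multiplies.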
+inline int32x4x2_t WienerHorizontal2(const uint16x8_t s0, const uint16x8_t s1,
+ const int16_t filter,
+ const int32x4x2_t sum) {
+ const int16x8_t ss = vreinterpretq_s16_u16(vaddq_u16(s0, s1));
+ int32x4x2_t res;
+ res.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(ss), filter);
+ res.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(ss), filter);
+ return res;
+}
+
+inline void WienerHorizontalSum(const uint16x8_t s[3], const int16_t filter[4],
+ int32x4x2_t sum, int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (kBitdepth10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
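+ // For 10 bpp these evaluate to offset == 1 << 13 and limit == 32767, so the
+ // clamps below keep the stored sums within [-8192, 24575].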
+ const int16x8_t s_0_2 = vreinterpretq_s16_u16(vaddq_u16(s[0], s[2]));
+ const int16x8_t s_1 = vreinterpretq_s16_u16(s[1]);
+ int16x4x2_t sum16;
+ sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(s_0_2), filter[2]);
+ sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(s_1), filter[3]);
+ sum16.val[0] = vqshrn_n_s32(sum.val[0], kInterRoundBitsHorizontal);
+ sum16.val[0] = vmax_s16(sum16.val[0], vdup_n_s16(-offset));
+ sum16.val[0] = vmin_s16(sum16.val[0], vdup_n_s16(limit - offset));
+ vst1_s16(wiener_buffer, sum16.val[0]);
+ sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(s_0_2), filter[2]);
+ sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(s_1), filter[3]);
+ sum16.val[1] = vqshrn_n_s32(sum.val[1], kInterRoundBitsHorizontal);
+ sum16.val[1] = vmax_s16(sum16.val[1], vdup_n_s16(-offset));
+ sum16.val[1] = vmin_s16(sum16.val[1], vdup_n_s16(limit - offset));
+ vst1_s16(wiener_buffer + 4, sum16.val[1]);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t wiener_stride,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4],
+ int16_t** const wiener_buffer) {
+ const ptrdiff_t src_width =
+ width + ((kRestorationHorizontalBorder - 1) * sizeof(*src));
+ for (int y = height; y != 0; --y) {
+ const uint16_t* src_ptr = src;
+ uint16x8_t s[8];
+ s[0] = vld1q_u16(src_ptr);
+ ptrdiff_t x = wiener_stride;
+ ptrdiff_t valid_bytes = src_width * 2;
+ do {
+ src_ptr += 8;
+ valid_bytes -= 16;
+ s[7] = Load1QMsanU16(src_ptr, 16 - valid_bytes);
+ s[1] = vextq_u16(s[0], s[7], 1);
+ s[2] = vextq_u16(s[0], s[7], 2);
+ s[3] = vextq_u16(s[0], s[7], 3);
+ s[4] = vextq_u16(s[0], s[7], 4);
+ s[5] = vextq_u16(s[0], s[7], 5);
+ s[6] = vextq_u16(s[0], s[7], 6);
+ int32x4x2_t sum;
+ sum.val[0] = sum.val[1] =
+ vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 1));
+ sum = WienerHorizontal2(s[0], s[6], filter[0], sum);
+ sum = WienerHorizontal2(s[1], s[5], filter[1], sum);
+ WienerHorizontalSum(s + 2, filter, sum, *wiener_buffer);
+ s[0] = s[7];
+ *wiener_buffer += 8;
+ x -= 8;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t wiener_stride,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4],
+ int16_t** const wiener_buffer) {
+ const ptrdiff_t src_width =
+ width + ((kRestorationHorizontalBorder - 1) * sizeof(*src));
+ for (int y = height; y != 0; --y) {
+ const uint16_t* src_ptr = src;
+ uint16x8_t s[6];
+ s[0] = vld1q_u16(src_ptr);
+ ptrdiff_t x = wiener_stride;
+ ptrdiff_t valid_bytes = src_width * 2;
+ do {
+ src_ptr += 8;
+ valid_bytes -= 16;
+ s[5] = Load1QMsanU16(src_ptr, 16 - valid_bytes);
+ s[1] = vextq_u16(s[0], s[5], 1);
+ s[2] = vextq_u16(s[0], s[5], 2);
+ s[3] = vextq_u16(s[0], s[5], 3);
+ s[4] = vextq_u16(s[0], s[5], 4);
+
+ int32x4x2_t sum;
+ sum.val[0] = sum.val[1] =
+ vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 1));
+ sum = WienerHorizontal2(s[0], s[4], filter[1], sum);
+ WienerHorizontalSum(s + 1, filter, sum, *wiener_buffer);
+ s[0] = s[5];
+ *wiener_buffer += 8;
+ x -= 8;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4],
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ const uint16_t* src_ptr = src;
+ uint16x8_t s[3];
+ ptrdiff_t x = width;
+ do {
+ s[0] = vld1q_u16(src_ptr);
+ s[1] = vld1q_u16(src_ptr + 1);
+ s[2] = vld1q_u16(src_ptr + 2);
+
+ int32x4x2_t sum;
+ sum.val[0] = sum.val[1] =
+ vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 1));
+ WienerHorizontalSum(s, filter, sum, *wiener_buffer);
+ src_ptr += 8;
+ *wiener_buffer += 8;
+ x -= 8;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const uint16x8_t s = vld1q_u16(src + x);
+ const int16x8_t d = vreinterpretq_s16_u16(vshlq_n_u16(s, 4));
+ vst1q_s16(*wiener_buffer + x, d);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline int32x4x2_t WienerVertical2(const int16x8_t a0, const int16x8_t a1,
+ const int16_t filter,
+ const int32x4x2_t sum) {
+ int32x4x2_t d;
+ d.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(a0), filter);
+ d.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(a0), filter);
+ d.val[0] = vmlal_n_s16(d.val[0], vget_low_s16(a1), filter);
+ d.val[1] = vmlal_n_s16(d.val[1], vget_high_s16(a1), filter);
+ return d;
+}
+
+inline uint16x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[4],
+ const int32x4x2_t sum) {
+ int32x4x2_t d = WienerVertical2(a[0], a[2], filter[2], sum);
+ d.val[0] = vmlal_n_s16(d.val[0], vget_low_s16(a[1]), filter[3]);
+ d.val[1] = vmlal_n_s16(d.val[1], vget_high_s16(a[1]), filter[3]);
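+ // Round and narrow by kInterRoundBitsVertical (11), saturating to unsigned
+ // 16 bits.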
+ const uint16x4_t sum_lo_16 = vqrshrun_n_s32(d.val[0], 11);
+ const uint16x4_t sum_hi_16 = vqrshrun_n_s32(d.val[1], 11);
+ return vcombine_u16(sum_lo_16, sum_hi_16);
+}
+
+inline uint16x8_t WienerVerticalTap7Kernel(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4],
+ int16x8_t a[7]) {
+ int32x4x2_t sum;
+ a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+ a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+ a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride);
+ a[6] = vld1q_s16(wiener_buffer + 6 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[0], a[6], filter[0], sum);
+ sum = WienerVertical2(a[1], a[5], filter[1], sum);
+ a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+ a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+ a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride);
+ return WienerVertical(a + 2, filter, sum);
+}
+
+inline uint16x8x2_t WienerVerticalTap7Kernel2(
+ const int16_t* const wiener_buffer, const ptrdiff_t wiener_stride,
+ const int16_t filter[4]) {
+ int16x8_t a[8];
+ int32x4x2_t sum;
+ uint16x8x2_t d;
+ d.val[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = vld1q_s16(wiener_buffer + 7 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[1], a[7], filter[0], sum);
+ sum = WienerVertical2(a[2], a[6], filter[1], sum);
+ d.val[1] = WienerVertical(a + 3, filter, sum);
+ return d;
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+ for (int y = height >> 1; y != 0; --y) {
+ uint16_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ uint16x8x2_t d[2];
+ d[0] = WienerVerticalTap7Kernel2(wiener_buffer + 0, width, filter);
+ d[1] = WienerVerticalTap7Kernel2(wiener_buffer + 8, width, filter);
+ vst1q_u16(dst_ptr, vminq_u16(d[0].val[0], v_max_bitdepth));
+ vst1q_u16(dst_ptr + 8, vminq_u16(d[1].val[0], v_max_bitdepth));
+ vst1q_u16(dst_ptr + dst_stride, vminq_u16(d[0].val[1], v_max_bitdepth));
+ vst1q_u16(dst_ptr + 8 + dst_stride,
+ vminq_u16(d[1].val[1], v_max_bitdepth));
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ int16x8_t a[7];
+ const uint16x8_t d0 =
+ WienerVerticalTap7Kernel(wiener_buffer + 0, width, filter, a);
+ const uint16x8_t d1 =
+ WienerVerticalTap7Kernel(wiener_buffer + 8, width, filter, a);
+ vst1q_u16(dst, vminq_u16(d0, v_max_bitdepth));
+ vst1q_u16(dst + 8, vminq_u16(d1, v_max_bitdepth));
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
+inline uint16x8_t WienerVerticalTap5Kernel(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4],
+ int16x8_t a[5]) {
+ a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+ a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+ a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+ a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+ a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride);
+ int32x4x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[0], a[4], filter[1], sum);
+ return WienerVertical(a + 1, filter, sum);
+}
+
+inline uint16x8x2_t WienerVerticalTap5Kernel2(
+ const int16_t* const wiener_buffer, const ptrdiff_t wiener_stride,
+ const int16_t filter[4]) {
+ int16x8_t a[6];
+ int32x4x2_t sum;
+ uint16x8x2_t d;
+ d.val[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[1], a[5], filter[1], sum);
+ d.val[1] = WienerVertical(a + 2, filter, sum);
+ return d;
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+ for (int y = height >> 1; y != 0; --y) {
+ uint16_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ uint16x8x2_t d[2];
+ d[0] = WienerVerticalTap5Kernel2(wiener_buffer + 0, width, filter);
+ d[1] = WienerVerticalTap5Kernel2(wiener_buffer + 8, width, filter);
+ vst1q_u16(dst_ptr, vminq_u16(d[0].val[0], v_max_bitdepth));
+ vst1q_u16(dst_ptr + 8, vminq_u16(d[1].val[0], v_max_bitdepth));
+ vst1q_u16(dst_ptr + dst_stride, vminq_u16(d[0].val[1], v_max_bitdepth));
+ vst1q_u16(dst_ptr + 8 + dst_stride,
+ vminq_u16(d[1].val[1], v_max_bitdepth));
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ int16x8_t a[5];
+ const uint16x8_t d0 =
+ WienerVerticalTap5Kernel(wiener_buffer + 0, width, filter, a);
+ const uint16x8_t d1 =
+ WienerVerticalTap5Kernel(wiener_buffer + 8, width, filter, a);
+ vst1q_u16(dst, vminq_u16(d0, v_max_bitdepth));
+ vst1q_u16(dst + 8, vminq_u16(d1, v_max_bitdepth));
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
+inline uint16x8_t WienerVerticalTap3Kernel(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4],
+ int16x8_t a[3]) {
+ a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+ a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+ a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+ int32x4x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ return WienerVertical(a, filter, sum);
+}
+
+inline uint16x8x2_t WienerVerticalTap3Kernel2(
+ const int16_t* const wiener_buffer, const ptrdiff_t wiener_stride,
+ const int16_t filter[4]) {
+ int16x8_t a[4];
+ int32x4x2_t sum;
+ uint16x8x2_t d;
+ d.val[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ d.val[1] = WienerVertical(a + 1, filter, sum);
+ return d;
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+
+ for (int y = height >> 1; y != 0; --y) {
+ uint16_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ uint16x8x2_t d[2];
+ d[0] = WienerVerticalTap3Kernel2(wiener_buffer + 0, width, filter);
+ d[1] = WienerVerticalTap3Kernel2(wiener_buffer + 8, width, filter);
+
+ vst1q_u16(dst_ptr, vminq_u16(d[0].val[0], v_max_bitdepth));
+ vst1q_u16(dst_ptr + 8, vminq_u16(d[1].val[0], v_max_bitdepth));
+ vst1q_u16(dst_ptr + dst_stride, vminq_u16(d[0].val[1], v_max_bitdepth));
+ vst1q_u16(dst_ptr + 8 + dst_stride,
+ vminq_u16(d[1].val[1], v_max_bitdepth));
+
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ int16x8_t a[3];
+ const uint16x8_t d0 =
+ WienerVerticalTap3Kernel(wiener_buffer + 0, width, filter, a);
+ const uint16x8_t d1 =
+ WienerVerticalTap3Kernel(wiener_buffer + 8, width, filter, a);
+ vst1q_u16(dst, vminq_u16(d0, v_max_bitdepth));
+ vst1q_u16(dst + 8, vminq_u16(d1, v_max_bitdepth));
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint16_t* const dst) {
+ const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+ const int16x8_t a0 = vld1q_s16(wiener_buffer + 0);
+ const int16x8_t a1 = vld1q_s16(wiener_buffer + 8);
+ const int16x8_t d0 = vrshrq_n_s16(a0, 4);
+ const int16x8_t d1 = vrshrq_n_s16(a1, 4);
+ vst1q_u16(dst, vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(d0, vdupq_n_s16(0))),
+ v_max_bitdepth));
+ vst1q_u16(dst + 8,
+ vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(d1, vdupq_n_s16(0))),
+ v_max_bitdepth));
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint16_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y != 0; --y) {
+ uint16_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer, dst_ptr);
+ WienerVerticalTap1Kernel(wiener_buffer + width, dst_ptr + dst_stride);
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer, dst);
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
+// For width 16 and up, store the horizontal results, and then do the vertical
+// filter row by row. This is faster than doing it column by column because of
+// cache behavior.
+void WienerFilter_NEON(
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+ int16_t filter_horizontal[(kWienerFilterTaps + 1) / 2];
+ int16_t filter_vertical[(kWienerFilterTaps + 1) / 2];
+ PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal,
+ filter_horizontal);
+ PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical,
+ filter_vertical);
+ // Horizontal filtering.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, width, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, width, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride, width,
+ height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, width, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, width, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, width,
+ height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
+ }
+
+ // Vertical filtering.
+ auto* dst = static_cast<uint16_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+ // Because the top row of |source| is a duplicate of the second row, and the
+ // bottom row of |source| is a duplicate of the row above it, we can
+ // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 4;
+constexpr int kOverreadInBytesPass2 = 8;
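+// In the worst case (width % 8 == 0) that is 8 - 2 * 3 = 2 pixels (4 bytes)
+// for Pass 1 and 8 - 2 * 2 = 4 pixels (8 bytes) for Pass 2, matching the
+// constants above.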
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ uint16x8_t dst[2]) {
+ dst[0] = vld1q_u16(src[0] + x);
+ dst[1] = vld1q_u16(src[1] + x);
+}
+
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ uint16x8_t dst[2]) {
+ dst[0] = Load1QMsanU16(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = Load1QMsanU16(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ uint16x8_t dst[3]) {
+ dst[0] = vld1q_u16(src[0] + x);
+ dst[1] = vld1q_u16(src[1] + x);
+ dst[2] = vld1q_u16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ uint16x8_t dst[3]) {
+ dst[0] = Load1QMsanU16(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = Load1QMsanU16(src[1] + x, sizeof(**src) * (x + 8 - border));
+ dst[2] = Load1QMsanU16(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, uint32x4_t dst[2]) {
+ dst[0] = vld1q_u32(src + 0);
+ dst[1] = vld1q_u32(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+ const ptrdiff_t border, uint32x4_t dst[2]) {
+ dst[0] = Load1QMsanU32(src + x + 0, sizeof(*src) * (x + 4 - border));
+ dst[1] = Load1QMsanU32(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ uint32x4_t dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ uint32x4_t dst[2][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ uint32x4_t dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ uint32x4_t dst[3][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+ LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const uint16x8_t src[2]) {
+ vst1q_u16(dst + 0, src[0]);
+ vst1q_u16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const uint32x4_t src[2]) {
+ vst1q_u32(dst + 0, src[0]);
+ vst1q_u32(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const uint32x4_t src[4]) {
+ StoreAligned32U32(dst + 0, src + 0);
+ StoreAligned32U32(dst + 8, src + 2);
+}
+
+inline uint16x8_t VaddwLo8(const uint16x8_t src0, const uint8x16_t src1) {
+ const uint8x8_t s1 = vget_low_u8(src1);
+ return vaddw_u8(src0, s1);
+}
+
+inline uint16x8_t VaddwHi8(const uint16x8_t src0, const uint8x16_t src1) {
+ const uint8x8_t s1 = vget_high_u8(src1);
+ return vaddw_u8(src0, s1);
+}
+
+inline uint32x4_t VmullLo16(const uint16x8_t src0, const uint16x8_t src1) {
+ return vmull_u16(vget_low_u16(src0), vget_low_u16(src1));
+}
+
+inline uint32x4_t VmullHi16(const uint16x8_t src0, const uint16x8_t src1) {
+ return vmull_u16(vget_high_u16(src0), vget_high_u16(src1));
+}
+
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8x2_t src) {
+ return vext_u8(src.val[0], src.val[1], bytes);
+}
+
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8_t src[2]) {
+ return vext_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint8x16_t VshrU128(const uint8x16_t src[2]) {
+ return vextq_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8x2_t src) {
+ return vextq_u16(src.val[0], src.val[1], bytes / 2);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8_t src[2]) {
+ return vextq_u16(src[0], src[1], bytes / 2);
+}
+
+inline uint32x4_t Square(uint16x4_t s) { return vmull_u16(s, s); }
+
+inline void Square(const uint16x8_t src, uint32x4_t dst[2]) {
+ const uint16x4_t s_lo = vget_low_u16(src);
+ const uint16x4_t s_hi = vget_high_u16(src);
+ dst[0] = Square(s_lo);
+ dst[1] = Square(s_hi);
+}
+
+template <int offset>
+inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) {
+ dst[0] = VshrU128<offset + 0>(src);
+ dst[1] = VshrU128<offset + 1>(src);
+ dst[2] = VshrU128<offset + 2>(src);
+}
+
+inline void Prepare3_16(const uint16x8_t src[2], uint16x8_t dst[3]) {
+ dst[0] = src[0];
+ dst[1] = vextq_u16(src[0], src[1], 1);
+ dst[2] = vextq_u16(src[0], src[1], 2);
+}
+
+template <int offset>
+inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) {
+ dst[0] = VshrU128<offset + 0>(src);
+ dst[1] = VshrU128<offset + 1>(src);
+ dst[2] = VshrU128<offset + 2>(src);
+ dst[3] = VshrU128<offset + 3>(src);
+ dst[4] = VshrU128<offset + 4>(src);
+}
+
+inline void Prepare5_16(const uint16x8_t src[2], uint16x8_t dst[5]) {
+ dst[0] = src[0];
+ dst[1] = vextq_u16(src[0], src[1], 1);
+ dst[2] = vextq_u16(src[0], src[1], 2);
+ dst[3] = vextq_u16(src[0], src[1], 3);
+ dst[4] = vextq_u16(src[0], src[1], 4);
+}
+
+inline void Prepare3_32(const uint32x4_t src[2], uint32x4_t dst[3]) {
+ dst[0] = src[0];
+ dst[1] = vextq_u32(src[0], src[1], 1);
+ dst[2] = vextq_u32(src[0], src[1], 2);
+}
+
+inline void Prepare5_32(const uint32x4_t src[2], uint32x4_t dst[5]) {
+ Prepare3_32(src, dst);
+ dst[3] = vextq_u32(src[0], src[1], 3);
+ dst[4] = src[1];
+}
+
+inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+ return vaddw_u8(sum, vget_low_u8(src[2]));
+}
+
+inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+ return vaddw_u8(sum, vget_high_u8(src[2]));
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src0, const uint16x8_t src1,
+ const uint16x8_t src2) {
+ const uint16x8_t sum = vaddq_u16(src0, src1);
+ return vaddq_u16(sum, src2);
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline uint32x4_t Sum3_32(const uint32x4_t src0, const uint32x4_t src1,
+ const uint32x4_t src2) {
+ const uint32x4_t sum = vaddq_u32(src0, src1);
+ return vaddq_u32(sum, src2);
+}
+
+inline uint32x4_t Sum3_32(const uint32x4_t src[3]) {
+ return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline void Sum3_32(const uint32x4_t src[3][2], uint32x4_t dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline uint16x8_t Sum5_16(const uint16x8_t src[5]) {
+ const uint16x8_t sum01 = vaddq_u16(src[0], src[1]);
+ const uint16x8_t sum23 = vaddq_u16(src[2], src[3]);
+ const uint16x8_t sum = vaddq_u16(sum01, sum23);
+ return vaddq_u16(sum, src[4]);
+}
+
+inline uint32x4_t Sum5_32(const uint32x4_t* src0, const uint32x4_t* src1,
+ const uint32x4_t* src2, const uint32x4_t* src3,
+ const uint32x4_t* src4) {
+ const uint32x4_t sum01 = vaddq_u32(*src0, *src1);
+ const uint32x4_t sum23 = vaddq_u32(*src2, *src3);
+ const uint32x4_t sum = vaddq_u32(sum01, sum23);
+ return vaddq_u32(sum, *src4);
+}
+
+inline uint32x4_t Sum5_32(const uint32x4_t src[5]) {
+ return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline void Sum5_32(const uint32x4_t src[5][2], uint32x4_t dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline uint16x8_t Sum3Horizontal16(const uint16x8_t src[2]) {
+ uint16x8_t s[3];
+ Prepare3_16(src, s);
+ return Sum3_16(s);
+}
+
+inline void Sum3Horizontal32(const uint32x4_t src[3], uint32x4_t dst[2]) {
+ uint32x4_t s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum3_32(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum3_32(s);
+}
+
+inline uint16x8_t Sum5Horizontal16(const uint16x8_t src[2]) {
+ uint16x8_t s[5];
+ Prepare5_16(src, s);
+ return Sum5_16(s);
+}
+
+inline void Sum5Horizontal32(const uint32x4_t src[3], uint32x4_t dst[2]) {
+ uint32x4_t s[5];
+ Prepare5_32(src + 0, s);
+ dst[0] = Sum5_32(s);
+ Prepare5_32(src + 1, s);
+ dst[1] = Sum5_32(s);
+}
+
+void SumHorizontal16(const uint16x8_t src[2], uint16x8_t* const row3,
+ uint16x8_t* const row5) {
+ uint16x8_t s[5];
+ Prepare5_16(src, s);
+ const uint16x8_t sum04 = vaddq_u16(s[0], s[4]);
+ *row3 = Sum3_16(s + 1);
+ *row5 = vaddq_u16(sum04, *row3);
+}
+
+inline void SumHorizontal16(const uint16x8_t src[3], uint16x8_t* const row3_0,
+ uint16x8_t* const row3_1, uint16x8_t* const row5_0,
+ uint16x8_t* const row5_1) {
+ SumHorizontal16(src + 0, row3_0, row5_0);
+ SumHorizontal16(src + 1, row3_1, row5_1);
+}
+
+void SumHorizontal32(const uint32x4_t src[5], uint32x4_t* const row_sq3,
+ uint32x4_t* const row_sq5) {
+ const uint32x4_t sum04 = vaddq_u32(src[0], src[4]);
+ *row_sq3 = Sum3_32(src + 1);
+ *row_sq5 = vaddq_u32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const uint32x4_t src[3],
+ uint32x4_t* const row_sq3_0,
+ uint32x4_t* const row_sq3_1,
+ uint32x4_t* const row_sq5_0,
+ uint32x4_t* const row_sq5_1) {
+ uint32x4_t s[5];
+ Prepare5_32(src + 0, s);
+ SumHorizontal32(s, row_sq3_0, row_sq5_0);
+ Prepare5_32(src + 1, s);
+ SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
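+// Sum343 computes the 3:4:3 weighted sum 3 * (a + b + c) + b of three
+// adjacent values, used by the self-guided filter's cross-row accumulation.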
+inline uint16x8_t Sum343Lo(const uint8x16_t ma3[3]) {
+ const uint16x8_t sum = Sum3WLo16(ma3);
+ const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline uint16x8_t Sum343Hi(const uint8x16_t ma3[3]) {
+ const uint16x8_t sum = Sum3WHi16(ma3);
+ const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline uint32x4_t Sum343(const uint32x4_t src[3]) {
+ const uint32x4_t sum = Sum3_32(src);
+ const uint32x4_t sum3 = Sum3_32(sum, sum, sum);
+ return vaddq_u32(sum3, src[1]);
+}
+
+inline void Sum343(const uint32x4_t src[3], uint32x4_t dst[2]) {
+ uint32x4_t s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum343(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum343(s);
+}
+
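+// Sum565 computes the 5:6:5 weighted sum 5 * (a + b + c) + b, the Pass 1
+// counterpart of Sum343 above.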
+inline uint16x8_t Sum565Lo(const uint8x16_t src[3]) {
+ const uint16x8_t sum = Sum3WLo16(src);
+ const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
+ const uint16x8_t sum5 = vaddq_u16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline uint16x8_t Sum565Hi(const uint8x16_t src[3]) {
+ const uint16x8_t sum = Sum3WHi16(src);
+ const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
+ const uint16x8_t sum5 = vaddq_u16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline uint32x4_t Sum565(const uint32x4_t src[3]) {
+ const uint32x4_t sum = Sum3_32(src);
+ const uint32x4_t sum4 = vshlq_n_u32(sum, 2);
+ const uint32x4_t sum5 = vaddq_u32(sum4, sum);
+ return vaddq_u32(sum5, src[1]);
+}
+
+inline void Sum565(const uint32x4_t src[3], uint32x4_t dst[2]) {
+ uint32x4_t s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum565(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum565(s);
+}
+
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src) * width;
+ int y = 2;
+ do {
+ uint16x8_t s[3];
+ uint32x4_t sq[6];
+ s[0] = Load1QMsanU16(src, overread_in_bytes);
+ Square(s[0], sq);
+ ptrdiff_t x = sum_width;
+ do {
+ uint16x8_t row3[2], row5[2];
+ uint32x4_t row_sq3[2], row_sq5[2];
+ s[1] = Load1QMsanU16(
+ src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+ x -= 16;
+ src += 16;
+ s[2] = Load1QMsanU16(src,
+ overread_in_bytes + sizeof(*src) * (sum_width - x));
+ Square(s[1], sq + 2);
+ Square(s[2], sq + 4);
+ SumHorizontal16(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned32U16(sum3, row3);
+ StoreAligned32U16(sum5, row5);
+ SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+ &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 0, row_sq3);
+ StoreAligned32U32(square_sum5 + 0, row_sq5);
+ SumHorizontal32(sq + 2, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+ &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 8, row_sq3);
+ StoreAligned32U32(square_sum5 + 8, row_sq5);
+ s[0] = s[2];
+ sq[0] = sq[4];
+ sq[1] = sq[5];
+ sum3 += 16;
+ sum5 += 16;
+ square_sum3 += 16;
+ square_sum5 += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sum3 += sum_stride - sum_width;
+ sum5 += sum_stride - sum_width;
+ square_sum3 += sum_stride - sum_width;
+ square_sum5 += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ const ptrdiff_t overread_in_bytes =
+ ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) -
+ sizeof(*src) * width;
+ int y = 2;
+ do {
+ uint16x8_t s[3];
+ uint32x4_t sq[6];
+ s[0] = Load1QMsanU16(src, overread_in_bytes);
+ Square(s[0], sq);
+ ptrdiff_t x = sum_width;
+ do {
+ uint16x8_t row[2];
+ uint32x4_t row_sq[4];
+ s[1] = Load1QMsanU16(
+ src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+ x -= 16;
+ src += 16;
+ s[2] = Load1QMsanU16(src,
+ overread_in_bytes + sizeof(*src) * (sum_width - x));
+ Square(s[1], sq + 2);
+ Square(s[2], sq + 4);
+ if (size == 3) {
+ row[0] = Sum3Horizontal16(s + 0);
+ row[1] = Sum3Horizontal16(s + 1);
+ Sum3Horizontal32(sq + 0, row_sq + 0);
+ Sum3Horizontal32(sq + 2, row_sq + 2);
+ } else {
+ row[0] = Sum5Horizontal16(s + 0);
+ row[1] = Sum5Horizontal16(s + 1);
+ Sum5Horizontal32(sq + 0, row_sq + 0);
+ Sum5Horizontal32(sq + 2, row_sq + 2);
+ }
+ StoreAligned32U16(sums, row);
+ StoreAligned64U32(square_sums, row_sq);
+ s[0] = s[2];
+ sq[0] = sq[4];
+ sq[1] = sq[5];
+ sums += 16;
+ square_sums += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sums += sum_stride - sum_width;
+ square_sums += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int n>
+inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
+ const uint32_t scale) {
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
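+ // In scalar terms the result is RightShiftWithRounding(p * scale,
+ // kSgrProjScaleBits), which then indexes kSgrMaLookup in the callers.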
+ const uint32x4_t dxd = vmull_u16(sum, sum);
+ const uint32x4_t axn = vmulq_n_u32(sum_sq, n);
+ // Ensure |p| does not underflow by using saturating subtraction.
+ const uint32x4_t p = vqsubq_u32(axn, dxd);
+ const uint32x4_t pxs = vmulq_n_u32(p, scale);
+ // vrshrn_n_u32() (rounding narrowing shift) can shift by at most 16, but
+ // kSgrProjScaleBits is 20, so shift and narrow in two steps.
+ const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits);
+ return vmovn_u32(shifted);
+}
+
+template <int n>
+inline uint16x8_t CalculateMa(const uint16x8_t sum, const uint32x4_t sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const uint16x8_t b = vrshrq_n_u16(sum, 2);
+ const uint16x4_t sum_lo = vget_low_u16(b);
+ const uint16x4_t sum_hi = vget_high_u16(b);
+ const uint16x4_t z0 =
+ CalculateMa<n>(sum_lo, vrshrq_n_u32(sum_sq[0], 4), scale);
+ const uint16x4_t z1 =
+ CalculateMa<n>(sum_hi, vrshrq_n_u32(sum_sq[1], 4), scale);
+ return vcombine_u16(z0, z1);
+}
+
+inline void CalculateB5(const uint16x8_t sum, const uint16x8_t ma,
+ uint32x4_t b[2]) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
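+ // Multiplying by one_over_n_quarter and shifting by two fewer bits is
+ // exact (164 == 41 << 2) and leaves extra headroom in the 32-bit products.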
+ // |ma| is in range [0, 255].
+ const uint32x4_t m2 = VmullLo16(ma, sum);
+ const uint32x4_t m3 = VmullHi16(ma, sum);
+ const uint32x4_t m0 = vmulq_n_u32(m2, one_over_n_quarter);
+ const uint32x4_t m1 = vmulq_n_u32(m3, one_over_n_quarter);
+ b[0] = vrshrq_n_u32(m0, kSgrProjReciprocalBits - 2);
+ b[1] = vrshrq_n_u32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const uint16x8_t sum, const uint16x8_t ma,
+ uint32x4_t b[2]) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+ const uint32x4_t m0 = VmullLo16(ma, sum);
+ const uint32x4_t m1 = VmullHi16(ma, sum);
+ const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
+ const uint32x4_t m3 = vmulq_n_u32(m1, one_over_n);
+ b[0] = vrshrq_n_u32(m2, kSgrProjReciprocalBits);
+ b[1] = vrshrq_n_u32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex3(const uint16x8_t s3[3],
+ const uint32x4_t sq3[3][2],
+ const uint32_t scale, uint16x8_t* const sum,
+ uint16x8_t* const index) {
+ uint32x4_t sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const uint16x8_t s5[5],
+ const uint32x4_t sq5[5][2],
+ const uint32_t scale, uint16x8_t* const sum,
+ uint16x8_t* const index) {
+ uint32x4_t sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const uint16x8_t sum, const uint16x8_t index,
+ uint8x16_t* const ma, uint32x4_t b[2]) {
+ static_assert(n == 9 || n == 25, "");
+ static_assert(offset == 0 || offset == 8, "");
+
+ const uint8x8_t idx = vqmovn_u16(index);
+ uint8_t temp[8];
+ vst1_u8(temp, idx);
+ // offset == 0 is assumed to be the first call to this function. The value is
+ // duplicated to avoid -Wuninitialized warnings under gcc.
+ if (offset == 0) {
+ *ma = vdupq_n_u8(kSgrMaLookup[temp[0]]);
+ } else {
+ *ma = vsetq_lane_u8(kSgrMaLookup[temp[0]], *ma, offset + 0);
+ }
+ *ma = vsetq_lane_u8(kSgrMaLookup[temp[1]], *ma, offset + 1);
+ *ma = vsetq_lane_u8(kSgrMaLookup[temp[2]], *ma, offset + 2);
+ *ma = vsetq_lane_u8(kSgrMaLookup[temp[3]], *ma, offset + 3);
+ *ma = vsetq_lane_u8(kSgrMaLookup[temp[4]], *ma, offset + 4);
+ *ma = vsetq_lane_u8(kSgrMaLookup[temp[5]], *ma, offset + 5);
+ *ma = vsetq_lane_u8(kSgrMaLookup[temp[6]], *ma, offset + 6);
+ *ma = vsetq_lane_u8(kSgrMaLookup[temp[7]], *ma, offset + 7);
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const uint16x8_t maq =
+ vmovl_u8((offset == 0) ? vget_low_u8(*ma) : vget_high_u8(*ma));
+ if (n == 9) {
+ CalculateB3(sum, maq, b);
+ } else {
+ CalculateB5(sum, maq, b);
+ }
+}
+
+inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index,
+ const int threshold) {
+ const uint8x8_t thresholds = vdup_n_u8(threshold);
+ const uint8x8_t offset = vcgt_u8(index, thresholds);
+ // Adding 255 is equivalent to subtracting 1 for 8-bit data.
+ return vadd_u8(value, offset);
+}
+
+inline uint8x8_t MaLookupAndAdjust(const uint8x8x4_t table0,
+ const uint8x8x2_t table1,
+ const uint16x8_t index) {
+ const uint8x8_t idx = vqmovn_u16(index);
+ // All elements whose indices are out of range [0, 47] are set to 0.
+ uint8x8_t val = vtbl4_u8(table0, idx); // Range [0, 31].
+ // Subtract 32 to shuffle the next index range.
+ const uint8x8_t sub_idx = vsub_u8(idx, vdup_n_u8(32));
+ const uint8x8_t res = vtbl2_u8(table1, sub_idx); // Range [32, 47].
+ // Use OR instruction to combine shuffle results together.
+ val = vorr_u8(val, res);
+
+ // For elements whose indices are larger than 47, the lookup values change
+ // only rarely as the index increases, so compute them with comparisons and
+ // arithmetic instead of more table lookups.
+ // Elements whose indices are larger than 47 (0 after the lookups above) are
+ // set to 5.
+ val = vmax_u8(val, vdup_n_u8(5));
+ val = AdjustValue(val, idx, 55); // 55 is the last index whose value is 5.
+ val = AdjustValue(val, idx, 72); // 72 is the last index whose value is 4.
+ val = AdjustValue(val, idx, 101); // 101 is the last index whose value is 3.
+ val = AdjustValue(val, idx, 169); // 169 is the last index whose value is 2.
+ val = AdjustValue(val, idx, 254); // 254 is the last index whose value is 1.
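+ // val now matches kSgrMaLookup[idx] for all indices: entries above 47 start
+ // at 5 and drop by one past each threshold.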
+ return val;
+}
+
+inline void CalculateIntermediate(const uint16x8_t sum[2],
+ const uint16x8_t index[2],
+ uint8x16_t* const ma, uint32x4_t b0[2],
+ uint32x4_t b1[2]) {
+ // Use table lookup to read elements whose indices are less than 48.
+ // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than
+ // using two uint8x8x3_t vectors.
+ uint8x8x4_t table0;
+ uint8x8x2_t table1;
+ table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8);
+ table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8);
+ table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8);
+ table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8);
+ table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8);
+ table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8);
+ const uint8x8_t ma_lo = MaLookupAndAdjust(table0, table1, index[0]);
+ const uint8x8_t ma_hi = MaLookupAndAdjust(table0, table1, index[1]);
+ *ma = vcombine_u8(ma_lo, ma_hi);
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const uint16x8_t maq0 = vmovl_u8(vget_low_u8(*ma));
+ CalculateB3(sum[0], maq0, b0);
+ const uint16x8_t maq1 = vmovl_u8(vget_high_u8(*ma));
+ CalculateB3(sum[1], maq1, b1);
+}
+
+inline void CalculateIntermediate(const uint16x8_t sum[2],
+ const uint16x8_t index[2], uint8x16_t ma[2],
+ uint32x4_t b[4]) {
+ uint8x16_t mas;
+ CalculateIntermediate(sum, index, &mas, b + 0, b + 2);
+ ma[0] = vcombine_u8(vget_low_u8(ma[0]), vget_low_u8(mas));
+ ma[1] = vextq_u8(mas, vdupq_n_u8(0), 8);
+}
+
+template <int offset>
+inline void CalculateIntermediate5(const uint16x8_t s5[5],
+ const uint32x4_t sq5[5][2],
+ const uint32_t scale, uint8x16_t* const ma,
+ uint32x4_t b[2]) {
+ static_assert(offset == 0 || offset == 8, "");
+ uint16x8_t sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const uint16x8_t s3[3],
+ const uint32x4_t sq3[3][2],
+ const uint32_t scale, uint8x16_t* const ma,
+ uint32x4_t b[2]) {
+ uint16x8_t sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
+inline void Store343_444(const uint32x4_t b3[3], const ptrdiff_t x,
+ uint32x4_t sum_b343[2], uint32x4_t sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ uint32x4_t b[3], sum_b111[2];
+ Prepare3_32(b3 + 0, b);
+ sum_b111[0] = Sum3_32(b);
+ sum_b444[0] = vshlq_n_u32(sum_b111[0], 2);
+ sum_b343[0] = vsubq_u32(sum_b444[0], sum_b111[0]);
+ sum_b343[0] = vaddq_u32(sum_b343[0], b[1]);
+ Prepare3_32(b3 + 1, b);
+ sum_b111[1] = Sum3_32(b);
+ sum_b444[1] = vshlq_n_u32(sum_b111[1], 2);
+ sum_b343[1] = vsubq_u32(sum_b444[1], sum_b111[1]);
+ sum_b343[1] = vaddq_u32(sum_b343[1], b[1]);
+ StoreAligned32U32(b444 + x, sum_b444);
+ StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const uint8x16_t ma3[3], const uint32x4_t b3[3],
+ const ptrdiff_t x, uint16x8_t* const sum_ma343,
+ uint16x8_t* const sum_ma444, uint32x4_t sum_b343[2],
+ uint32x4_t sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const uint16x8_t sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = vshlq_n_u16(sum_ma111, 2);
+ vst1q_u16(ma444 + x, *sum_ma444);
+ const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ vst1q_u16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const uint8x16_t ma3[3], const uint32x4_t b3[2],
+ const ptrdiff_t x, uint16x8_t* const sum_ma343,
+ uint16x8_t* const sum_ma444, uint32x4_t sum_b343[2],
+ uint32x4_t sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const uint16x8_t sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = vshlq_n_u16(sum_ma111, 2);
+ vst1q_u16(ma444 + x, *sum_ma444);
+ const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ vst1q_u16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const uint8x16_t ma3[3], const uint32x4_t b3[2],
+ const ptrdiff_t x, uint16x8_t* const sum_ma343,
+ uint32x4_t sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ uint16x8_t sum_ma444;
+ uint32x4_t sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const uint8x16_t ma3[3], const uint32x4_t b3[2],
+ const ptrdiff_t x, uint16x8_t* const sum_ma343,
+ uint32x4_t sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ uint16x8_t sum_ma444;
+ uint32x4_t sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const uint8x16_t ma3[3], const uint32x4_t b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ uint16x8_t sum_ma343;
+ uint32x4_t sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const uint8x16_t ma3[3], const uint32x4_t b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ uint16x8_t sum_ma343;
+ uint32x4_t sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const uint16x8_t s[2][4], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], uint32x4_t sq[2][8], uint8x16_t* const ma,
+ uint32x4_t b[2]) {
+ uint16x8_t s5[2][5];
+ uint32x4_t sq5[5][2];
+ Square(s[0][1], sq[0] + 2);
+ Square(s[1][1], sq[1] + 2);
+ s5[0][3] = Sum5Horizontal16(s[0]);
+ vst1q_u16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal16(s[1]);
+ vst1q_u16(sum5[4], s5[0][4]);
+ Sum5Horizontal32(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5Horizontal32(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint16x8_t s[2][4], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], uint32x4_t sq[2][8], uint8x16_t ma[2],
+ uint32x4_t b[6]) {
+ uint16x8_t s5[2][5];
+ uint32x4_t sq5[5][2];
+ Square(s[0][2], sq[0] + 4);
+ Square(s[1][2], sq[1] + 4);
+ s5[0][3] = Sum5Horizontal16(s[0] + 1);
+ s5[1][3] = Sum5Horizontal16(s[0] + 2);
+ vst1q_u16(sum5[3] + x + 0, s5[0][3]);
+ vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+ s5[0][4] = Sum5Horizontal16(s[1] + 1);
+ s5[1][4] = Sum5Horizontal16(s[1] + 2);
+ vst1q_u16(sum5[4] + x + 0, s5[0][4]);
+ vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+ Sum5Horizontal32(sq[0] + 2, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ Sum5Horizontal32(sq[1] + 2, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+ Square(s[0][3], sq[0] + 6);
+ Square(s[1][3], sq[1] + 6);
+ Sum5Horizontal32(sq[0] + 4, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ Sum5Horizontal32(sq[1] + 4, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const uint16x8_t s[2], const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], uint32x4_t sq[4],
+ uint8x16_t* const ma, uint32x4_t b[2]) {
+ uint16x8_t s5[5];
+ uint32x4_t sq5[5][2];
+ Square(s[1], sq + 2);
+ s5[3] = s5[4] = Sum5Horizontal16(s);
+ Sum5Horizontal32(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const uint16x8_t s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], uint32x4_t sq[8], uint8x16_t ma[2],
+ uint32x4_t b[6]) {
+ uint16x8_t s5[2][5];
+ uint32x4_t sq5[5][2];
+ Square(s[2], sq + 4);
+ s5[0][3] = Sum5Horizontal16(s + 1);
+ s5[1][3] = Sum5Horizontal16(s + 2);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5Horizontal32(sq + 2, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+ Square(s[3], sq + 6);
+ Sum5Horizontal32(sq + 4, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const uint16x8_t s[2], const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], uint32x4_t sq[4], uint8x16_t* const ma,
+ uint32x4_t b[2]) {
+ uint16x8_t s3[3];
+ uint32x4_t sq3[3][2];
+ Square(s[1], sq + 2);
+ s3[2] = Sum3Horizontal16(s);
+ vst1q_u16(sum3[2], s3[2]);
+ Sum3Horizontal32(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint16x8_t s[4], const ptrdiff_t x, const ptrdiff_t sum_width,
+ const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], uint32x4_t sq[8], uint8x16_t ma[2],
+ uint32x4_t b[6]) {
+ uint16x8_t s3[4], sum[2], index[2];
+ uint32x4_t sq3[3][2];
+
+ Square(s[2], sq + 4);
+ s3[2] = Sum3Horizontal16(s + 1);
+ s3[3] = Sum3Horizontal16(s + 2);
+ StoreAligned32U16(sum3[2] + x, s3 + 2);
+ Sum3Horizontal32(sq + 2, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+ LoadAligned16x2U16(sum3, x, s3);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ Square(s[3], sq + 6);
+ Sum3Horizontal32(sq + 4, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma, b + 2);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const uint16x8_t s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], uint32x4_t sq[2][8], uint8x16_t ma3[2][2],
+ uint32x4_t b3[2][6], uint8x16_t* const ma5, uint32x4_t b5[2]) {
+ uint16x8_t s3[4], s5[5], sum[2], index[2];
+ uint32x4_t sq3[4][2], sq5[5][2];
+
+ Square(s[0][1], sq[0] + 2);
+ Square(s[1][1], sq[1] + 2);
+ SumHorizontal16(s[0], &s3[2], &s5[3]);
+ SumHorizontal16(s[1], &s3[3], &s5[4]);
+ vst1q_u16(sum3[2], s3[2]);
+ vst1q_u16(sum3[3], s3[3]);
+ vst1q_u16(sum5[3], s5[3]);
+ vst1q_u16(sum5[4], s5[4]);
+ SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+ CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
+ ma3[1][0] = vextq_u8(ma3[0][0], vdupq_n_u8(0), 8);
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const uint16x8_t s[2][4], const ptrdiff_t x, const uint16_t scales[2],
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint32x4_t sq[2][8], uint8x16_t ma3[2][2],
+ uint32x4_t b3[2][6], uint8x16_t ma5[2], uint32x4_t b5[6]) {
+ uint16x8_t s3[2][4], s5[2][5], sum[2][2], index[2][2];
+ uint32x4_t sq3[4][2], sq5[5][2];
+
+ SumHorizontal16(s[0] + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ vst1q_u16(sum3[2] + x + 0, s3[0][2]);
+ vst1q_u16(sum3[2] + x + 8, s3[1][2]);
+ vst1q_u16(sum5[3] + x + 0, s5[0][3]);
+ vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+ SumHorizontal16(s[1] + 1, &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+ vst1q_u16(sum3[3] + x + 0, s3[0][3]);
+ vst1q_u16(sum3[3] + x + 8, s3[1][3]);
+ vst1q_u16(sum5[4] + x + 0, s5[0][4]);
+ vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+ Square(s[0][2], sq[0] + 4);
+ Square(s[1][2], sq[1] + 4);
+ SumHorizontal32(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ SumHorizontal32(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+ &index[1][0]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], b5 + 2);
+
+ Square(s[0][3], sq[0] + 6);
+ Square(s[1][3], sq[1] + 6);
+ SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+ &index[1][1]);
+ CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 2);
+ CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 2);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], b5 + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const uint16x8_t s[2], const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ uint32x4_t sq[4], uint8x16_t* const ma3, uint8x16_t* const ma5,
+ uint32x4_t b3[2], uint32x4_t b5[2]) {
+ uint16x8_t s3[3], s5[5];
+ uint32x4_t sq3[3][2], sq5[5][2];
+
+ Square(s[1], sq + 2);
+ SumHorizontal16(s, &s3[2], &s5[3]);
+ SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const uint16x8_t s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], uint32x4_t sq[8], uint8x16_t ma3[2],
+ uint8x16_t ma5[2], uint32x4_t b3[6], uint32x4_t b5[6]) {
+ uint16x8_t s3[2][3], s5[2][5], sum[2], index[2];
+ uint32x4_t sq3[3][2], sq5[5][2];
+
+ Square(s[2], sq + 4);
+ SumHorizontal16(s + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal32(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 2);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+ Square(s[3], sq + 6);
+ SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 4);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma3, b3 + 2);
+}
+
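+// The BoxSumFilterPreProcess* functions prime the first |ma*| and |b*| rows
+// from the border and the first source rows before the main filter loops run.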
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+ const uint16_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ uint16x8_t s[2][4];
+ uint8x16_t mas[2];
+ uint32x4_t sq[2][8], bs[6];
+
+ s[0][0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = Load1QMsanU16(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = Load1QMsanU16(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ uint8x16_t ma5[3];
+ uint16x8_t ma[2];
+ uint32x4_t b[4];
+
+ s[0][2] = Load1QMsanU16(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = Load1QMsanU16(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = Load1QMsanU16(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = Load1QMsanU16(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned32U16(ma565, ma);
+ Sum565(bs + 0, b + 0);
+ Sum565(bs + 2, b + 2);
+ StoreAligned64U32(b565, b);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint16_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass2 - sizeof(*src) * width;
+ uint16x8_t s[4];
+ uint8x16_t mas[2];
+ uint32x4_t sq[8], bs[6];
+
+ s[0] = Load1QMsanU16(src + 0, overread_in_bytes + 0);
+ s[1] = Load1QMsanU16(src + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ s[2] = Load1QMsanU16(src + x + 16,
+ overread_in_bytes + sizeof(*src) * (x + 16));
+ s[3] = Load1QMsanU16(src + x + 24,
+ overread_in_bytes + sizeof(*src) * (x + 24));
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ uint8x16_t ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 2, 8, ma343, ma444, b343, b444);
+ ma444 += 16;
+ b444 += 16;
+ } else {
+ uint16x8_t ma[2];
+ uint32x4_t b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned32U16(ma343, ma);
+ Sum343(bs + 0, b + 0);
+ Sum343(bs + 2, b + 2);
+ StoreAligned64U32(b343, b);
+ }
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ ma343 += 16;
+ b343 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint16_t* const src0, const uint16_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+ uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+ uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ uint16x8_t s[2][4];
+ uint8x16_t ma3[2][2], ma5[2];
+ uint32x4_t sq[2][8], b3[2][6], b5[6];
+
+ s[0][0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = Load1QMsanU16(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = Load1QMsanU16(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], b5);
+
+ int x = 0;
+ do {
+ uint16x8_t ma[2];
+ uint32x4_t b[4];
+ uint8x16_t ma3x[3], ma5x[3];
+
+ s[0][2] = Load1QMsanU16(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = Load1QMsanU16(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = Load1QMsanU16(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = Load1QMsanU16(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+
+ Prepare3_8<0>(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned32U16(ma343[0] + x, ma);
+ Sum343(b3[0] + 0, b + 0);
+ Sum343(b3[0] + 2, b + 2);
+ StoreAligned64U32(b343[0] + x, b);
+ Sum565(b5 + 0, b + 0);
+ Sum565(b5 + 2, b + 2);
+ StoreAligned64U32(b565, b);
+ Prepare3_8<0>(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444Hi(ma3x, b3[1] + 2, x + 8, ma343[1], ma444, b343[1], b444);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned32U16(ma565, ma);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][4];
+ b3[0][1] = b3[0][5];
+ b3[1][0] = b3[1][4];
+ b3[1][1] = b3[1][5];
+ b5[0] = b5[4];
+ b5[1] = b5[5];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
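+// Computes (b - ma * src), then applies a rounding right shift by
+// kSgrProjSgrBits + shift - kSgrProjRestoreBits (8 or 9) and narrows to 16
+// bits; the results fit in 13 bits.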
+template <int shift>
+inline int16x4_t FilterOutput(const uint32x4_t ma_x_src, const uint32x4_t b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const int32x4_t v = vreinterpretq_s32_u32(vsubq_u32(b, ma_x_src));
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return vqrshrn_n_s32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline int16x8_t CalculateFilteredOutput(const uint16x8_t src,
+ const uint16x8_t ma,
+ const uint32x4_t b[2]) {
+ const uint32x4_t ma_x_src_lo = VmullLo16(ma, src);
+ const uint32x4_t ma_x_src_hi = VmullHi16(ma, src);
+ const int16x4_t dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const int16x4_t dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return vcombine_s16(dst_lo, dst_hi); // 13 bits
+}
+
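+// Pass 1 filters two rows per iteration: the even row combines the previous
+// and the current 565 window (shift 5 rather than 4 halves the doubled sum),
+// while the odd row uses only the current window via
+// CalculateFilteredOutput<4>.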
+inline int16x8_t CalculateFilteredOutputPass1(const uint16x8_t src,
+ const uint16x8_t ma[2],
+ const uint32x4_t b[2][2]) {
+ const uint16x8_t ma_sum = vaddq_u16(ma[0], ma[1]);
+ uint32x4_t b_sum[2];
+ b_sum[0] = vaddq_u32(b[0][0], b[1][0]);
+ b_sum[1] = vaddq_u32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline int16x8_t CalculateFilteredOutputPass2(const uint16x8_t src,
+ const uint16x8_t ma[3],
+ const uint32x4_t b[3][2]) {
+ const uint16x8_t ma_sum = Sum3_16(ma);
+ uint32x4_t b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
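+// Final projection: dst = src + RightShiftWithRounding(v, kSgrProjRestoreBits
+// + kSgrProjPrecisionBits), where |v| holds the w-weighted filter outputs.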
+inline int16x8_t SelfGuidedFinal(const uint16x8_t src, const int32x4_t v[2]) {
+ const int16x4_t v_lo =
+ vqrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const int16x4_t v_hi =
+ vqrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const int16x8_t vv = vcombine_s16(v_lo, v_hi);
+ return vaddq_s16(vreinterpretq_s16_u16(src), vv);
+}
+
+inline int16x8_t SelfGuidedDoubleMultiplier(const uint16x8_t src,
+ const int16x8_t filter[2],
+ const int w0, const int w2) {
+ int32x4_t v[2];
+ v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
+ v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
+ v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2);
+ v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2);
+ return SelfGuidedFinal(src, v);
+}
+
+inline int16x8_t SelfGuidedSingleMultiplier(const uint16x8_t src,
+ const int16x8_t filter,
+ const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ int32x4_t v[2];
+ v[0] = vmull_n_s16(vget_low_s16(filter), w0);
+ v[1] = vmull_n_s16(vget_high_s16(filter), w0);
+ return SelfGuidedFinal(src, v);
+}
+
+inline void ClipAndStore(uint16_t* const dst, const int16x8_t val) {
+ const uint16x8_t val0 = vreinterpretq_u16_s16(vmaxq_s16(val, vdupq_n_s16(0)));
+ const uint16x8_t val1 = vminq_u16(val0, vdupq_n_u16((1 << kBitdepth10) - 1));
+ vst1q_u16(dst, val1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint16_t* const src, const uint16_t* const src0,
+ const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ uint16x8_t s[2][4];
+ uint8x16_t mas[2];
+ uint32x4_t sq[2][8], bs[6];
+
+ s[0][0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = Load1QMsanU16(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = Load1QMsanU16(src1 + 8, overread_in_bytes + 16);
+
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ uint16x8_t ma[2];
+ uint32x4_t b[2][2];
+ uint8x16_t ma5[3];
+ int16x8_t p[2];
+
+ s[0][2] = Load1QMsanU16(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = Load1QMsanU16(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = Load1QMsanU16(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = Load1QMsanU16(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ vst1q_u16(ma565[1] + x, ma[1]);
+ Sum565(bs, b[1]);
+ StoreAligned32U32(b565[1] + x, b[1]);
+ const uint16x8_t sr0_lo = vld1q_u16(src + x + 0);
+ const uint16x8_t sr1_lo = vld1q_u16(src + stride + x + 0);
+ ma[0] = vld1q_u16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+ const int16x8_t d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+ const int16x8_t d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+ ma[1] = Sum565Hi(ma5);
+ vst1q_u16(ma565[1] + x + 8, ma[1]);
+ Sum565(bs + 2, b[1]);
+ StoreAligned32U32(b565[1] + x + 8, b[1]);
+ const uint16x8_t sr0_hi = vld1q_u16(src + x + 8);
+ const uint16x8_t sr1_hi = vld1q_u16(src + stride + x + 8);
+ ma[0] = vld1q_u16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+ const int16x8_t d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+ ClipAndStore(dst + x + 0, d00);
+ ClipAndStore(dst + x + 8, d01);
+ const int16x8_t d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+ ClipAndStore(dst + stride + x + 0, d10);
+ ClipAndStore(dst + stride + x + 8, d11);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ uint16x8_t s[4];
+ uint8x16_t mas[2];
+ uint32x4_t sq[8], bs[6];
+
+ s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+ s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ uint16x8_t ma[2];
+ uint32x4_t b[2][2];
+ uint8x16_t ma5[3];
+
+ s[2] = Load1QMsanU16(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[3] = Load1QMsanU16(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+ sq, mas, bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ Sum565(bs, b[1]);
+ ma[0] = vld1q_u16(ma565);
+ LoadAligned32U32(b565, b[0]);
+ const uint16x8_t sr_lo = vld1q_u16(src + x + 0);
+ int16x8_t p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ const int16x8_t d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
+
+ ma[1] = Sum565Hi(ma5);
+ Sum565(bs + 2, b[1]);
+ ma[0] = vld1q_u16(ma565 + 8);
+ LoadAligned32U32(b565 + 8, b[0]);
+ const uint16x8_t sr_hi = vld1q_u16(src + x + 8);
+ p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ const int16x8_t d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 8, d1);
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass2 - sizeof(*src0) * width;
+ uint16x8_t s[4];
+ uint8x16_t mas[2];
+ uint32x4_t sq[8], bs[6];
+
+ s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+ s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ s[2] = Load1QMsanU16(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[3] = Load1QMsanU16(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ uint16x8_t ma[3];
+ uint32x4_t b[3][2];
+ uint8x16_t ma3[3];
+
+ Prepare3_8<0>(mas, ma3);
+ Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const uint16x8_t sr_lo = vld1q_u16(src + x + 0);
+ ma[0] = vld1q_u16(ma343[0] + x);
+ ma[1] = vld1q_u16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[0]);
+ LoadAligned32U32(b444[0] + x, b[1]);
+ const int16x8_t p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+ Store343_444Hi(ma3, bs + 2, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const uint16x8_t sr_hi = vld1q_u16(src + x + 8);
+ ma[0] = vld1q_u16(ma343[0] + x + 8);
+ ma[1] = vld1q_u16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1]);
+ const int16x8_t p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const int16x8_t d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const int16x8_t d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 8, d1);
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint16_t* const src, const uint16_t* const src0,
+ const uint16_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ uint16x8_t s[2][4];
+ uint8x16_t ma3[2][2], ma5[2];
+ uint32x4_t sq[2][8], b3[2][6], b5[6];
+
+ s[0][0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = Load1QMsanU16(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = Load1QMsanU16(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], b5);
+
+ int x = 0;
+ do {
+ uint16x8_t ma[3][3];
+ uint32x4_t b[3][3][2];
+ uint8x16_t ma3x[2][3], ma5x[3];
+ int16x8_t p[2][2];
+
+ s[0][2] = Load1QMsanU16(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = Load1QMsanU16(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = Load1QMsanU16(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = Load1QMsanU16(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+ Prepare3_8<0>(ma3[0], ma3x[0]);
+ Prepare3_8<0>(ma3[1], ma3x[1]);
+ Prepare3_8<0>(ma5, ma5x);
+ Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565Lo(ma5x);
+ vst1q_u16(ma565[1] + x, ma[0][1]);
+ Sum565(b5, b[0][1]);
+ StoreAligned32U32(b565[1] + x, b[0][1]);
+ const uint16x8_t sr0_lo = vld1q_u16(src + x);
+ const uint16x8_t sr1_lo = vld1q_u16(src + stride + x);
+ ma[0][0] = vld1q_u16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = vld1q_u16(ma343[0] + x);
+ ma[1][1] = vld1q_u16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[1][0]);
+ LoadAligned32U32(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ const int16x8_t d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ma[2][0] = vld1q_u16(ma343[1] + x);
+ LoadAligned32U32(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const int16x8_t d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+ Store343_444Hi(ma3x[0], b3[0] + 2, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+ b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Hi(ma3x[1], b3[1] + 2, x + 8, &ma[2][2], b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ ma[0][1] = Sum565Hi(ma5x);
+ vst1q_u16(ma565[1] + x + 8, ma[0][1]);
+ Sum565(b5 + 2, b[0][1]);
+ StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+ const uint16x8_t sr0_hi = Load1QMsanU16(
+ src + x + 8, overread_in_bytes + 4 + sizeof(*src) * (x + 8));
+ const uint16x8_t sr1_hi = Load1QMsanU16(
+ src + stride + x + 8, overread_in_bytes + 4 + sizeof(*src) * (x + 8));
+ ma[0][0] = vld1q_u16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+ ma[1][0] = vld1q_u16(ma343[0] + x + 8);
+ ma[1][1] = vld1q_u16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+ const int16x8_t d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ ClipAndStore(dst + x + 0, d00);
+ ClipAndStore(dst + x + 8, d01);
+ ma[2][0] = vld1q_u16(ma343[1] + x + 8);
+ LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+ const int16x8_t d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ ClipAndStore(dst + stride + x + 0, d10);
+ ClipAndStore(dst + stride + x + 8, d11);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][4];
+ b3[0][1] = b3[0][5];
+ b3[1][0] = b3[1][4];
+ b3[1][1] = b3[1][5];
+ b5[0] = b5[4];
+ b5[1] = b5[5];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ uint16x8_t s[4];
+ uint8x16_t ma3[2], ma5[2];
+ uint32x4_t sq[8], b3[6], b5[6];
+ uint16x8_t ma[3];
+ uint32x4_t b[3][2];
+
+ s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+ s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+ sq, &ma3[0], &ma5[0], b3, b5);
+
+ int x = 0;
+ do {
+ uint8x16_t ma3x[3], ma5x[3];
+ int16x8_t p[2];
+
+ s[2] = Load1QMsanU16(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[3] = Load1QMsanU16(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+ square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+ Prepare3_8<0>(ma3, ma3x);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[1] = Sum565Lo(ma5x);
+ Sum565(b5, b[1]);
+ ma[2] = Sum343Lo(ma3x);
+ Sum343(b3, b[2]);
+ const uint16x8_t sr_lo = vld1q_u16(src + x + 0);
+ ma[0] = vld1q_u16(ma565 + x);
+ LoadAligned32U32(b565 + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = vld1q_u16(ma343 + x);
+ ma[1] = vld1q_u16(ma444 + x);
+ LoadAligned32U32(b343 + x, b[0]);
+ LoadAligned32U32(b444 + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const int16x8_t d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ ma[1] = Sum565Hi(ma5x);
+ Sum565(b5 + 2, b[1]);
+ ma[2] = Sum343Hi(ma3x);
+ Sum343(b3 + 2, b[2]);
+ const uint16x8_t sr_hi = Load1QMsanU16(
+ src + x + 8, overread_in_bytes + 4 + sizeof(*src) * (x + 8));
+ ma[0] = vld1q_u16(ma565 + x + 8);
+ LoadAligned32U32(b565 + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ ma[0] = vld1q_u16(ma343 + x + 8);
+ ma[1] = vld1q_u16(ma444 + x + 8);
+ LoadAligned32U32(b343 + x + 8, b[0]);
+ LoadAligned32U32(b444 + x + 8, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const int16x8_t d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 8, d1);
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ ma3[0] = ma3[1];
+ ma5[0] = ma5[1];
+ b3[0] = b3[4];
+ b3[1] = b3[5];
+ b5[0] = b5[4];
+ b5[1] = b5[5];
+ x += 16;
+ } while (x < width);
+}
+
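+// Applies both self-guided passes, producing two output rows per iteration
+// and rotating the |sum*|, |ma*| and |b*| row pointers between iterations.
+// The bottom border rows and a final odd row, if any, are handled after the
+// main loop.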
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint16_t* src,
+ const ptrdiff_t stride, const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444[0], ma565[0],
+ b343, b444[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint16_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+ square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+ b444[0], b565[0], dst);
+ }
+}
+
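+// Pass 1 only: the 5x5 (radius 2) box filter.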
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint16_t* src, const ptrdiff_t stride,
+ const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum5[1], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint16_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+ sum_width, scale, w0, sum5, square_sum5, ma565[0],
+ b565[0], dst);
+ }
+}
+
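+// Pass 2 only: the 3x3 (radius 1) box filter.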
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint16_t* src, const ptrdiff_t stride,
+ const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint16_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 8, up to 7 extra pixels are written to
+// |dest| at the end of each row. It is safe to overwrite this output, as it
+// will not be part of the visible frame.
+void SelfGuidedFilter_NEON(
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* top = static_cast<const uint16_t*>(top_border);
+ const auto* bottom = static_cast<const uint16_t*>(bottom_border);
+ auto* const dst = static_cast<uint16_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->loop_restorations[0] = WienerFilter_NEON;
+ dsp->loop_restorations[1] = SelfGuidedFilter_NEON;
+}
+
+} // namespace
+
+void LoopRestorationInit10bpp_NEON() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
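+// Shifts the concatenation of the |src| pair right by |bytes| bytes. The
+// uint16x8_t overloads also take |bytes| in bytes and convert it to 16-bit
+// lane counts internally (bytes / 2).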
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8x2_t src) {
+ return vext_u8(src.val[0], src.val[1], bytes);
+}
+
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8_t src[2]) {
+ return vext_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint8x16_t VshrU128(const uint8x16_t src[2]) {
+ return vextq_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8x2_t src) {
+ return vextq_u16(src.val[0], src.val[1], bytes / 2);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8_t src[2]) {
+ return vextq_u16(src[0], src[1], bytes / 2);
+}
+
+// Wiener
+
+// We must make a local copy of the coefficients so the compiler knows that
+// they do not overlap other buffers. Using the 'const' keyword alone is not
+// enough. In practice the compiler does not actually make a copy, since there
+// are enough registers in this case.
+inline void PopulateWienerCoefficients(
+ const RestorationUnitInfo& restoration_info, const int direction,
+ int16_t filter[4]) {
+ // In order to keep the horizontal pass's intermediate values within 16 bits,
+ // we offset |filter[3]| by 128. The offset is added back in the loop.
+ for (int i = 0; i < 4; ++i) {
+ filter[i] = restoration_info.wiener_info.filter[direction][i];
+ }
+ if (direction == WienerInfo::kHorizontal) {
+ filter[3] -= 128;
+ }
+}
+
+inline int16x8_t WienerHorizontal2(const uint8x8_t s0, const uint8x8_t s1,
+ const int16_t filter, const int16x8_t sum) {
+ const int16x8_t ss = vreinterpretq_s16_u16(vaddl_u8(s0, s1));
+ return vmlaq_n_s16(sum, ss, filter);
+}
+
+inline int16x8x2_t WienerHorizontal2(const uint8x16_t s0, const uint8x16_t s1,
+ const int16_t filter,
+ const int16x8x2_t sum) {
+ int16x8x2_t d;
+ d.val[0] =
+ WienerHorizontal2(vget_low_u8(s0), vget_low_u8(s1), filter, sum.val[0]);
+ d.val[1] =
+ WienerHorizontal2(vget_high_u8(s0), vget_high_u8(s1), filter, sum.val[1]);
+ return d;
+}
+
+inline void WienerHorizontalSum(const uint8x8_t s[3], const int16_t filter[4],
+ int16x8_t sum, int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
+ const int16x8_t s_0_2 = vreinterpretq_s16_u16(vaddl_u8(s[0], s[2]));
+ const int16x8_t s_1 = ZeroExtend(s[1]);
+ sum = vmlaq_n_s16(sum, s_0_2, filter[2]);
+ sum = vmlaq_n_s16(sum, s_1, filter[3]);
+ // Calculate the scaled-down offset correction and add it to |sum| here to
+ // keep the signed 16-bit intermediates from overflowing. The
+ // (s_1 << (7 - kInterRoundBitsHorizontal)) term also restores the 128 * src
+ // that was removed from |filter[3]|.
+ sum = vrsraq_n_s16(vshlq_n_s16(s_1, 7 - kInterRoundBitsHorizontal), sum,
+ kInterRoundBitsHorizontal);
+ sum = vmaxq_s16(sum, vdupq_n_s16(-offset));
+ sum = vminq_s16(sum, vdupq_n_s16(limit - offset));
+ vst1q_s16(wiener_buffer, sum);
+}
+
+inline void WienerHorizontalSum(const uint8x16_t src[3],
+ const int16_t filter[4], int16x8x2_t sum,
+ int16_t* const wiener_buffer) {
+ uint8x8_t s[3];
+ s[0] = vget_low_u8(src[0]);
+ s[1] = vget_low_u8(src[1]);
+ s[2] = vget_low_u8(src[2]);
+ WienerHorizontalSum(s, filter, sum.val[0], wiener_buffer);
+ s[0] = vget_high_u8(src[0]);
+ s[1] = vget_high_u8(src[1]);
+ s[2] = vget_high_u8(src[2]);
+ WienerHorizontalSum(s, filter, sum.val[1], wiener_buffer + 8);
+}
+
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4],
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ const uint8_t* src_ptr = src;
+ uint8x16_t s[8];
+ s[0] = vld1q_u8(src_ptr);
+ ptrdiff_t x = width;
+ do {
+ src_ptr += 16;
+ s[7] = vld1q_u8(src_ptr);
+ s[1] = vextq_u8(s[0], s[7], 1);
+ s[2] = vextq_u8(s[0], s[7], 2);
+ s[3] = vextq_u8(s[0], s[7], 3);
+ s[4] = vextq_u8(s[0], s[7], 4);
+ s[5] = vextq_u8(s[0], s[7], 5);
+ s[6] = vextq_u8(s[0], s[7], 6);
+ int16x8x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s16(0);
+ sum = WienerHorizontal2(s[0], s[6], filter[0], sum);
+ sum = WienerHorizontal2(s[1], s[5], filter[1], sum);
+ WienerHorizontalSum(s + 2, filter, sum, *wiener_buffer);
+ s[0] = s[7];
+ *wiener_buffer += 16;
+ x -= 16;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4],
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ const uint8_t* src_ptr = src;
+ uint8x16_t s[6];
+ s[0] = vld1q_u8(src_ptr);
+ ptrdiff_t x = width;
+ do {
+ src_ptr += 16;
+ s[5] = vld1q_u8(src_ptr);
+ s[1] = vextq_u8(s[0], s[5], 1);
+ s[2] = vextq_u8(s[0], s[5], 2);
+ s[3] = vextq_u8(s[0], s[5], 3);
+ s[4] = vextq_u8(s[0], s[5], 4);
+ int16x8x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s16(0);
+ sum = WienerHorizontal2(s[0], s[4], filter[1], sum);
+ WienerHorizontalSum(s + 1, filter, sum, *wiener_buffer);
+ s[0] = s[5];
+ *wiener_buffer += 16;
+ x -= 16;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4],
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ const uint8_t* src_ptr = src;
+ uint8x16_t s[3];
+ ptrdiff_t x = width;
+ do {
+ // Slightly faster than using vextq_u8().
+ s[0] = vld1q_u8(src_ptr);
+ s[1] = vld1q_u8(src_ptr + 1);
+ s[2] = vld1q_u8(src_ptr + 2);
+ int16x8x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s16(0);
+ WienerHorizontalSum(s, filter, sum, *wiener_buffer);
+ src_ptr += 16;
+ *wiener_buffer += 16;
+ x -= 16;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
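+// With only the center tap, |filter[3]| is 128 before the offset is removed,
+// so the horizontal sum collapses to the restored offset term:
+// (src * 128) >> kInterRoundBitsHorizontal == src << 4.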
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ const uint8_t* src_ptr = src;
+ ptrdiff_t x = width;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr);
+ const uint8x8_t s0 = vget_low_u8(s);
+ const uint8x8_t s1 = vget_high_u8(s);
+ const int16x8_t d0 = vreinterpretq_s16_u16(vshll_n_u8(s0, 4));
+ const int16x8_t d1 = vreinterpretq_s16_u16(vshll_n_u8(s1, 4));
+ vst1q_s16(*wiener_buffer + 0, d0);
+ vst1q_s16(*wiener_buffer + 8, d1);
+ src_ptr += 16;
+ *wiener_buffer += 16;
+ x -= 16;
+ } while (x != 0);
+ src += src_stride;
+ }
+}
+
+inline int32x4x2_t WienerVertical2(const int16x8_t a0, const int16x8_t a1,
+ const int16_t filter,
+ const int32x4x2_t sum) {
+ const int16x8_t a = vaddq_s16(a0, a1);
+ int32x4x2_t d;
+ d.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(a), filter);
+ d.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(a), filter);
+ return d;
+}
+
+inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[4],
+ const int32x4x2_t sum) {
+ int32x4x2_t d = WienerVertical2(a[0], a[2], filter[2], sum);
+ d.val[0] = vmlal_n_s16(d.val[0], vget_low_s16(a[1]), filter[3]);
+ d.val[1] = vmlal_n_s16(d.val[1], vget_high_s16(a[1]), filter[3]);
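+ // Rounding shift right by 11, then narrow with unsigned saturation to
+ // produce the final 8-bit pixels.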
+ const uint16x4_t sum_lo_16 = vqrshrun_n_s32(d.val[0], 11);
+ const uint16x4_t sum_hi_16 = vqrshrun_n_s32(d.val[1], 11);
+ return vqmovn_u16(vcombine_u16(sum_lo_16, sum_hi_16));
+}
+
+inline uint8x8_t WienerVerticalTap7Kernel(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4],
+ int16x8_t a[7]) {
+ int32x4x2_t sum;
+ a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+ a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+ a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride);
+ a[6] = vld1q_s16(wiener_buffer + 6 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[0], a[6], filter[0], sum);
+ sum = WienerVertical2(a[1], a[5], filter[1], sum);
+ a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+ a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+ a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride);
+ return WienerVertical(a + 2, filter, sum);
+}
+
+inline uint8x8x2_t WienerVerticalTap7Kernel2(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4]) {
+ int16x8_t a[8];
+ int32x4x2_t sum;
+ uint8x8x2_t d;
+ d.val[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = vld1q_s16(wiener_buffer + 7 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[1], a[7], filter[0], sum);
+ sum = WienerVertical2(a[2], a[6], filter[1], sum);
+ d.val[1] = WienerVertical(a + 3, filter, sum);
+ return d;
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y != 0; --y) {
+ uint8_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ uint8x8x2_t d[2];
+ d[0] = WienerVerticalTap7Kernel2(wiener_buffer + 0, width, filter);
+ d[1] = WienerVerticalTap7Kernel2(wiener_buffer + 8, width, filter);
+ vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0]));
+ vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1]));
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ int16x8_t a[7];
+ const uint8x8_t d0 =
+ WienerVerticalTap7Kernel(wiener_buffer + 0, width, filter, a);
+ const uint8x8_t d1 =
+ WienerVerticalTap7Kernel(wiener_buffer + 8, width, filter, a);
+ vst1q_u8(dst, vcombine_u8(d0, d1));
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
+inline uint8x8_t WienerVerticalTap5Kernel(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4],
+ int16x8_t a[5]) {
+ a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+ a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+ a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+ a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+ a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride);
+ int32x4x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[0], a[4], filter[1], sum);
+ return WienerVertical(a + 1, filter, sum);
+}
+
+inline uint8x8x2_t WienerVerticalTap5Kernel2(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4]) {
+ int16x8_t a[6];
+ int32x4x2_t sum;
+ uint8x8x2_t d;
+ d.val[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ sum = WienerVertical2(a[1], a[5], filter[1], sum);
+ d.val[1] = WienerVertical(a + 2, filter, sum);
+ return d;
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y != 0; --y) {
+ uint8_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ uint8x8x2_t d[2];
+ d[0] = WienerVerticalTap5Kernel2(wiener_buffer + 0, width, filter);
+ d[1] = WienerVerticalTap5Kernel2(wiener_buffer + 8, width, filter);
+ vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0]));
+ vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1]));
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ int16x8_t a[5];
+ const uint8x8_t d0 =
+ WienerVerticalTap5Kernel(wiener_buffer + 0, width, filter, a);
+ const uint8x8_t d1 =
+ WienerVerticalTap5Kernel(wiener_buffer + 8, width, filter, a);
+ vst1q_u8(dst, vcombine_u8(d0, d1));
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
+inline uint8x8_t WienerVerticalTap3Kernel(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4],
+ int16x8_t a[3]) {
+ a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride);
+ a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride);
+ a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride);
+ int32x4x2_t sum;
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ return WienerVertical(a, filter, sum);
+}
+
+inline uint8x8x2_t WienerVerticalTap3Kernel2(const int16_t* const wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const int16_t filter[4]) {
+ int16x8_t a[4];
+ int32x4x2_t sum;
+ uint8x8x2_t d;
+ d.val[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride);
+ sum.val[0] = sum.val[1] = vdupq_n_s32(0);
+ d.val[1] = WienerVertical(a + 1, filter, sum);
+ return d;
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t filter[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y != 0; --y) {
+ uint8_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ uint8x8x2_t d[2];
+ d[0] = WienerVerticalTap3Kernel2(wiener_buffer + 0, width, filter);
+ d[1] = WienerVerticalTap3Kernel2(wiener_buffer + 8, width, filter);
+ vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0]));
+ vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1]));
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ int16x8_t a[3];
+ const uint8x8_t d0 =
+ WienerVerticalTap3Kernel(wiener_buffer + 0, width, filter, a);
+ const uint8x8_t d1 =
+ WienerVerticalTap3Kernel(wiener_buffer + 8, width, filter, a);
+ vst1q_u8(dst, vcombine_u8(d0, d1));
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint8_t* const dst) {
+ const int16x8_t a0 = vld1q_s16(wiener_buffer + 0);
+ const int16x8_t a1 = vld1q_s16(wiener_buffer + 8);
+ const uint8x8_t d0 = vqrshrun_n_s16(a0, 4);
+ const uint8x8_t d1 = vqrshrun_n_s16(a1, 4);
+ vst1q_u8(dst, vcombine_u8(d0, d1));
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y != 0; --y) {
+ uint8_t* dst_ptr = dst;
+ ptrdiff_t x = width;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer, dst_ptr);
+ WienerVerticalTap1Kernel(wiener_buffer + width, dst_ptr + dst_stride);
+ wiener_buffer += 16;
+ dst_ptr += 16;
+ x -= 16;
+ } while (x != 0);
+ wiener_buffer += width;
+ dst += 2 * dst_stride;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = width;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer, dst);
+ wiener_buffer += 16;
+ dst += 16;
+ x -= 16;
+ } while (x != 0);
+ }
+}
+
+// For width 16 and up, store the horizontal results, and then do the vertical
+// filter row by row. This is faster than doing it column by column because of
+// cache behavior.
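+// The horizontal results for all rows are buffered in |wiener_buffer|, so
+// each vertical output row then reads a handful of consecutive, contiguous
+// buffer rows.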
+void WienerFilter_NEON(
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+ int16_t filter_horizontal[(kWienerFilterTaps + 1) / 2];
+ int16_t filter_vertical[(kWienerFilterTaps + 1) / 2];
+ PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal,
+ filter_horizontal);
+ PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical,
+ filter_vertical);
+
+ // horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* const top = static_cast<const uint8_t*>(top_border);
+ const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
+ }
+
+ // vertical filtering.
+ // Over-writes up to 15 values.
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+ // Because the top row of |source| is a duplicate of the second row, and
+ // the bottom row of |source| is a duplicate of the row above it, we can
+ // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 2;
+constexpr int kOverreadInBytesPass2 = 4;
+
+// SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kWideOverreadInBytesPass1 = 10;
+constexpr int kWideOverreadInBytesPass2 = 12;
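+// In each case the constant is (load width) - 2 * padding: 8 - 6 = 2 and
+// 8 - 4 = 4 for the 8-wide loads, and 16 - 6 = 10 and 16 - 4 = 12 for the
+// 16-wide loads.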
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ uint16x8_t dst[2]) {
+ dst[0] = vld1q_u16(src[0] + x);
+ dst[1] = vld1q_u16(src[1] + x);
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ uint16x8_t dst[3]) {
+ dst[0] = vld1q_u16(src[0] + x);
+ dst[1] = vld1q_u16(src[1] + x);
+ dst[2] = vld1q_u16(src[2] + x);
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, uint32x4x2_t* dst) {
+ (*dst).val[0] = vld1q_u32(src + 0);
+ (*dst).val[1] = vld1q_u32(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ uint32x4x2_t dst[2]) {
+ LoadAligned32U32(src[0] + x, &dst[0]);
+ LoadAligned32U32(src[1] + x, &dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ uint32x4x2_t dst[3]) {
+ LoadAligned32U32(src[0] + x, &dst[0]);
+ LoadAligned32U32(src[1] + x, &dst[1]);
+ LoadAligned32U32(src[2] + x, &dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const uint16x8_t src[2]) {
+ vst1q_u16(dst + 0, src[0]);
+ vst1q_u16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const uint32x4x2_t src) {
+ vst1q_u32(dst + 0, src.val[0]);
+ vst1q_u32(dst + 4, src.val[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const uint32x4x2_t src[2]) {
+ vst1q_u32(dst + 0, src[0].val[0]);
+ vst1q_u32(dst + 4, src[0].val[1]);
+ vst1q_u32(dst + 8, src[1].val[0]);
+ vst1q_u32(dst + 12, src[1].val[1]);
+}
+
+inline uint16x8_t SquareLo8(const uint8x8_t src) { return vmull_u8(src, src); }
+
+inline uint16x8_t SquareLo8(const uint8x16_t src) {
+ return vmull_u8(vget_low_u8(src), vget_low_u8(src));
+}
+
+inline uint16x8_t SquareHi8(const uint8x16_t src) {
+ return vmull_u8(vget_high_u8(src), vget_high_u8(src));
+}
+
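+// The Prepare*_8() and Prepare*_16() helpers build the shifted source windows
+// for the box sums, assuming VshrU128<k>() extracts the vector starting k
+// bytes into the concatenated |src[0]|:|src[1]| pair.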
+inline void Prepare3_8(const uint8x8_t src[2], uint8x8_t dst[3]) {
+ dst[0] = VshrU128<0>(src);
+ dst[1] = VshrU128<1>(src);
+ dst[2] = VshrU128<2>(src);
+}
+
+template <int offset>
+inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) {
+ dst[0] = VshrU128<offset + 0>(src);
+ dst[1] = VshrU128<offset + 1>(src);
+ dst[2] = VshrU128<offset + 2>(src);
+}
+
+inline void Prepare3_16(const uint16x8_t src[2], uint16x4_t low[3],
+ uint16x4_t high[3]) {
+ uint16x8_t s[3];
+ s[0] = VshrU128<0>(src);
+ s[1] = VshrU128<2>(src);
+ s[2] = VshrU128<4>(src);
+ low[0] = vget_low_u16(s[0]);
+ low[1] = vget_low_u16(s[1]);
+ low[2] = vget_low_u16(s[2]);
+ high[0] = vget_high_u16(s[0]);
+ high[1] = vget_high_u16(s[1]);
+ high[2] = vget_high_u16(s[2]);
+}
+
+inline void Prepare5_8(const uint8x8_t src[2], uint8x8_t dst[5]) {
+ dst[0] = VshrU128<0>(src);
+ dst[1] = VshrU128<1>(src);
+ dst[2] = VshrU128<2>(src);
+ dst[3] = VshrU128<3>(src);
+ dst[4] = VshrU128<4>(src);
+}
+
+template <int offset>
+inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) {
+ dst[0] = VshrU128<offset + 0>(src);
+ dst[1] = VshrU128<offset + 1>(src);
+ dst[2] = VshrU128<offset + 2>(src);
+ dst[3] = VshrU128<offset + 3>(src);
+ dst[4] = VshrU128<offset + 4>(src);
+}
+
+inline void Prepare5_16(const uint16x8_t src[2], uint16x4_t low[5],
+ uint16x4_t high[5]) {
+ Prepare3_16(src, low, high);
+ const uint16x8_t s3 = VshrU128<6>(src);
+ const uint16x8_t s4 = VshrU128<8>(src);
+ low[3] = vget_low_u16(s3);
+ low[4] = vget_low_u16(s4);
+ high[3] = vget_high_u16(s3);
+ high[4] = vget_high_u16(s4);
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src0, const uint16x8_t src1,
+ const uint16x8_t src2) {
+ const uint16x8_t sum = vaddq_u16(src0, src1);
+ return vaddq_u16(sum, src2);
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline uint32x4_t Sum3_32(const uint32x4_t src0, const uint32x4_t src1,
+ const uint32x4_t src2) {
+ const uint32x4_t sum = vaddq_u32(src0, src1);
+ return vaddq_u32(sum, src2);
+}
+
+inline uint32x4x2_t Sum3_32(const uint32x4x2_t src[3]) {
+ uint32x4x2_t d;
+ d.val[0] = Sum3_32(src[0].val[0], src[1].val[0], src[2].val[0]);
+ d.val[1] = Sum3_32(src[0].val[1], src[1].val[1], src[2].val[1]);
+ return d;
+}
+
+inline uint16x8_t Sum3W_16(const uint8x8_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(src[0], src[1]);
+ return vaddw_u8(sum, src[2]);
+}
+
+inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+ return vaddw_u8(sum, vget_low_u8(src[2]));
+}
+
+inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+ return vaddw_u8(sum, vget_high_u8(src[2]));
+}
+
+inline uint16x8_t Sum5WLo16(const uint8x16_t src[5]) {
+ const uint16x8_t sum01 = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+ const uint16x8_t sum23 = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[3]));
+ const uint16x8_t sum = vaddq_u16(sum01, sum23);
+ return vaddw_u8(sum, vget_low_u8(src[4]));
+}
+
+inline uint16x8_t Sum5WHi16(const uint8x16_t src[5]) {
+ const uint16x8_t sum01 = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+ const uint16x8_t sum23 = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[3]));
+ const uint16x8_t sum = vaddq_u16(sum01, sum23);
+ return vaddw_u8(sum, vget_high_u8(src[4]));
+}
+
+inline uint32x4_t Sum3W_32(const uint16x4_t src[3]) {
+ const uint32x4_t sum = vaddl_u16(src[0], src[1]);
+ return vaddw_u16(sum, src[2]);
+}
+
+inline uint16x8_t Sum5_16(const uint16x8_t src[5]) {
+ const uint16x8_t sum01 = vaddq_u16(src[0], src[1]);
+ const uint16x8_t sum23 = vaddq_u16(src[2], src[3]);
+ const uint16x8_t sum = vaddq_u16(sum01, sum23);
+ return vaddq_u16(sum, src[4]);
+}
+
+inline uint32x4_t Sum5_32(const uint32x4_t src0, const uint32x4_t src1,
+ const uint32x4_t src2, const uint32x4_t src3,
+ const uint32x4_t src4) {
+ const uint32x4_t sum01 = vaddq_u32(src0, src1);
+ const uint32x4_t sum23 = vaddq_u32(src2, src3);
+ const uint32x4_t sum = vaddq_u32(sum01, sum23);
+ return vaddq_u32(sum, src4);
+}
+
+inline uint32x4x2_t Sum5_32(const uint32x4x2_t src[5]) {
+ uint32x4x2_t d;
+ d.val[0] = Sum5_32(src[0].val[0], src[1].val[0], src[2].val[0], src[3].val[0],
+ src[4].val[0]);
+ d.val[1] = Sum5_32(src[0].val[1], src[1].val[1], src[2].val[1], src[3].val[1],
+ src[4].val[1]);
+ return d;
+}
+
+inline uint32x4_t Sum5W_32(const uint16x4_t src[5]) {
+ const uint32x4_t sum01 = vaddl_u16(src[0], src[1]);
+ const uint32x4_t sum23 = vaddl_u16(src[2], src[3]);
+ const uint32x4_t sum0123 = vaddq_u32(sum01, sum23);
+ return vaddw_u16(sum0123, src[4]);
+}
+
+inline uint16x8_t Sum3Horizontal(const uint8x8_t src[2]) {
+ uint8x8_t s[3];
+ Prepare3_8(src, s);
+ return Sum3W_16(s);
+}
+
+inline uint16x8_t Sum3Horizontal(const uint8x16_t src) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return Sum3Horizontal(s);
+}
+
+template <int offset>
+inline void Sum3Horizontal(const uint8x16_t src[2], uint16x8_t dst[2]) {
+ uint8x16_t s[3];
+ Prepare3_8<offset>(src, s);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
+}
+
+inline uint32x4x2_t Sum3WHorizontal(const uint16x8_t src[2]) {
+ uint16x4_t low[3], high[3];
+ uint32x4x2_t sum;
+ Prepare3_16(src, low, high);
+ sum.val[0] = Sum3W_32(low);
+ sum.val[1] = Sum3W_32(high);
+ return sum;
+}
+
+inline uint16x8_t Sum5Horizontal(const uint8x8_t src[2]) {
+ uint8x8_t s[5];
+ Prepare5_8(src, s);
+ const uint16x8_t sum01 = vaddl_u8(s[0], s[1]);
+ const uint16x8_t sum23 = vaddl_u8(s[2], s[3]);
+ const uint16x8_t sum0123 = vaddq_u16(sum01, sum23);
+ return vaddw_u8(sum0123, s[4]);
+}
+
+inline uint16x8_t Sum5Horizontal(const uint8x16_t src) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return Sum5Horizontal(s);
+}
+
+template <int offset>
+inline void Sum5Horizontal(const uint8x16_t src[2], uint16x8_t* const dst0,
+ uint16x8_t* const dst1) {
+ uint8x16_t s[5];
+ Prepare5_8<offset>(src, s);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
+}
+
+inline uint32x4x2_t Sum5WHorizontal(const uint16x8_t src[2]) {
+ uint16x4_t low[5], high[5];
+ Prepare5_16(src, low, high);
+ uint32x4x2_t sum;
+ sum.val[0] = Sum5W_32(low);
+ sum.val[1] = Sum5W_32(high);
+ return sum;
+}
+
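+// SumHorizontal() computes the 3-tap and 5-tap box sums together; the 5-tap
+// sum reuses the 3-tap sum, since
+// s0 + s1 + s2 + s3 + s4 == (s0 + s4) + (s1 + s2 + s3).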
+template <int offset>
+void SumHorizontal(const uint8x16_t src[2], uint16x8_t* const row3_0,
+ uint16x8_t* const row3_1, uint16x8_t* const row5_0,
+ uint16x8_t* const row5_1) {
+ uint8x16_t s[5];
+ Prepare5_8<offset>(src, s);
+ const uint16x8_t sum04_lo = vaddl_u8(vget_low_u8(s[0]), vget_low_u8(s[4]));
+ const uint16x8_t sum04_hi = vaddl_u8(vget_high_u8(s[0]), vget_high_u8(s[4]));
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = vaddq_u16(sum04_lo, *row3_0);
+ *row5_1 = vaddq_u16(sum04_hi, *row3_1);
+}
+
+void SumHorizontal(const uint8x8_t src[2], uint16x8_t* const row3,
+ uint16x8_t* const row5) {
+ uint8x8_t s[5];
+ Prepare5_8(src, s);
+ const uint16x8_t sum04 = vaddl_u8(s[0], s[4]);
+ const uint16x8_t sum12 = vaddl_u8(s[1], s[2]);
+ *row3 = vaddw_u8(sum12, s[3]);
+ *row5 = vaddq_u16(sum04, *row3);
+}
+
+void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3,
+ uint32x4_t* const row_sq5) {
+ const uint32x4_t sum04 = vaddl_u16(src[0], src[4]);
+ const uint32x4_t sum12 = vaddl_u16(src[1], src[2]);
+ *row_sq3 = vaddw_u16(sum12, src[3]);
+ *row_sq5 = vaddq_u32(sum04, *row_sq3);
+}
+
+void SumHorizontal(const uint16x8_t sq[2], uint32x4x2_t* const row_sq3,
+ uint32x4x2_t* const row_sq5) {
+ uint16x4_t low[5], high[5];
+ Prepare5_16(sq, low, high);
+ SumHorizontal(low, &row_sq3->val[0], &row_sq5->val[0]);
+ SumHorizontal(high, &row_sq3->val[1], &row_sq5->val[1]);
+}
+
+void SumHorizontal(const uint8x8_t src[2], const uint16x8_t sq[2],
+ uint16x8_t* const row3, uint16x8_t* const row5,
+ uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+ SumHorizontal(src, row3, row5);
+ SumHorizontal(sq, row_sq3, row_sq5);
+}
+
+void SumHorizontal(const uint8x16_t src, const uint16x8_t sq[2],
+ uint16x8_t* const row3, uint16x8_t* const row5,
+ uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return SumHorizontal(s, sq, row3, row5, row_sq3, row_sq5);
+}
+
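+// "343" cross-row weights: 3 * (a + b + c) + b == 3*a + 4*b + 3*c.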
+template <int offset>
+inline uint16x8_t Sum343(const uint8x16_t ma3[3]) {
+ const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3);
+ const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
+ return vaddw_u8(sum3,
+ (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1]));
+}
+
+inline uint32x4_t Sum343W(const uint16x4_t src[3]) {
+ const uint32x4_t sum = Sum3W_32(src);
+ const uint32x4_t sum3 = Sum3_32(sum, sum, sum);
+ return vaddw_u16(sum3, src[1]);
+}
+
+inline uint32x4x2_t Sum343W(const uint16x8_t src[2]) {
+ uint16x4_t low[3], high[3];
+ uint32x4x2_t d;
+ Prepare3_16(src, low, high);
+ d.val[0] = Sum343W(low);
+ d.val[1] = Sum343W(high);
+ return d;
+}
+
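+// "565" cross-row weights: 5 * (a + b + c) + b == 5*a + 6*b + 5*c, with
+// 5 * sum formed as (sum << 2) + sum.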
+template <int offset>
+inline uint16x8_t Sum565(const uint8x16_t ma5[3]) {
+ const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma5) : Sum3WHi16(ma5);
+ const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
+ const uint16x8_t sum5 = vaddq_u16(sum4, sum);
+ return vaddw_u8(sum5,
+ (offset == 0) ? vget_low_u8(ma5[1]) : vget_high_u8(ma5[1]));
+}
+
+inline uint32x4_t Sum565W(const uint16x4_t src[3]) {
+ const uint32x4_t sum = Sum3W_32(src);
+ const uint32x4_t sum4 = vshlq_n_u32(sum, 2);
+ const uint32x4_t sum5 = vaddq_u32(sum4, sum);
+ return vaddw_u16(sum5, src[1]);
+}
+
+inline uint32x4x2_t Sum565W(const uint16x8_t src[2]) {
+ uint16x4_t low[3], high[3];
+ uint32x4x2_t d;
+ Prepare3_16(src, low, high);
+ d.val[0] = Sum565W(low);
+ d.val[1] = Sum565W(high);
+ return d;
+}
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ const ptrdiff_t overread_in_bytes = kOverreadInBytesPass1 - width;
+ int y = 2;
+ // Don't change the loop width to 16, which is even slower.
+ do {
+ uint8x8_t s[2];
+ uint16x8_t sq[2];
+ s[0] = Load1MsanU8(src, overread_in_bytes);
+ sq[0] = SquareLo8(s[0]);
+ ptrdiff_t x = sum_width;
+ do {
+ uint16x8_t row3, row5;
+ uint32x4x2_t row_sq3, row_sq5;
+ x -= 8;
+ src += 8;
+ s[1] = Load1MsanU8(src, sum_width - x + overread_in_bytes);
+ sq[1] = SquareLo8(s[1]);
+ SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5);
+ vst1q_u16(sum3, row3);
+ vst1q_u16(sum5, row5);
+ StoreAligned32U32(square_sum3 + 0, row_sq3);
+ StoreAligned32U32(square_sum5 + 0, row_sq5);
+ s[0] = s[1];
+ sq[0] = sq[1];
+ sum3 += 8;
+ sum5 += 8;
+ square_sum3 += 8;
+ square_sum5 += 8;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sum3 += sum_stride - sum_width;
+ sum5 += sum_stride - sum_width;
+ square_sum3 += sum_stride - sum_width;
+ square_sum5 += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ const ptrdiff_t overread_in_bytes =
+ ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) -
+ sizeof(*src) * width;
+ int y = 2;
+ // Don't change the loop width to 16, which is even slower.
+ do {
+ uint8x8_t s[2];
+ uint16x8_t sq[2];
+ s[0] = Load1MsanU8(src, overread_in_bytes);
+ sq[0] = SquareLo8(s[0]);
+ ptrdiff_t x = sum_width;
+ do {
+ uint16x8_t row;
+ uint32x4x2_t row_sq;
+ x -= 8;
+ src += 8;
+ s[1] = Load1MsanU8(src, sum_width - x + overread_in_bytes);
+ sq[1] = SquareLo8(s[1]);
+ if (size == 3) {
+ row = Sum3Horizontal(s);
+ row_sq = Sum3WHorizontal(sq);
+ } else {
+ row = Sum5Horizontal(s);
+ row_sq = Sum5WHorizontal(sq);
+ }
+ vst1q_u16(sums, row);
+ StoreAligned32U32(square_sums, row_sq);
+ s[0] = s[1];
+ sq[0] = sq[1];
+ sums += 8;
+ square_sums += 8;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sums += sum_stride - sum_width;
+ square_sums += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int n>
+inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
+ const uint32_t scale) {
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
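+ // p is n^2 times the box variance: n * sum_sq - sum * sum. For a flat 3x3
+ // block of value 16 (n == 9): a = 9 * 256 = 2304, d = 144, so
+ // a * n == d * d == 20736 and p == 0.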
+ const uint32x4_t dxd = vmull_u16(sum, sum);
+ const uint32x4_t axn = vmulq_n_u32(sum_sq, n);
+ // Ensure |p| does not underflow by using saturating subtraction.
+ const uint32x4_t p = vqsubq_u32(axn, dxd);
+ const uint32x4_t pxs = vmulq_n_u32(p, scale);
+ // vrshrn_n_u32() (rounding narrowing shift) can shift by at most 16, and
+ // kSgrProjScaleBits is 20, so shift in place and narrow separately.
+ const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits);
+ return vmovn_u32(shifted);
+}
+
+inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index,
+ const int threshold) {
+ const uint8x8_t thresholds = vdup_n_u8(threshold);
+ const uint8x8_t offset = vcgt_u8(index, thresholds);
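+ // |offset| is all ones (255) in lanes where index > threshold and 0
+ // elsewhere.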
+ // Adding 255 is equivalent to subtracting 1 for 8-bit data.
+ return vadd_u8(value, offset);
+}
+
+template <int n, int offset>
+inline void CalculateIntermediate(const uint16x8_t sum,
+ const uint32x4x2_t sum_sq,
+ const uint32_t scale, uint8x16_t* const ma,
+ uint16x8_t* const b) {
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
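+ // With kSgrProjReciprocalBits == 12: n == 25 gives (4096 + 12) / 25 = 164,
+ // and n == 9 gives (4096 + 4) / 9 = 455 (see the range analysis below).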
+ const uint16x4_t z0 = CalculateMa<n>(vget_low_u16(sum), sum_sq.val[0], scale);
+ const uint16x4_t z1 =
+ CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale);
+ const uint16x8_t z01 = vcombine_u16(z0, z1);
+ const uint8x8_t idx = vqmovn_u16(z01);
+ // Use table lookup to read elements whose indices are less than 48.
+ // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than
+ // using two uint8x8x3_t vectors.
+ uint8x8x4_t table0;
+ uint8x8x2_t table1;
+ table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8);
+ table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8);
+ table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8);
+ table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8);
+ table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8);
+ table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8);
+ // vtbl zeroes out-of-range lanes, so elements whose indices fall outside
+ // [0, 47] end up 0.
+ uint8x8_t val = vtbl4_u8(table0, idx); // Range [0, 31].
+ // Subtract 32 to shift the indices into the range of the second table.
+ const uint8x8_t index = vsub_u8(idx, vdup_n_u8(32));
+ const uint8x8_t res = vtbl2_u8(table1, index); // Range [32, 47].
+ // Use OR instruction to combine shuffle results together.
+ val = vorr_u8(val, res);
+
+ // For elements whose indices are larger than 47, the values change only
+ // rarely as the index increases, so we calculate them with comparisons and
+ // arithmetic instead of more table lookups.
+ // The lookups produced 0 for indices larger than 47; raise those lanes to 5.
+ val = vmax_u8(val, vdup_n_u8(5));
+ val = AdjustValue(val, idx, 55);   // 55 is the last index whose value is 5.
+ val = AdjustValue(val, idx, 72);   // 72 is the last index whose value is 4.
+ val = AdjustValue(val, idx, 101);  // 101 is the last index whose value is 3.
+ val = AdjustValue(val, idx, 169);  // 169 is the last index whose value is 2.
+ val = AdjustValue(val, idx, 254);  // 254 is the last index whose value is 1.
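+ // Worked example: idx == 100 starts at 5; the 55 and 72 thresholds each
+ // subtract 1, giving 3, consistent with 101 being the last index whose
+ // value is 3.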
+ // offset == 0 is assumed to be the first call to this function.
+ // vget_high_u8(*ma) is not used in that case, which avoids a -Wuninitialized
+ // warning with some versions of gcc. vdup_n_u8(0) would work as well, but in
+ // most cases clang and gcc generate better code with this version.
+ *ma = (offset == 0) ? vcombine_u8(val, val)
+ : vcombine_u8(vget_low_u8(*ma), val);
+
+ // b = (ma * sum * one_over_n) >> kSgrProjReciprocalBits
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass the radius is 2; the maximum value is 5x5x255 = 6375.
+ // For the second pass the radius is 1; the maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const uint16x8_t maq =
+ vmovl_u8((offset == 0) ? vget_low_u8(*ma) : vget_high_u8(*ma));
+ const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum));
+ const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum));
+ const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
+ const uint32x4_t m3 = vmulq_n_u32(m1, one_over_n);
+ const uint16x4_t b_lo = vrshrn_n_u32(m2, kSgrProjReciprocalBits);
+ const uint16x4_t b_hi = vrshrn_n_u32(m3, kSgrProjReciprocalBits);
+ *b = vcombine_u16(b_lo, b_hi);
+}
+
+template <int offset>
+inline void CalculateIntermediate5(const uint16x8_t s5[5],
+ const uint32x4x2_t sq5[5],
+ const uint32_t scale, uint8x16_t* const ma,
+ uint16x8_t* const b) {
+ const uint16x8_t sum = Sum5_16(s5);
+ const uint32x4x2_t sum_sq = Sum5_32(sq5);
+ CalculateIntermediate<25, offset>(sum, sum_sq, scale, ma, b);
+}
+
+template <int offset>
+inline void CalculateIntermediate3(const uint16x8_t s3[3],
+ const uint32x4x2_t sq3[3],
+ const uint32_t scale, uint8x16_t* const ma,
+ uint16x8_t* const b) {
+ const uint16x8_t sum = Sum3_16(s3);
+ const uint32x4x2_t sum_sq = Sum3_32(sq3);
+ CalculateIntermediate<9, offset>(sum, sum_sq, scale, ma, b);
+}
+
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
+ const ptrdiff_t x, uint16x8_t* const sum_ma343,
+ uint16x8_t* const sum_ma444,
+ uint32x4x2_t* const sum_b343,
+ uint32x4x2_t* const sum_b444, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const uint16x8_t sum_ma111 = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3);
+ *sum_ma444 = vshlq_n_u16(sum_ma111, 2);
+ const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
+ *sum_ma343 = vaddw_u8(
+ sum333, (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1]));
+ uint16x4_t low[3], high[3];
+ uint32x4x2_t sum_b111;
+ Prepare3_16(b3, low, high);
+ sum_b111.val[0] = Sum3W_32(low);
+ sum_b111.val[1] = Sum3W_32(high);
+ sum_b444->val[0] = vshlq_n_u32(sum_b111.val[0], 2);
+ sum_b444->val[1] = vshlq_n_u32(sum_b111.val[1], 2);
+ sum_b343->val[0] = vsubq_u32(sum_b444->val[0], sum_b111.val[0]);
+ sum_b343->val[1] = vsubq_u32(sum_b444->val[1], sum_b111.val[1]);
+ sum_b343->val[0] = vaddw_u16(sum_b343->val[0], low[1]);
+ sum_b343->val[1] = vaddw_u16(sum_b343->val[1], high[1]);
+ vst1q_u16(ma343 + x, *sum_ma343);
+ vst1q_u16(ma444 + x, *sum_ma444);
+ vst1q_u32(b343 + x + 0, sum_b343->val[0]);
+ vst1q_u32(b343 + x + 4, sum_b343->val[1]);
+ vst1q_u32(b444 + x + 0, sum_b444->val[0]);
+ vst1q_u32(b444 + x + 4, sum_b444->val[1]);
+}
+
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
+ const ptrdiff_t x, uint16x8_t* const sum_ma343,
+ uint32x4x2_t* const sum_b343, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ uint16x8_t sum_ma444;
+ uint32x4x2_t sum_b444;
+ Store343_444<offset>(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444,
+ ma343, ma444, b343, b444);
+}
+
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ uint16x8_t sum_ma343;
+ uint32x4x2_t sum_b343;
+ Store343_444<offset>(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343,
+ b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ uint8x16_t s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t* const ma,
+ uint16x8_t* const b) {
+ uint16x8_t s5[5];
+ uint32x4x2_t sq5[5];
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ s5[3] = Sum5Horizontal(s[0][0]);
+ s5[4] = Sum5Horizontal(s[1][0]);
+ sq5[3] = Sum5WHorizontal(sq[0]);
+ sq5[4] = Sum5WHorizontal(sq[1]);
+ vst1q_u16(sum5[3], s5[3]);
+ vst1q_u16(sum5[4], s5[4]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ uint8x16_t s[2][2], const ptrdiff_t x, const uint32_t scale,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5],
+ uint16x8_t sq[2][4], uint8x16_t ma[2], uint16x8_t b[2]) {
+ uint16x8_t s5[2][5];
+ uint32x4x2_t sq5[5];
+ sq[0][2] = SquareLo8(s[0][1]);
+ sq[1][2] = SquareLo8(s[1][1]);
+ Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+ Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+ sq5[3] = Sum5WHorizontal(sq[0] + 1);
+ sq5[4] = Sum5WHorizontal(sq[1] + 1);
+ vst1q_u16(sum5[3] + x, s5[0][3]);
+ vst1q_u16(sum5[4] + x, s5[0][4]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+ sq[0][3] = SquareHi8(s[0][1]);
+ sq[1][3] = SquareHi8(s[1][1]);
+ sq5[3] = Sum5WHorizontal(sq[0] + 2);
+ sq5[4] = Sum5WHorizontal(sq[1] + 2);
+ vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+ vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x3U16(sum5, x + 8, s5[1]);
+ LoadAligned32x3U32(square_sum5, x + 8, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ uint8x16_t* const s, const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], uint16x8_t sq[2],
+ uint8x16_t* const ma, uint16x8_t* const b) {
+ uint16x8_t s5[5];
+ uint32x4x2_t sq5[5];
+ sq[0] = SquareLo8(s[0]);
+ sq[1] = SquareHi8(s[0]);
+ s5[3] = s5[4] = Sum5Horizontal(*s);
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ uint8x16_t s[2], const ptrdiff_t x, const uint32_t scale,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ uint16x8_t sq[3], uint8x16_t ma[2], uint16x8_t b[2]) {
+ uint16x8_t s5[2][5];
+ uint32x4x2_t sq5[5];
+ sq[1] = SquareLo8(s[1]);
+ Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+ sq[2] = SquareHi8(s[1]);
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq + 1);
+ LoadAligned16x3U16(sum5, x + 8, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned32x3U32(square_sum5, x + 8, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ uint8x16_t* const s, const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], uint16x8_t sq[2], uint8x16_t* const ma,
+ uint16x8_t* const b) {
+ uint16x8_t s3[3];
+ uint32x4x2_t sq3[3];
+ sq[0] = SquareLo8(*s);
+ sq[1] = SquareHi8(*s);
+ s3[2] = Sum3Horizontal(*s);
+ sq3[2] = Sum3WHorizontal(sq);
+ vst1q_u16(sum3[2], s3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3<0>(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ uint8x16_t s[2], const ptrdiff_t x, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16x8_t sq[3],
+ uint8x16_t ma[2], uint16x8_t b[2]) {
+ uint16x8_t s3[4];
+ uint32x4x2_t sq3[3];
+ sq[1] = SquareLo8(s[1]);
+ Sum3Horizontal<8>(s, s3 + 2);
+ sq3[2] = Sum3WHorizontal(sq);
+ vst1q_u16(sum3[2] + x, s3[2]);
+ StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+ LoadAligned16x2U16(sum3, x, s3);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateIntermediate3<8>(s3, sq3, scale, &ma[0], &b[0]);
+
+ sq[2] = SquareHi8(s[1]);
+ sq3[2] = Sum3WHorizontal(sq + 1);
+ vst1q_u16(sum3[2] + x + 8, s3[3]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ LoadAligned16x2U16(sum3, x + 8, s3 + 1);
+ LoadAligned32x2U32(square_sum3, x + 8, sq3);
+ CalculateIntermediate3<0>(s3 + 1, sq3, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ uint8x16_t s[2][2], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2],
+ uint16x8_t b3[2][3], uint8x16_t* const ma5, uint16x8_t* const b5) {
+ uint16x8_t s3[4], s5[5];
+ uint32x4x2_t sq3[4], sq5[5];
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ SumHorizontal(s[0][0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
+ SumHorizontal(s[1][0], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2], s3[2]);
+ vst1q_u16(sum3[3], s3[3]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ vst1q_u16(sum5[3], s5[3]);
+ vst1q_u16(sum5[4], s5[4]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate3<0>(s3, sq3, scales[1], ma3[0], b3[0]);
+ CalculateIntermediate3<0>(s3 + 1, sq3 + 1, scales[1], ma3[1], b3[1]);
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const uint8x16_t s[2][2], const ptrdiff_t x, const uint16_t scales[2],
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16x8_t sq[2][4], uint8x16_t ma3[2][2], uint16x8_t b3[2][3],
+ uint8x16_t ma5[2], uint16x8_t b5[2]) {
+ uint16x8_t s3[2][4], s5[2][5];
+ uint32x4x2_t sq3[4], sq5[5];
+ sq[0][2] = SquareLo8(s[0][1]);
+ sq[1][2] = SquareLo8(s[1][1]);
+ SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+ SumHorizontal(sq[0] + 1, &sq3[2], &sq5[3]);
+ SumHorizontal(sq[1] + 1, &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2] + x, s3[0][2]);
+ vst1q_u16(sum3[3] + x, s3[0][3]);
+ StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+ StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+ vst1q_u16(sum5[3] + x, s5[0][3]);
+ vst1q_u16(sum5[4] + x, s5[0][4]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0][0], &b3[0][1]);
+ CalculateIntermediate3<8>(s3[0] + 1, sq3 + 1, scales[1], &ma3[1][0],
+ &b3[1][1]);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
+
+ sq[0][3] = SquareHi8(s[0][1]);
+ sq[1][3] = SquareHi8(s[1][1]);
+ SumHorizontal(sq[0] + 2, &sq3[2], &sq5[3]);
+ SumHorizontal(sq[1] + 2, &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2] + x + 8, s3[1][2]);
+ vst1q_u16(sum3[3] + x + 8, s3[1][3]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+ vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+ vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x2U16(sum3, x + 8, s3[1]);
+ LoadAligned32x2U32(square_sum3, x + 8, sq3);
+ LoadAligned16x3U16(sum5, x + 8, s5[1]);
+ LoadAligned32x3U32(square_sum5, x + 8, sq5);
+ CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[0][1], &b3[0][2]);
+ CalculateIntermediate3<0>(s3[1] + 1, sq3 + 1, scales[1], &ma3[1][1],
+ &b3[1][2]);
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ uint8x16_t* const s, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ uint16x8_t sq[2], uint8x16_t* const ma3, uint8x16_t* const ma5,
+ uint16x8_t* const b3, uint16x8_t* const b5) {
+ uint16x8_t s3[3], s5[5];
+ uint32x4x2_t sq3[3], sq5[5];
+ sq[0] = SquareLo8(s[0]);
+ sq[1] = SquareHi8(s[0]);
+ SumHorizontal(*s, sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4] = sq5[3];
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3<0>(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ uint8x16_t s[2], const ptrdiff_t x, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ uint16x8_t sq[3], uint8x16_t ma3[2], uint8x16_t ma5[2], uint16x8_t b3[2],
+ uint16x8_t b5[2]) {
+ uint16x8_t s3[2][3], s5[2][5];
+ uint32x4x2_t sq3[3], sq5[5];
+ sq[1] = SquareLo8(s[1]);
+ SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal(sq, &sq3[2], &sq5[3]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ sq5[4] = sq5[3];
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0], &b3[0]);
+
+ sq[2] = SquareHi8(s[1]);
+ SumHorizontal(sq + 1, &sq3[2], &sq5[3]);
+ LoadAligned16x3U16(sum5, x + 8, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned32x3U32(square_sum5, x + 8, sq5);
+ sq5[4] = sq5[3];
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
+ LoadAligned16x2U16(sum3, x + 8, s3[1]);
+ LoadAligned32x2U32(square_sum3, x + 8, sq3);
+ CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[1], &b3[1]);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+ const uint8_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ uint16_t* ma565, uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
+ uint8x16_t s[2][2], mas[2];
+ uint16x8_t sq[2][4], bs[3];
+ s[0][0] = vld1q_u8(src0);
+ s[1][0] = vld1q_u8(src1);
+
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ uint16x8_t ma[2];
+ uint8x16_t masx[3];
+ uint32x4x2_t b[2];
+ s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+ s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes);
+ BoxFilterPreProcess5(s, x + 8, scale, sum5, square_sum5, sq, mas, bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[0] = Sum565<0>(masx);
+ b[0] = Sum565W(bs);
+ vst1q_u16(ma565, ma[0]);
+ vst1q_u32(b565 + 0, b[0].val[0]);
+ vst1q_u32(b565 + 4, b[0].val[1]);
+
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ vst1q_u16(ma565 + 8, ma[1]);
+ vst1q_u32(b565 + 8, b[1].val[0]);
+ vst1q_u32(b565 + 12, b[1].val[1]);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint8_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343,
+ uint16_t* ma444, uint32_t* b343, uint32_t* b444) {
+ const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass2 - width;
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[3];
+ s[0] = Load1QMsanU8(src, overread_in_bytes);
+ BoxFilterPreProcess3Lo(&s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ uint8x16_t ma3x[3];
+ s[1] = Load1QMsanU8(src + x + 16, x + 16 + overread_in_bytes);
+ BoxFilterPreProcess3(s, x + 8, scale, sum3, square_sum3, sq + 1, mas,
+ bs + 1);
+ Prepare3_8<0>(mas, ma3x);
+ if (calculate444) {
+ Store343_444<0>(ma3x, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444<8>(ma3x, bs + 1, 0, ma343 + 8, ma444 + 8, b343 + 8,
+ b444 + 8);
+ ma444 += 16;
+ b444 += 16;
+ } else {
+ uint16x8_t ma[2];
+ uint32x4x2_t b[2];
+ ma[0] = Sum343<0>(ma3x);
+ b[0] = Sum343W(bs);
+ vst1q_u16(ma343, ma[0]);
+ vst1q_u32(b343 + 0, b[0].val[0]);
+ vst1q_u32(b343 + 4, b[0].val[1]);
+ ma[1] = Sum343<8>(ma3x);
+ b[1] = Sum343W(bs + 1);
+ vst1q_u16(ma343 + 8, ma[1]);
+ vst1q_u32(b343 + 8, b[1].val[0]);
+ vst1q_u32(b343 + 12, b[1].val[1]);
+ }
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma343 += 16;
+ b343 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444, uint16_t* ma565,
+ uint32_t* const b343[4], uint32_t* const b444, uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
+ uint8x16_t s[2][2], ma3[2][2], ma5[2];
+ uint16x8_t sq[2][4], b3[2][3], b5[3];
+ s[0][0] = vld1q_u8(src0);
+ s[1][0] = vld1q_u8(src1);
+
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], &b5[0]);
+
+ int x = 0;
+ do {
+ uint16x8_t ma[2];
+ uint8x16_t ma3x[3], ma5x[3];
+ uint32x4x2_t b[2];
+
+ s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+ s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes);
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sq, ma3, b3, ma5, b5 + 1);
+ Prepare3_8<0>(ma3[0], ma3x);
+ ma[0] = Sum343<0>(ma3x);
+ ma[1] = Sum343<8>(ma3x);
+ StoreAligned32U16(ma343[0] + x, ma);
+ b[0] = Sum343W(b3[0] + 0);
+ b[1] = Sum343W(b3[0] + 1);
+ StoreAligned64U32(b343[0] + x, b);
+ Prepare3_8<0>(ma3[1], ma3x);
+ Store343_444<0>(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444<8>(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0] = Sum565<0>(ma5x);
+ ma[1] = Sum565<8>(ma5x);
+ StoreAligned32U16(ma565, ma);
+ b[0] = Sum565W(b5);
+ b[1] = Sum565W(b5 + 1);
+ StoreAligned64U32(b565, b);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ ma5[0] = ma5[1];
+ b5[0] = b5[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <int shift>
+inline int16x4_t FilterOutput(const uint16x4_t src, const uint16x4_t ma,
+ const uint32x4_t b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const int32x4_t v = vreinterpretq_s32_u32(vmlsl_u16(b, ma, src));
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return vrshrn_n_s32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline int16x8_t CalculateFilteredOutput(const uint8x8_t src,
+ const uint16x8_t ma,
+ const uint32x4x2_t b) {
+ const uint16x8_t src_u16 = vmovl_u8(src);
+ const int16x4_t dst_lo =
+ FilterOutput<shift>(vget_low_u16(src_u16), vget_low_u16(ma), b.val[0]);
+ const int16x4_t dst_hi =
+ FilterOutput<shift>(vget_high_u16(src_u16), vget_high_u16(ma), b.val[1]);
+ return vcombine_s16(dst_lo, dst_hi); // 13 bits
+}
+
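+// Pass 1 filters rows in pairs: the first row of a pair sums the 565 filters
+// of two row groups, for a total weight of 2 * (5 + 6 + 5) = 32 = 1 << 5,
+// hence the shift by 5. The second row of the pair uses a single 565 group
+// (weight 16 = 1 << 4) via CalculateFilteredOutput<4>().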
+inline int16x8_t CalculateFilteredOutputPass1(const uint8x8_t s,
+ uint16x8_t ma[2],
+ uint32x4x2_t b[2]) {
+ const uint16x8_t ma_sum = vaddq_u16(ma[0], ma[1]);
+ uint32x4x2_t b_sum;
+ b_sum.val[0] = vaddq_u32(b[0].val[0], b[1].val[0]);
+ b_sum.val[1] = vaddq_u32(b[0].val[1], b[1].val[1]);
+ return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
+}
+
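+// Pass 2 sums a 343 row above, a 444 row, and a 343 row below, for a total
+// weight of 10 + 12 + 10 = 32 = 1 << 5, matching pass 1's shift.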
+inline int16x8_t CalculateFilteredOutputPass2(const uint8x8_t s,
+ uint16x8_t ma[3],
+ uint32x4x2_t b[3]) {
+ const uint16x8_t ma_sum = Sum3_16(ma);
+ const uint32x4x2_t b_sum = Sum3_32(b);
+ return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
+}
+
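+// dst = Clip(src + RightShiftWithRounding(v, kSgrProjRestoreBits +
+// kSgrProjPrecisionBits)), where |v| already holds weight * filter
+// (assuming kSgrProjRestoreBits == 4 and kSgrProjPrecisionBits == 7, this is
+// a shift by 11).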
+inline uint8x8_t SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2]) {
+ const int16x4_t v_lo =
+ vrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const int16x4_t v_hi =
+ vrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const int16x8_t vv = vcombine_s16(v_lo, v_hi);
+ const int16x8_t d =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vv), src));
+ return vqmovun_s16(d);
+}
+
+inline uint8x8_t SelfGuidedDoubleMultiplier(const uint8x8_t src,
+ const int16x8_t filter[2],
+ const int w0, const int w2) {
+ int32x4_t v[2];
+ v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
+ v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
+ v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2);
+ v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2);
+ return SelfGuidedFinal(src, v);
+}
+
+inline uint8x8_t SelfGuidedSingleMultiplier(const uint8x8_t src,
+ const int16x8_t filter,
+ const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ int32x4_t v[2];
+ v[0] = vmull_n_s16(vget_low_s16(filter), w0);
+ v[1] = vmull_n_s16(vget_high_s16(filter), w0);
+ return SelfGuidedFinal(src, v);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const uint32_t scale,
+ const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2],
+ uint8_t* const dst) {
+ const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
+ uint8x16_t s[2][2], mas[2];
+ uint16x8_t sq[2][4], bs[3];
+ s[0][0] = Load1QMsanU8(src0, overread_in_bytes);
+ s[1][0] = Load1QMsanU8(src1, overread_in_bytes);
+
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ uint16x8_t ma[2];
+ uint8x16_t masx[3];
+ uint32x4x2_t b[2];
+ int16x8_t p0, p1;
+ s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+ s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes);
+ BoxFilterPreProcess5(s, x + 8, scale, sum5, square_sum5, sq, mas, bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[1] = Sum565<0>(masx);
+ b[1] = Sum565W(bs);
+ vst1q_u16(ma565[1] + x, ma[1]);
+ vst1q_u32(b565[1] + x + 0, b[1].val[0]);
+ vst1q_u32(b565[1] + x + 4, b[1].val[1]);
+ const uint8x16_t sr0 = vld1q_u8(src + x);
+ const uint8x16_t sr1 = vld1q_u8(src + stride + x);
+ const uint8x8_t sr00 = vget_low_u8(sr0);
+ const uint8x8_t sr10 = vget_low_u8(sr1);
+ ma[0] = vld1q_u16(ma565[0] + x);
+ b[0].val[0] = vld1q_u32(b565[0] + x + 0);
+ b[0].val[1] = vld1q_u32(b565[0] + x + 4);
+ p0 = CalculateFilteredOutputPass1(sr00, ma, b);
+ p1 = CalculateFilteredOutput<4>(sr10, ma[1], b[1]);
+ const uint8x8_t d00 = SelfGuidedSingleMultiplier(sr00, p0, w0);
+ const uint8x8_t d10 = SelfGuidedSingleMultiplier(sr10, p1, w0);
+
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ vst1q_u16(ma565[1] + x + 8, ma[1]);
+ vst1q_u32(b565[1] + x + 8, b[1].val[0]);
+ vst1q_u32(b565[1] + x + 12, b[1].val[1]);
+ const uint8x8_t sr01 = vget_high_u8(sr0);
+ const uint8x8_t sr11 = vget_high_u8(sr1);
+ ma[0] = vld1q_u16(ma565[0] + x + 8);
+ b[0].val[0] = vld1q_u32(b565[0] + x + 8);
+ b[0].val[1] = vld1q_u32(b565[0] + x + 12);
+ p0 = CalculateFilteredOutputPass1(sr01, ma, b);
+ p1 = CalculateFilteredOutput<4>(sr11, ma[1], b[1]);
+ const uint8x8_t d01 = SelfGuidedSingleMultiplier(sr01, p0, w0);
+ const uint8x8_t d11 = SelfGuidedSingleMultiplier(sr11, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d00, d01));
+ vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(const uint8_t* const src,
+ const uint8_t* const src0, const int width,
+ const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ uint16_t* ma565, uint32_t* b565,
+ uint8_t* const dst) {
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[4];
+ s[0] = vld1q_u8(src0);
+
+ BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
+
+ int x = 0;
+ do {
+ uint16x8_t ma[2];
+ uint8x16_t masx[3];
+ uint32x4x2_t b[2];
+ s[1] = vld1q_u8(src0 + x + 16);
+
+ BoxFilterPreProcess5LastRow(s, x + 8, scale, sum5, square_sum5, sq + 1, mas,
+ bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[1] = Sum565<0>(masx);
+ b[1] = Sum565W(bs);
+ ma[0] = vld1q_u16(ma565);
+ b[0].val[0] = vld1q_u32(b565 + 0);
+ b[0].val[1] = vld1q_u32(b565 + 4);
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
+ const int16x8_t p0 = CalculateFilteredOutputPass1(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0);
+
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ bs[0] = bs[2];
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma565 + 8);
+ b[0].val[0] = vld1q_u32(b565 + 8);
+ b[0].val[1] = vld1q_u32(b565 + 12);
+ const int16x8_t p1 = CalculateFilteredOutputPass1(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const uint32_t scale, const int16_t w0, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], uint16_t* const ma343[3],
+ uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2],
+ uint8_t* const dst) {
+ const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass2 - width;
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[3];
+ s[0] = vld1q_u8(src0);
+
+ BoxFilterPreProcess3Lo(&s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ uint16x8_t ma[3];
+ uint8x16_t ma3x[3];
+ uint32x4x2_t b[3];
+ s[1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+ BoxFilterPreProcess3(s, x + 8, scale, sum3, square_sum3, sq + 1, mas,
+ bs + 1);
+ Prepare3_8<0>(mas, ma3x);
+ Store343_444<0>(ma3x, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
+ ma[0] = vld1q_u16(ma343[0] + x);
+ ma[1] = vld1q_u16(ma444[0] + x);
+ b[0].val[0] = vld1q_u32(b343[0] + x + 0);
+ b[0].val[1] = vld1q_u32(b343[0] + x + 4);
+ b[1].val[0] = vld1q_u32(b444[0] + x + 0);
+ b[1].val[1] = vld1q_u32(b444[0] + x + 4);
+ const int16x8_t p0 = CalculateFilteredOutputPass2(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0);
+
+ Store343_444<8>(ma3x, bs + 1, x + 8, &ma[2], &b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma343[0] + x + 8);
+ ma[1] = vld1q_u16(ma444[0] + x + 8);
+ b[0].val[0] = vld1q_u32(b343[0] + x + 8);
+ b[0].val[1] = vld1q_u32(b343[0] + x + 12);
+ b[1].val[0] = vld1q_u32(b444[0] + x + 8);
+ b[1].val[1] = vld1q_u32(b444[0] + x + 12);
+ const int16x8_t p1 = CalculateFilteredOutputPass2(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
+ uint8x16_t s[2][2], ma3[2][2], ma5[2];
+ uint16x8_t sq[2][4], b3[2][3], b5[3];
+ s[0][0] = vld1q_u8(src0);
+ s[1][0] = vld1q_u8(src1);
+
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], &b5[0]);
+
+ int x = 0;
+ do {
+ uint16x8_t ma[3][3];
+ uint8x16_t ma3x[2][3], ma5x[3];
+ uint32x4x2_t b[3][3];
+ int16x8_t p[2][2];
+ s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+ s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes);
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sq, ma3, b3, ma5, b5 + 1);
+ Prepare3_8<0>(ma3[0], ma3x[0]);
+ Prepare3_8<0>(ma3[1], ma3x[1]);
+ Store343_444<0>(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444<0>(ma3x[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0][1] = Sum565<0>(ma5x);
+ b[0][1] = Sum565W(b5);
+ vst1q_u16(ma565[1] + x, ma[0][1]);
+ vst1q_u32(b565[1] + x, b[0][1].val[0]);
+ vst1q_u32(b565[1] + x + 4, b[0][1].val[1]);
+ const uint8x16_t sr0 = vld1q_u8(src + x);
+ const uint8x16_t sr1 = vld1q_u8(src + stride + x);
+ const uint8x8_t sr00 = vget_low_u8(sr0);
+ const uint8x8_t sr10 = vget_low_u8(sr1);
+ ma[0][0] = vld1q_u16(ma565[0] + x);
+ b[0][0].val[0] = vld1q_u32(b565[0] + x);
+ b[0][0].val[1] = vld1q_u32(b565[0] + x + 4);
+ p[0][0] = CalculateFilteredOutputPass1(sr00, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr10, ma[0][1], b[0][1]);
+ ma[1][0] = vld1q_u16(ma343[0] + x);
+ ma[1][1] = vld1q_u16(ma444[0] + x);
+ b[1][0].val[0] = vld1q_u32(b343[0] + x);
+ b[1][0].val[1] = vld1q_u32(b343[0] + x + 4);
+ b[1][1].val[0] = vld1q_u32(b444[0] + x);
+ b[1][1].val[1] = vld1q_u32(b444[0] + x + 4);
+ p[0][1] = CalculateFilteredOutputPass2(sr00, ma[1], b[1]);
+ ma[2][0] = vld1q_u16(ma343[1] + x);
+ b[2][0].val[0] = vld1q_u32(b343[1] + x);
+ b[2][0].val[1] = vld1q_u32(b343[1] + x + 4);
+ p[1][1] = CalculateFilteredOutputPass2(sr10, ma[2], b[2]);
+ const uint8x8_t d00 = SelfGuidedDoubleMultiplier(sr00, p[0], w0, w2);
+ const uint8x8_t d10 = SelfGuidedDoubleMultiplier(sr10, p[1], w0, w2);
+
+ Store343_444<8>(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], &b[1][2],
+ &b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444<8>(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], &b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ ma[0][1] = Sum565<8>(ma5x);
+ b[0][1] = Sum565W(b5 + 1);
+ vst1q_u16(ma565[1] + x + 8, ma[0][1]);
+ vst1q_u32(b565[1] + x + 8, b[0][1].val[0]);
+ vst1q_u32(b565[1] + x + 12, b[0][1].val[1]);
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ const uint8x8_t sr01 = vget_high_u8(sr0);
+ const uint8x8_t sr11 = vget_high_u8(sr1);
+ ma[0][0] = vld1q_u16(ma565[0] + x + 8);
+ b[0][0].val[0] = vld1q_u32(b565[0] + x + 8);
+ b[0][0].val[1] = vld1q_u32(b565[0] + x + 12);
+ p[0][0] = CalculateFilteredOutputPass1(sr01, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr11, ma[0][1], b[0][1]);
+ ma[1][0] = vld1q_u16(ma343[0] + x + 8);
+ ma[1][1] = vld1q_u16(ma444[0] + x + 8);
+ b[1][0].val[0] = vld1q_u32(b343[0] + x + 8);
+ b[1][0].val[1] = vld1q_u32(b343[0] + x + 12);
+ b[1][1].val[0] = vld1q_u32(b444[0] + x + 8);
+ b[1][1].val[1] = vld1q_u32(b444[0] + x + 12);
+ p[0][1] = CalculateFilteredOutputPass2(sr01, ma[1], b[1]);
+ ma[2][0] = vld1q_u16(ma343[1] + x + 8);
+ b[2][0].val[0] = vld1q_u32(b343[1] + x + 8);
+ b[2][0].val[1] = vld1q_u32(b343[1] + x + 12);
+ p[1][1] = CalculateFilteredOutputPass2(sr11, ma[2], b[2]);
+ const uint8x8_t d01 = SelfGuidedDoubleMultiplier(sr01, p[0], w0, w2);
+ const uint8x8_t d11 = SelfGuidedDoubleMultiplier(sr11, p[1], w0, w2);
+ vst1q_u8(dst + x, vcombine_u8(d00, d01));
+ vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint8_t* const dst) {
+ uint8x16_t s[2], ma3[2], ma5[2];
+ uint16x8_t sq[4], ma[3], b3[3], b5[3];
+ uint32x4x2_t b[3];
+ s[0] = vld1q_u8(src0);
+
+ BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+ sq, &ma3[0], &ma5[0], &b3[0], &b5[0]);
+
+ int x = 0;
+ do {
+ uint8x16_t ma3x[3], ma5x[3];
+ int16x8_t p[2];
+ s[1] = vld1q_u8(src0 + x + 16);
+
+ BoxFilterPreProcessLastRow(s, x + 8, scales, sum3, sum5, square_sum3,
+ square_sum5, sq + 1, ma3, ma5, &b3[1], &b5[1]);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[1] = Sum565<0>(ma5x);
+ b[1] = Sum565W(b5);
+ Prepare3_8<0>(ma3, ma3x);
+ ma[2] = Sum343<0>(ma3x);
+ b[2] = Sum343W(b3);
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
+ ma[0] = vld1q_u16(ma565 + x);
+ b[0].val[0] = vld1q_u32(b565 + x + 0);
+ b[0].val[1] = vld1q_u32(b565 + x + 4);
+ p[0] = CalculateFilteredOutputPass1(sr0, ma, b);
+ ma[0] = vld1q_u16(ma343 + x);
+ ma[1] = vld1q_u16(ma444 + x);
+ b[0].val[0] = vld1q_u32(b343 + x + 0);
+ b[0].val[1] = vld1q_u32(b343 + x + 4);
+ b[1].val[0] = vld1q_u32(b444 + x + 0);
+ b[1].val[1] = vld1q_u32(b444 + x + 4);
+ p[1] = CalculateFilteredOutputPass2(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedDoubleMultiplier(sr0, p, w0, w2);
+
+ ma[1] = Sum565<8>(ma5x);
+ b[1] = Sum565W(b5 + 1);
+ b5[0] = b5[2];
+ ma[2] = Sum343<8>(ma3x);
+ b[2] = Sum343W(b3 + 1);
+ b3[0] = b3[2];
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma565 + x + 8);
+ b[0].val[0] = vld1q_u32(b565 + x + 8);
+ b[0].val[1] = vld1q_u32(b565 + x + 12);
+ p[0] = CalculateFilteredOutputPass1(sr1, ma, b);
+ ma[0] = vld1q_u16(ma343 + x + 8);
+ ma[1] = vld1q_u16(ma444 + x + 8);
+ b[0].val[0] = vld1q_u32(b343 + x + 8);
+ b[0].val[1] = vld1q_u32(b343 + x + 12);
+ b[1].val[0] = vld1q_u32(b444 + x + 8);
+ b[1].val[1] = vld1q_u32(b444 + x + 12);
+ p[1] = CalculateFilteredOutputPass2(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedDoubleMultiplier(sr1, p, w0, w2);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ ma3[0] = ma3[1];
+ ma5[0] = ma5[1];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint8_t* src,
+ const ptrdiff_t stride, const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
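+ // The three weights sum to (1 << kSgrProjPrecisionBits); w2 takes whatever
+ // remains after w0 and w1.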
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, ma343, ma444[0], ma565[0], b343, b444[0],
+ b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
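+ // The main loop filters two rows per iteration. The sum, ma and b arrays
+ // act as ring buffers: the Circulate/swap calls below rotate them by two
+ // rows so only the newly exposed rows need to be recomputed.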
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, ma343,
+ ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
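+ // Only height == 1 skips this block; that case is handled entirely by
+ // BoxFilterLastRow below.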
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, ma343, ma444, ma565, b343, b444, b565,
+ dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5,
+ ma343[0], ma444[0], ma565[0], b343[0], b444[0], b565[0],
+ dst);
+ }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src, const ptrdiff_t stride,
+ const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum5[1], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, ma565[0],
+ b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src + 3, bottom_border + bottom_border_stride, width,
+ scale, w0, sum5, square_sum5, ma565[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src, const ptrdiff_t stride,
+ const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, ma343[0],
+ nullptr, b343[0], nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint8_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, ma343[1],
+ ma444[0], b343[1], b444[0]);
+
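+ // Pass 2 filters one row per iteration; the 3-row sums and the ma/b
+ // buffers rotate by one row via Circulate3PointersBy1 and the swaps below.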
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ src += 2;
+ int y = std::min(height, 2);
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, scale, w0, sum3, square_sum3,
+ ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 8, up to 7 extra pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as the
+// extra pixels are not part of the visible frame.
+void SelfGuidedFilter_NEON(
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* top = static_cast<const uint8_t*>(top_border);
+ const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+
+#if LIBGAV1_MSAN
+ // Initialize to prevent msan warnings when intermediate overreads occur.
+ memset(sgr_buffer, 0, sizeof(SgrBuffer));
+#endif
+
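+ // The source and border pointers are biased 3 (pass 1, radius 2) or 2
+ // (pass 2, radius 1) pixels to the left so the vector loads cover the
+ // filter support; the row filters add the bias back (src + 3 / src + 2).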
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->loop_restorations[0] = WienerFilter_NEON;
+ dsp->loop_restorations[1] = SelfGuidedFilter_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void LoopRestorationInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations, see the defines below for specifics.
+// This function is not thread-safe.
+void LoopRestorationInit_NEON();
+void LoopRestorationInit10bpp_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
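+// Collapses the mask for two rows of width 4 into a single 8-lane vector,
+// averaging across the horizontal (and, if enabled, vertical) subsampling.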
+template <int subsampling_y>
+inline uint8x8_t GetMask4x2(const uint8_t* mask) {
+ if (subsampling_y == 1) {
+ const uint8x16x2_t mask_val = vld2q_u8(mask);
+ const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]);
+ const uint32x2_t row_01 = vreinterpret_u32_u8(vget_low_u8(combined_horz));
+ const uint32x2_t row_23 = vreinterpret_u32_u8(vget_high_u8(combined_horz));
+
+ const uint32x2x2_t row_02_13 = vtrn_u32(row_01, row_23);
+ // Use a halving add to work around the case where all |mask| values are 64.
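+ // Each lane of |combined_horz| is the sum of two horizontal neighbors (at
+ // most 128); adding two such rows directly could reach 256 and overflow
+ // uint8. vhadd_u8 halves while adding, and the rounding shift by 1
+ // completes the average of the four covered mask samples.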
+ return vrshr_n_u8(vhadd_u8(vreinterpret_u8_u32(row_02_13.val[0]),
+ vreinterpret_u8_u32(row_02_13.val[1])),
+ 1);
+ }
+ // subsampling_x == 1
+ const uint8x8x2_t mask_val = vld2_u8(mask);
+ return vrhadd_u8(mask_val.val[0], mask_val.val[1]);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline uint8x8_t GetMask8(const uint8_t* mask) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const uint8x16x2_t mask_val = vld2q_u8(mask);
+ const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]);
+ // Use a halving add to work around the case where all |mask| values are 64.
+ return vrshr_n_u8(
+ vhadd_u8(vget_low_u8(combined_horz), vget_high_u8(combined_horz)), 1);
+ }
+ if (subsampling_x == 1) {
+ const uint8x8x2_t mask_val = vld2_u8(mask);
+ return vrhadd_u8(mask_val.val[0], mask_val.val[1]);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ return vld1_u8(mask);
+}
+
+inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
+ const int16_t* LIBGAV1_RESTRICT const pred_1,
+ const int16x8_t pred_mask_0,
+ const int16x8_t pred_mask_1,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const int16x8_t pred_val_0 = vld1q_s16(pred_0);
+ const int16x8_t pred_val_1 = vld1q_s16(pred_1);
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const int32x4_t weighted_pred_0_lo =
+ vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
+ const int32x4_t weighted_pred_0_hi =
+ vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+ const int32x4_t weighted_combo_lo = vmlal_s16(
+ weighted_pred_0_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1));
+ const int32x4_t weighted_combo_hi =
+ vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1),
+ vget_high_s16(pred_val_1));
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
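+ // vshrn_n_s32 performs the >> 6 from the formula above; vqrshrun_n_s16
+ // then applies inter_post_round_bits (4 for 8 bpp) with rounding and
+ // saturates the result to [0, 255].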
+ const uint8x8_t result =
+ vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
+ vshrn_n_s32(weighted_combo_hi, 6)),
+ 4);
+ StoreLo4(dst, result);
+ StoreHi4(dst + dst_stride, result);
+}
+
+template <int subsampling_y>
+inline void MaskBlending4x4_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT mask,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ constexpr int subsampling_x = 1;
+ constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
+ const int16x8_t mask_inverter = vdupq_n_s16(64);
+ // Compound predictors use int16_t values and need to multiply long because
+ // the Convolve range * 64 is 20 bits. Unfortunately there is no instruction
+ // that multiplies int16_t by int8_t and accumulates into int32_t.
+ int16x8_t pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
+ int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
+
+ pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
+ pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+}
+
+template <int subsampling_y>
+inline void MaskBlending4xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+ const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ MaskBlending4x4_NEON<subsampling_y>(pred_0, pred_1, mask, dst, dst_stride);
+ return;
+ }
+ constexpr int subsampling_x = 1;
+ constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
+ const int16x8_t mask_inverter = vdupq_n_s16(64);
+ int y = 0;
+ do {
+ int16x8_t pred_mask_0 =
+ vreinterpretq_s16_u16(vmovl_u8(GetMask4x2<subsampling_y>(mask)));
+ int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
+
+ pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
+ pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
+
+ pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
+ pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
+
+ pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
+ pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
+ y += 8;
+ } while (y < height);
+}
+
+inline uint8x8_t CombinePred8(const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const int16x8_t pred_mask_0,
+ const int16x8_t pred_mask_1) {
+ // Load 8 values from each prediction.
+ const int16x8_t pred_val_0 = vld1q_s16(pred_0);
+ const int16x8_t pred_val_1 = vld1q_s16(pred_1);
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const int32x4_t weighted_pred_lo =
+ vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
+ const int32x4_t weighted_pred_hi =
+ vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+ const int32x4_t weighted_combo_lo = vmlal_s16(
+ weighted_pred_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1));
+ const int32x4_t weighted_combo_hi = vmlal_s16(
+ weighted_pred_hi, vget_high_s16(pred_mask_1), vget_high_s16(pred_val_1));
+
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
+ return vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
+ vshrn_n_s32(weighted_combo_hi, 6)),
+ 4);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending8xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+ const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ const int16x8_t mask_inverter = vdupq_n_s16(64);
+ int y = height;
+ do {
+ const int16x8_t pred_mask_0 =
+ ZeroExtend(GetMask8<subsampling_x, subsampling_y>(mask));
+ // 64 - mask
+ const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ const uint8x8_t result =
+ CombinePred8(pred_0, pred_1, pred_mask_0, pred_mask_1);
+ vst1_u8(dst, result);
+ dst += dst_stride;
+ mask += 8 << (subsampling_x + subsampling_y);
+ pred_0 += 8;
+ pred_1 += 8;
+ } while (--y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline uint8x16_t GetMask16(const uint8_t* mask, const ptrdiff_t mask_stride) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const uint8x16x2_t mask_val0 = vld2q_u8(mask);
+ const uint8x16x2_t mask_val1 = vld2q_u8(mask + mask_stride);
+ const uint8x16_t combined_horz0 =
+ vaddq_u8(mask_val0.val[0], mask_val0.val[1]);
+ const uint8x16_t combined_horz1 =
+ vaddq_u8(mask_val1.val[0], mask_val1.val[1]);
+ // Use a halving add to work around the case where all |mask| values are 64.
+ return vrshrq_n_u8(vhaddq_u8(combined_horz0, combined_horz1), 1);
+ }
+ if (subsampling_x == 1) {
+ const uint8x16x2_t mask_val = vld2q_u8(mask);
+ return vrhaddq_u8(mask_val.val[0], mask_val.val[1]);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ return vld1q_u8(mask);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const ptrdiff_t /*prediction_stride_1*/,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height, void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dst_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ if (width == 4) {
+ MaskBlending4xH_NEON<subsampling_y>(pred_0, pred_1, mask_ptr, height, dst,
+ dst_stride);
+ return;
+ }
+ if (width == 8) {
+ MaskBlending8xH_NEON<subsampling_x, subsampling_y>(pred_0, pred_1, mask_ptr,
+ height, dst, dst_stride);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const int16x8_t mask_inverter = vdupq_n_s16(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const uint8x16_t pred_mask_0 = GetMask16<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ const int16x8_t pred_mask_0_lo = ZeroExtend(vget_low_u8(pred_mask_0));
+ const int16x8_t pred_mask_0_hi = ZeroExtend(vget_high_u8(pred_mask_0));
+ // 64 - mask
+ const int16x8_t pred_mask_1_lo = vsubq_s16(mask_inverter, pred_mask_0_lo);
+ const int16x8_t pred_mask_1_hi = vsubq_s16(mask_inverter, pred_mask_0_hi);
+
+ uint8x8_t result;
+ result =
+ CombinePred8(pred_0 + x, pred_1 + x, pred_mask_0_lo, pred_mask_1_lo);
+ vst1_u8(dst + x, result);
+
+ result = CombinePred8(pred_0 + x + 8, pred_1 + x + 8, pred_mask_0_hi,
+ pred_mask_1_hi);
+ vst1_u8(dst + x + 8, result);
+
+ x += 16;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+ mask += mask_stride << subsampling_y;
+ } while (++y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline uint8x8_t GetInterIntraMask4x2(const uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ return GetMask4x2<subsampling_y>(mask);
+ }
+ // Intra and difference-weighted masks are not subsampled, so |mask_stride|
+ // may be 4 or 8.
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const uint8x8_t mask_val0 = Load4(mask);
+ return Load4<1>(mask + mask_stride, mask_val0);
+}
+
+inline void InterIntraWriteMaskBlendLine8bpp4x2(
+ const uint8_t* LIBGAV1_RESTRICT const pred_0,
+ uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1,
+ const uint8x8_t pred_mask_0, const uint8x8_t pred_mask_1) {
+ const uint8x8_t pred_val_0 = vld1_u8(pred_0);
+ uint8x8_t pred_val_1 = Load4(pred_1);
+ pred_val_1 = Load4<1>(pred_1 + pred_stride_1, pred_val_1);
+
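+ // result = (mask0 * pred0 + mask1 * pred1 + 32) >> 6, computed entirely in
+ // uint16.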
+ const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+ const uint16x8_t weighted_combo =
+ vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
+ const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
+ StoreLo4(pred_1, result);
+ StoreHi4(pred_1 + pred_stride_1, result);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4x4_NEON(
+ const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+ const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
+ const ptrdiff_t mask_stride) {
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ uint8x8_t pred_mask_1 =
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+ InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+
+ pred_mask_1 =
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+ InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4xH_NEON(
+ const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+ const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
+ const ptrdiff_t mask_stride, const int height) {
+ if (height == 4) {
+ InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ return;
+ }
+ int y = 0;
+ do {
+ InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ pred_0 += 4 << 2;
+ pred_1 += pred_stride_1 << 2;
+ mask += mask_stride << (2 + subsampling_y);
+
+ InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ pred_0 += 4 << 2;
+ pred_1 += pred_stride_1 << 2;
+ mask += mask_stride << (2 + subsampling_y);
+ y += 8;
+ } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp8xH_NEON(
+ const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+ const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
+ const ptrdiff_t mask_stride, const int height) {
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ int y = height;
+ do {
+ const uint8x8_t pred_mask_1 = GetMask8<subsampling_x, subsampling_y>(mask);
+ // 64 - mask
+ const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+ const uint8x8_t pred_val_0 = vld1_u8(pred_0);
+ const uint8x8_t pred_val_1 = vld1_u8(pred_1);
+ const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+ // weighted_pred0 + weighted_pred1
+ const uint16x8_t weighted_combo =
+ vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
+ const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
+ vst1_u8(pred_1, result);
+
+ pred_0 += 8;
+ pred_1 += pred_stride_1;
+ mask += mask_stride << subsampling_y;
+ } while (--y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend8bpp_NEON(
+ const uint8_t* LIBGAV1_RESTRICT prediction_0,
+ uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+ const int width, const int height) {
+ if (width == 4) {
+ InterIntraMaskBlending8bpp4xH_NEON<subsampling_x, subsampling_y>(
+ prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+ height);
+ return;
+ }
+ if (width == 8) {
+ InterIntraMaskBlending8bpp8xH_NEON<subsampling_x, subsampling_y>(
+ prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+ height);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const uint8x16_t mask_inverter = vdupq_n_u8(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const uint8x16_t pred_mask_1 = GetMask16<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ // 64 - mask
+ const uint8x16_t pred_mask_0 = vsubq_u8(mask_inverter, pred_mask_1);
+ const uint8x8_t pred_val_0_lo = vld1_u8(prediction_0);
+ prediction_0 += 8;
+ const uint8x8_t pred_val_0_hi = vld1_u8(prediction_0);
+ prediction_0 += 8;
+ // Ensure armv7 build combines the load.
+ const uint8x16_t pred_val_1 = vld1q_u8(prediction_1 + x);
+ const uint8x8_t pred_val_1_lo = vget_low_u8(pred_val_1);
+ const uint8x8_t pred_val_1_hi = vget_high_u8(pred_val_1);
+ const uint16x8_t weighted_pred_0_lo =
+ vmull_u8(vget_low_u8(pred_mask_0), pred_val_0_lo);
+ // weighted_pred0 + weighted_pred1
+ const uint16x8_t weighted_combo_lo =
+ vmlal_u8(weighted_pred_0_lo, vget_low_u8(pred_mask_1), pred_val_1_lo);
+ const uint8x8_t result_lo = vrshrn_n_u16(weighted_combo_lo, 6);
+ vst1_u8(prediction_1 + x, result_lo);
+ const uint16x8_t weighted_pred_0_hi =
+ vmull_u8(vget_high_u8(pred_mask_0), pred_val_0_hi);
+ // weighted_pred0 + weighted_pred1
+ const uint16x8_t weighted_combo_hi = vmlal_u8(
+ weighted_pred_0_hi, vget_high_u8(pred_mask_1), pred_val_1_hi);
+ const uint8x8_t result_hi = vrshrn_n_u16(weighted_combo_hi, 6);
+ vst1_u8(prediction_1 + x + 8, result_hi);
+
+ x += 16;
+ } while (x < width);
+ prediction_1 += prediction_stride_1;
+ mask += mask_stride << subsampling_y;
+ } while (++y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
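+ // The first index selects the subsampling: 0 = 4:4:4, 1 = 4:2:2,
+ // 2 = 4:2:0.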
+ dsp->mask_blend[0][0] = MaskBlend_NEON<0, 0>;
+ dsp->mask_blend[1][0] = MaskBlend_NEON<1, 0>;
+ dsp->mask_blend[2][0] = MaskBlend_NEON<1, 1>;
+ // The is_inter_intra index of mask_blend[][] is replaced by
+ // inter_intra_mask_blend_8bpp[] in 8-bit.
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_NEON<0, 0>;
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_NEON<1, 0>;
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_NEON<1, 1>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+template <int subsampling_x, int subsampling_y>
+inline uint16x8_t GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ const uint8x8_t mask_val0 = vld1_u8(mask);
+ const uint8x8_t mask_val1 = vld1_u8(mask + (mask_stride << subsampling_y));
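+ // vpaddlq_u8 widens while adding horizontal pairs, so the 16-bit
+ // accumulator cannot overflow even after the optional second-row add.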
+ uint16x8_t final_val = vpaddlq_u8(vcombine_u8(mask_val0, mask_val1));
+ if (subsampling_y == 1) {
+ const uint8x8_t next_mask_val0 = vld1_u8(mask + mask_stride);
+ const uint8x8_t next_mask_val1 = vld1_u8(mask + mask_stride * 3);
+ final_val = vaddq_u16(
+ final_val, vpaddlq_u8(vcombine_u8(next_mask_val0, next_mask_val1)));
+ }
+ return vrshrq_n_u16(final_val, subsampling_y + 1);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const uint8x8_t mask_val0 = Load4(mask);
+ const uint8x8_t mask_val = Load4<1>(mask + mask_stride, mask_val0);
+ return vmovl_u8(mask_val);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline uint16x8_t GetMask8(const uint8_t* mask, ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ uint16x8_t mask_val = vpaddlq_u8(vld1q_u8(mask));
+ if (subsampling_y == 1) {
+ const uint16x8_t next_mask_val = vpaddlq_u8(vld1q_u8(mask + mask_stride));
+ mask_val = vaddq_u16(mask_val, next_mask_val);
+ }
+ return vrshrq_n_u16(mask_val, 1 + subsampling_y);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const uint8x8_t mask_val = vld1_u8(mask);
+ return vmovl_u8(mask_val);
+}
+
+template <bool is_inter_intra>
+uint16x8_t SumWeightedPred(const uint16x8_t pred_mask_0,
+ const uint16x8_t pred_mask_1,
+ const uint16x8_t pred_val_0,
+ const uint16x8_t pred_val_1) {
+ if (is_inter_intra) {
+ // dst[x] = static_cast<Pixel>(RightShiftWithRounding(
+ // mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6));
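+ // Inter-intra predictions are pixel values, so with 10-bit input the
+ // product mask (<= 64) * pixel (< 1024) fits in 16 bits and plain 16-bit
+ // multiply-accumulate suffices.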
+ uint16x8_t sum = vmulq_u16(pred_mask_1, pred_val_0);
+ sum = vmlaq_u16(sum, pred_mask_0, pred_val_1);
+ return vrshrq_n_u16(sum, 6);
+ } else {
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const uint32x4_t weighted_pred_0_lo =
+ vmull_u16(vget_low_u16(pred_mask_0), vget_low_u16(pred_val_0));
+ const uint32x4_t weighted_pred_0_hi = VMullHighU16(pred_mask_0, pred_val_0);
+ uint32x4x2_t sum;
+ sum.val[0] = vmlal_u16(weighted_pred_0_lo, vget_low_u16(pred_mask_1),
+ vget_low_u16(pred_val_1));
+ sum.val[1] = VMlalHighU16(weighted_pred_0_hi, pred_mask_1, pred_val_1);
+ return vcombine_u16(vshrn_n_u32(sum.val[0], 6), vshrn_n_u32(sum.val[1], 6));
+ }
+}
+
+template <bool is_inter_intra, int width, int bitdepth = 10>
+inline void StoreShiftedResult(uint8_t* dst, const uint16x8_t result,
+ const ptrdiff_t dst_stride = 0) {
+ if (is_inter_intra) {
+ if (width == 4) {
+ // Store 2 lines of width 4.
+ assert(dst_stride != 0);
+ vst1_u16(reinterpret_cast<uint16_t*>(dst), vget_low_u16(result));
+ vst1_u16(reinterpret_cast<uint16_t*>(dst + dst_stride),
+ vget_high_u16(result));
+ } else {
+ // Store 1 line of width 8.
+ vst1q_u16(reinterpret_cast<uint16_t*>(dst), result);
+ }
+ } else {
+ // res -= (bitdepth == 8) ? 0 : kCompoundOffset;
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ //           (1 << bitdepth) - 1));
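+ // vqsubq_u16 saturates at zero, implementing the lower Clip3 bound for
+ // values below kCompoundOffset; vminq_u16 applies the upper bound.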
+ constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
+ const uint16x8_t compound_result =
+ vminq_u16(vrshrq_n_u16(vqsubq_u16(result, vdupq_n_u16(kCompoundOffset)),
+ inter_post_round_bits),
+ vdupq_n_u16((1 << bitdepth) - 1));
+ if (width == 4) {
+ // Store 2 lines of width 4.
+ assert(dst_stride != 0);
+ vst1_u16(reinterpret_cast<uint16_t*>(dst), vget_low_u16(compound_result));
+ vst1_u16(reinterpret_cast<uint16_t*>(dst + dst_stride),
+ vget_high_u16(compound_result));
+ } else {
+ // Store 1 line of width 8.
+ vst1q_u16(reinterpret_cast<uint16_t*>(dst), compound_result);
+ }
+ }
+}
+
+template <int subsampling_x, int subsampling_y, bool is_inter_intra>
+inline void MaskBlend4x2_NEON(const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT mask,
+ const uint16x8_t mask_inverter,
+ const ptrdiff_t mask_stride,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ // This works because stride == width == 4.
+ const uint16x8_t pred_val_0 = vld1q_u16(pred_0);
+ const uint16x8_t pred_val_1 =
+ is_inter_intra
+ ? vcombine_u16(vld1_u16(pred_1), vld1_u16(pred_1 + pred_stride_1))
+ : vld1q_u16(pred_1);
+ const uint16x8_t pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ const uint16x8_t pred_mask_1 = vsubq_u16(mask_inverter, pred_mask_0);
+ const uint16x8_t weighted_pred_sum = SumWeightedPred<is_inter_intra>(
+ pred_mask_0, pred_mask_1, pred_val_0, pred_val_1);
+
+ StoreShiftedResult<is_inter_intra, 4>(dst, weighted_pred_sum, dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y, bool is_inter_intra>
+inline void MaskBlending4x4_NEON(const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT mask,
+ const ptrdiff_t mask_stride,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ // Double stride because the function works on 2 lines at a time.
+ const ptrdiff_t mask_stride_y = mask_stride << (subsampling_y + 1);
+ const ptrdiff_t dst_stride_y = dst_stride << 1;
+ const uint16x8_t mask_inverter = vdupq_n_u16(64);
+
+ MaskBlend4x2_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+ pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst,
+ dst_stride);
+
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+ mask += mask_stride_y;
+ dst += dst_stride_y;
+
+ MaskBlend4x2_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+ pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst,
+ dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y, bool is_inter_intra>
+inline void MaskBlending4xH_NEON(const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+ const ptrdiff_t mask_stride, const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ MaskBlending4x4_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride);
+ return;
+ }
+ // Double stride because the function works on 2 lines at a time.
+ const ptrdiff_t mask_stride_y = mask_stride << (subsampling_y + 1);
+ const ptrdiff_t dst_stride_y = dst_stride << 1;
+ const uint16x8_t mask_inverter = vdupq_n_u16(64);
+ int y = 0;
+ do {
+ MaskBlend4x2_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+ pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+ mask += mask_stride_y;
+ dst += dst_stride_y;
+
+ MaskBlend4x2_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+ pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+ mask += mask_stride_y;
+ dst += dst_stride_y;
+
+ MaskBlend4x2_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+ pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+ mask += mask_stride_y;
+ dst += dst_stride_y;
+
+ MaskBlend4x2_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+ pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+ mask += mask_stride_y;
+ dst += dst_stride_y;
+ y += 8;
+ } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y, bool is_inter_intra>
+void MaskBlend8_NEON(const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT mask,
+ const uint16x8_t mask_inverter,
+ const ptrdiff_t mask_stride,
+ uint8_t* LIBGAV1_RESTRICT dst) {
+ const uint16x8_t pred_val_0 = vld1q_u16(pred_0);
+ const uint16x8_t pred_val_1 = vld1q_u16(pred_1);
+ const uint16x8_t pred_mask_0 =
+ GetMask8<subsampling_x, subsampling_y>(mask, mask_stride);
+ const uint16x8_t pred_mask_1 = vsubq_u16(mask_inverter, pred_mask_0);
+ const uint16x8_t weighted_pred_sum = SumWeightedPred<is_inter_intra>(
+ pred_mask_0, pred_mask_1, pred_val_0, pred_val_1);
+
+ StoreShiftedResult<is_inter_intra, 8>(dst, weighted_pred_sum);
+}
+
+template <int subsampling_x, int subsampling_y, bool is_inter_intra>
+inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height, void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dst_stride) {
+ if (!is_inter_intra) {
+ assert(prediction_stride_1 == width);
+ }
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ if (width == 4) {
+ MaskBlending4xH_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+ pred_0, pred_1, prediction_stride_1, mask_ptr, mask_stride, height, dst,
+ dst_stride);
+ return;
+ }
+ const ptrdiff_t mask_stride_y = mask_stride << subsampling_y;
+ const uint8_t* mask = mask_ptr;
+ const uint16x8_t mask_inverter = vdupq_n_u16(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ MaskBlend8_NEON<subsampling_x, subsampling_y, is_inter_intra>(
+ pred_0 + x, pred_1 + x, mask + (x << subsampling_x), mask_inverter,
+ mask_stride,
+ reinterpret_cast<uint8_t*>(reinterpret_cast<uint16_t*>(dst) + x));
+ x += 8;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += prediction_stride_1;
+ mask += mask_stride_y;
+ } while (++y < height);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->mask_blend[0][0] = MaskBlend_NEON<0, 0, false>;
+ dsp->mask_blend[1][0] = MaskBlend_NEON<1, 0, false>;
+ dsp->mask_blend[2][0] = MaskBlend_NEON<1, 1, false>;
+
+ dsp->mask_blend[0][1] = MaskBlend_NEON<0, 0, true>;
+ dsp->mask_blend[1][1] = MaskBlend_NEON<1, 0, true>;
+ dsp->mask_blend[2][1] = MaskBlend_NEON<1, 1, true>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void MaskBlendInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void MaskBlendInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend. This function is not thread-safe.
+void MaskBlendInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_MaskBlend444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlend422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlend420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra420 LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
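+// Looks up 16-bit division table entries for eight reference offsets. Each
+// offset is doubled and paired with +1 so that vtbl2_s8 fetches both bytes
+// of the selected entry.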
+inline int16x8_t LoadDivision(const int8x8x2_t division_table,
+ const int8x8_t reference_offset) {
+ const int8x8_t kOne = vcreate_s8(0x0100010001000100);
+ const int8x16_t kOneQ = vcombine_s8(kOne, kOne);
+ const int8x8_t t = vadd_s8(reference_offset, reference_offset);
+ const int8x8x2_t tt = vzip_s8(t, t);
+ const int8x16_t t1 = vcombine_s8(tt.val[0], tt.val[1]);
+ const int8x16_t idx = vaddq_s8(t1, kOneQ);
+ const int8x8_t idx_low = vget_low_s8(idx);
+ const int8x8_t idx_high = vget_high_s8(idx);
+ const int16x4_t d0 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_low));
+ const int16x4_t d1 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_high));
+ return vcombine_s16(d0, d1);
+}
+
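+// Computes mv * numerator / denominator in fixed point. |denominator| holds
+// reciprocal entries from the division table, scaled to match the final
+// narrowing shift by 14.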
+inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
+ const int numerator) {
+ const int32x4_t m0 = vmull_s16(mv, denominator);
+ const int32x4_t m = vmulq_n_s32(m0, numerator);
+ // Add the sign (0 or -1) to round towards zero.
+ const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
+ return vqrshrn_n_s32(add_sign, 14);
+}
+
+inline int16x8_t MvProjectionClip(const int16x8_t mv,
+ const int16x8_t denominator,
+ const int numerator) {
+ const int16x4_t mv0 = vget_low_s16(mv);
+ const int16x4_t mv1 = vget_high_s16(mv);
+ const int16x4_t s0 = MvProjection(mv0, vget_low_s16(denominator), numerator);
+ const int16x4_t s1 = MvProjection(mv1, vget_high_s16(denominator), numerator);
+ const int16x8_t projection = vcombine_s16(s0, s1);
+ const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
+ const int16x8_t clamp = vminq_s16(projection, projection_mv_clamp);
+ return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
+}
+
+inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) {
+ // Add 63 to negative delta so that it shifts towards zero.
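+ // delta_sign is 0 or -1 per lane; shifting it right by 10 as unsigned
+ // yields the 63 bias, and the later arithmetic shift by 6 then divides by
+ // 64 with rounding towards zero.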
+ const int16x8_t delta_sign = vshrq_n_s16(delta, 15);
+ const uint16x8_t delta_u = vreinterpretq_u16_s16(delta);
+ const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign);
+ const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10);
+ const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u);
+ const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6);
+ const int16x8_t offset1 = veorq_s16(offset0, dst_sign);
+ const int16x8_t offset2 = vsubq_s16(offset1, dst_sign);
+ return vqmovn_s16(offset2);
+}
+
+inline void GetPosition(
+ const int8x8x2_t division_table, const MotionVector* const mv,
+ const int numerator, const int x8_start, const int x8_end, const int x8,
+ const int8x8_t r_offsets, const int8x8_t source_reference_type8,
+ const int8x8_t skip_r, const int8x8_t y8_floor8, const int8x8_t y8_ceiling8,
+ const int16x8_t d_sign, const int delta, int8x8_t* const r,
+ int8x8_t* const position_y8, int8x8_t* const position_x8,
+ int64_t* const skip_64, int32x4_t mvs[2]) {
+ const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
+ *r = vtbl1_s8(r_offsets, source_reference_type8);
+ const int16x8_t denorm = LoadDivision(division_table, source_reference_type8);
+ int16x8_t projection_mv[2];
+ mvs[0] = vld1q_s32(mv_int + 0);
+ mvs[1] = vld1q_s32(mv_int + 4);
+ // Deinterleave the x and y components.
+ const int16x8_t mv0 = vreinterpretq_s16_s32(mvs[0]);
+ const int16x8_t mv1 = vreinterpretq_s16_s32(mvs[1]);
+ const int16x8x2_t mv_yx = vuzpq_s16(mv0, mv1);
+ // numerator could be 0.
+ projection_mv[0] = MvProjectionClip(mv_yx.val[0], denorm, numerator);
+ projection_mv[1] = MvProjectionClip(mv_yx.val[1], denorm, numerator);
+ // Do not update the motion vector if the block position is not valid or
+ // if position_x8 is outside the current range of x8_start and x8_end.
+ // Note that position_y8 will always be within the range of y8_start and
+ // y8_end.
+ // After subtracting the base, valid projections are within 8-bit.
+ *position_y8 = Project_NEON(projection_mv[0], d_sign);
+ const int8x8_t position_x = Project_NEON(projection_mv[1], d_sign);
+ const int8x8_t k01234567 = vcreate_s8(uint64_t{0x0706050403020100});
+ *position_x8 = vqadd_s8(position_x, k01234567);
+ const int8x16_t position_xy = vcombine_s8(*position_x8, *position_y8);
+ const int x8_floor = std::max(
+ x8_start - x8, delta - kProjectionMvMaxHorizontalOffset); // [-8, 8]
+ const int x8_ceiling = std::min(
+ x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset); // [0, 16]
+ const int8x8_t x8_floor8 = vdup_n_s8(x8_floor);
+ const int8x8_t x8_ceiling8 = vdup_n_s8(x8_ceiling);
+ const int8x16_t floor_xy = vcombine_s8(x8_floor8, y8_floor8);
+ const int8x16_t ceiling_xy = vcombine_s8(x8_ceiling8, y8_ceiling8);
+ const uint8x16_t underflow = vcltq_s8(position_xy, floor_xy);
+ const uint8x16_t overflow = vcgeq_s8(position_xy, ceiling_xy);
+ const int8x16_t out = vreinterpretq_s8_u8(vorrq_u8(underflow, overflow));
+ const int8x8_t skip_low = vorr_s8(skip_r, vget_low_s8(out));
+ const int8x8_t skip = vorr_s8(skip_low, vget_high_s8(out));
+ *skip_64 = vget_lane_s64(vreinterpret_s64_s8(skip), 0);
+}
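+
+// GetPosition() leaves each byte of |*skip_64| as either 0 (the projected
+// position in that lane is valid and should be stored) or -1 (skip the
+// lane), so |*skip_64| == -1 means all eight lanes can be skipped.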
+
+template <int idx>
+inline void Store(const int16x8_t position, const int8x8_t reference_offset,
+ const int32x4_t mv, int8_t* dst_reference_offset,
+ MotionVector* dst_mv) {
+ const ptrdiff_t offset = vgetq_lane_s16(position, idx);
+ auto* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]);
+ vst1q_lane_s32(d_mv, mv, idx & 3);
+ vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx);
+}
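+
+// In Store() above, |idx & 3| selects the lane within |mv| because callers
+// pass mvs[0] for positions 0..3 and mvs[1] for positions 4..7.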
+
+template <int idx>
+inline void CheckStore(const int8_t* skips, const int16x8_t position,
+ const int8x8_t reference_offset, const int32x4_t mv,
+ int8_t* dst_reference_offset, MotionVector* dst_mv) {
+ if (skips[idx] == 0) {
+ Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
+ }
+}
+
+// 7.9.2.
+void MotionFieldProjectionKernel_NEON(const ReferenceInfo& reference_info,
+ const int reference_to_current_with_sign,
+ const int dst_sign, const int y8_start,
+ const int y8_end, const int x8_start,
+ const int x8_end,
+ TemporalMotionField* const motion_field) {
+ const ptrdiff_t stride = motion_field->mv.columns();
+ // The column range has to be widened by kProjectionMvMaxHorizontalOffset on
+ // each side, since coordinates in that extended range can become
+ // position_x8 after projection.
+ const int adjusted_x8_start =
+ std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+ const int adjusted_x8_end = std::min(
+ x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+ const int adjusted_x8_end8 = adjusted_x8_end & ~7;
+ const int leftover = adjusted_x8_end - adjusted_x8_end8;
+ const int8_t* const reference_offsets =
+ reference_info.relative_distance_to.data();
+ const bool* const skip_references = reference_info.skip_references.data();
+ const int16_t* const projection_divisions =
+ reference_info.projection_divisions.data();
+ const ReferenceFrameType* source_reference_types =
+ &reference_info.motion_field_reference_frame[y8_start][0];
+ const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+ int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+ MotionVector* dst_mv = motion_field->mv[y8_start];
+ const int16x8_t d_sign = vdupq_n_s16(dst_sign);
+
+ static_assert(sizeof(int8_t) == sizeof(bool), "");
+ static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
+ static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
+ assert(dst_sign == 0 || dst_sign == -1);
+ assert(stride == motion_field->reference_offset.columns());
+ assert((y8_start & 7) == 0);
+ assert((adjusted_x8_start & 7) == 0);
+ // The final position calculation is represented with int16_t. Valid
+ // position_y8 from its base is at most 7. After considering the horizontal
+ // offset, which is at most |stride - 1|, we have the following assertion,
+ // which means this optimization works for frame widths up to 32K (each
+ // position is an 8x8 block).
+ assert(8 * stride <= 32768);
+ const int8x8_t skip_reference =
+ vld1_s8(reinterpret_cast<const int8_t*>(skip_references));
+ const int8x8_t r_offsets = vld1_s8(reference_offsets);
+ const int8x16_t table = vreinterpretq_s8_s16(vld1q_s16(projection_divisions));
+ int8x8x2_t division_table;
+ division_table.val[0] = vget_low_s8(table);
+ division_table.val[1] = vget_high_s8(table);
+
+ int y8 = y8_start;
+ do {
+ const int y8_floor = (y8 & ~7) - y8; // [-7, 0]
+ const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8); // [1, 8]
+ const int8x8_t y8_floor8 = vdup_n_s8(y8_floor);
+ const int8x8_t y8_ceiling8 = vdup_n_s8(y8_ceiling);
+ int x8;
+
+ for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
+ const int8x8_t source_reference_type8 =
+ vld1_s8(reinterpret_cast<const int8_t*>(source_reference_types + x8));
+ const int8x8_t skip_r = vtbl1_s8(skip_reference, source_reference_type8);
+ const int64_t early_skip = vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
+ // Early termination #1 if all are skips. Chance is typically ~30-40%.
+ if (early_skip == -1) continue;
+ int64_t skip_64;
+ int8x8_t r, position_x8, position_y8;
+ int32x4_t mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
+ x8_end, x8, r_offsets, source_reference_type8, skip_r,
+ y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_y8,
+ &position_x8, &skip_64, mvs);
+ // Early termination #2 if all are skips.
+ // Chance is typically ~15-25% after Early termination #1.
+ if (skip_64 == -1) continue;
+ const int16x8_t p_y = vmovl_s8(position_y8);
+ const int16x8_t p_x = vmovl_s8(position_x8);
+ const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
+ const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
+ if (skip_64 == 0) {
+ // Store all. Chance is typically ~70-85% after Early termination #2.
+ Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // Chance is typically ~15-30% after Early termination #2.
+ // The compiler is smart enough to not create the local buffer skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ }
+ }
+
+ // The following leftover processing cannot be moved out of the do...while
+ // loop. Doing so could change the order in which results are stored to the
+ // same position.
+ if (leftover > 0) {
+ // Use SIMD only when leftover is at least 4, and there are at least 8
+ // elements in a row.
+ if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
+ // Process the last 8 elements to avoid loading invalid memory. Some
+ // elements may have been processed in the above loop, which is OK.
+ const int delta = 8 - leftover;
+ x8 = adjusted_x8_end - 8;
+ const int8x8_t source_reference_type8 = vld1_s8(
+ reinterpret_cast<const int8_t*>(source_reference_types + x8));
+ const int8x8_t skip_r =
+ vtbl1_s8(skip_reference, source_reference_type8);
+ const int64_t early_skip =
+ vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
+ // Early termination #1 if all are skips.
+ if (early_skip != -1) {
+ int64_t skip_64;
+ int8x8_t r, position_x8, position_y8;
+ int32x4_t mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign,
+ x8_start, x8_end, x8, r_offsets, source_reference_type8,
+ skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
+ &position_y8, &position_x8, &skip_64, mvs);
+ // Early termination #2 if all are skips.
+ if (skip_64 != -1) {
+ const int16x8_t p_y = vmovl_s8(position_y8);
+ const int16x8_t p_x = vmovl_s8(position_x8);
+ const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
+ const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
+ // Store up to 7 elements since leftover is at most 7.
+ if (skip_64 == 0) {
+ // Store all.
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // The compiler is smart enough to not create the local buffer
+ // skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ }
+ }
+ }
+ } else {
+ for (; x8 < adjusted_x8_end; ++x8) {
+ const int source_reference_type = source_reference_types[x8];
+ if (skip_references[source_reference_type]) continue;
+ MotionVector projection_mv;
+ // reference_to_current_with_sign could be 0.
+ GetMvProjection(mv[x8], reference_to_current_with_sign,
+ projection_divisions[source_reference_type],
+ &projection_mv);
+ // Do not update the motion vector if the block position is not valid
+ // or if position_x8 is outside the current range of x8_start and
+ // x8_end. Note that position_y8 will always be within the range of
+ // y8_start and y8_end.
+ const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+ if (position_y8 < y8_floor || position_y8 >= y8_ceiling) continue;
+ const int x8_base = x8 & ~7;
+ const int x8_floor =
+ std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+ const int x8_ceiling =
+ std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+ const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+ if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+ dst_mv[position_y8 * stride + position_x8] = mv[x8];
+ dst_reference_offset[position_y8 * stride + position_x8] =
+ reference_offsets[source_reference_type];
+ }
+ }
+ }
+
+ source_reference_types += stride;
+ mv += stride;
+ dst_reference_offset += stride;
+ dst_mv += stride;
+ } while (++y8 < y8_end);
+}
+
+} // namespace
+
+void MotionFieldProjectionInit_NEON() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON;
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void MotionFieldProjectionInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
+ const int32x4_t numerator) {
+ const int32x4_t m0 = vmull_s16(mv, denominator);
+ const int32x4_t m = vmulq_s32(m0, numerator);
+ // Add the sign (0 or -1) to round towards zero.
+ const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
+ return vqrshrn_n_s32(add_sign, 14);
+}
+
+inline int16x4_t MvProjectionCompound(const int16x4_t mv,
+ const int temporal_reference_offsets,
+ const int reference_offsets[2]) {
+ const int16x4_t denominator =
+ vdup_n_s16(kProjectionMvDivisionLookup[temporal_reference_offsets]);
+ const int32x2_t offset = vld1_s32(reference_offsets);
+ const int32x2x2_t offsets = vzip_s32(offset, offset);
+ const int32x4_t numerator = vcombine_s32(offsets.val[0], offsets.val[1]);
+ return MvProjection(mv, denominator, numerator);
+}
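+
+// The zip above forms the numerator {offset[0], offset[0], offset[1],
+// offset[1]}, so the {y, x} pair of one temporal motion vector is projected
+// against both compound references in a single 4-lane multiply.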
+
+inline int16x8_t ProjectionClip(const int16x4_t mv0, const int16x4_t mv1) {
+ const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
+ const int16x8_t mv = vcombine_s16(mv0, mv1);
+ const int16x8_t clamp = vminq_s16(mv, projection_mv_clamp);
+ return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
+}
+
+inline int16x8_t MvProjectionCompoundClip(
+ const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+ const int reference_offsets[2]) {
+ const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+ const int32x2_t temporal_mv = vld1_s32(tmvs);
+ const int16x4_t tmv0 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 0));
+ const int16x4_t tmv1 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 1));
+ const int16x4_t mv0 = MvProjectionCompound(
+ tmv0, temporal_reference_offsets[0], reference_offsets);
+ const int16x4_t mv1 = MvProjectionCompound(
+ tmv1, temporal_reference_offsets[1], reference_offsets);
+ return ProjectionClip(mv0, mv1);
+}
+
+inline int16x8_t MvProjectionSingleClip(
+ const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+ const int reference_offset, int16x4_t* const lookup) {
+ const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
+ const int16x8_t temporal_mv = vld1q_s16(tmvs);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[0]], *lookup, 0);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[1]], *lookup, 1);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[2]], *lookup, 2);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[3]], *lookup, 3);
+ const int16x4x2_t denominator = vzip_s16(*lookup, *lookup);
+ const int16x4_t tmv0 = vget_low_s16(temporal_mv);
+ const int16x4_t tmv1 = vget_high_s16(temporal_mv);
+ const int32x4_t numerator = vdupq_n_s32(reference_offset);
+ const int16x4_t mv0 = MvProjection(tmv0, denominator.val[0], numerator);
+ const int16x4_t mv1 = MvProjection(tmv1, denominator.val[1], numerator);
+ return ProjectionClip(mv0, mv1);
+}
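+
+// The division factors are loaded lane by lane above because NEON has no
+// gather loads; keeping |lookup| in the caller lets the compiler hold the
+// vector in a register across loop iterations.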
+
+inline void LowPrecision(const int16x8_t mv, void* const candidate_mvs) {
+ const int16x8_t kRoundDownMask = vdupq_n_s16(1);
+ const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
+ const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
+ const int16x8_t mv1 = vbicq_s16(mv0, kRoundDownMask);
+ vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv1);
+}
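+
+// A scalar sketch of the low-precision rounding above: odd components move
+// one unit towards zero before bit 0 is cleared. The helper name is
+// illustrative.
+//   int16_t LowPrecisionScalar(int16_t mv) {
+//     return (mv + (mv < 0 ? 1 : 0)) & ~1;
+//   }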
+
+inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) {
+ const int16x8_t kRoundDownMask = vdupq_n_s16(7);
+ const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
+ const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
+ const int16x8_t mv1 = vaddq_s16(mv0, vdupq_n_s16(3));
+ const int16x8_t mv2 = vbicq_s16(mv1, kRoundDownMask);
+ vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv2);
+}
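+
+// A scalar sketch of the force-integer rounding above: each component is
+// rounded to the nearest multiple of 8 (integer pel), with ties broken
+// towards zero. The helper name is illustrative.
+//   int16_t ForceIntegerScalar(int16_t mv) {
+//     return (mv + 3 + (mv < 0 ? 1 : 0)) & ~7;
+//   }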
+
+void MvProjectionCompoundLowPrecision_NEON(
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+ // A non-zero check on |reference_offsets| would almost always pass, so it
+ // is omitted. To help the compiler, make a local copy of
+ // |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One extra element may be computed when |count| is odd.
+ int loop_count = (count + 1) >> 1;
+ do {
+ const int16x8_t mv = MvProjectionCompoundClip(
+ temporal_mvs, temporal_reference_offsets, offsets);
+ LowPrecision(mv, candidate_mvs);
+ temporal_mvs += 2;
+ temporal_reference_offsets += 2;
+ candidate_mvs += 2;
+ } while (--loop_count != 0);
+}
+
+void MvProjectionCompoundForceInteger_NEON(
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+ // A non-zero check on |reference_offsets| would almost always pass, so it
+ // is omitted. To help the compiler, make a local copy of
+ // |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One extra element may be computed when |count| is odd.
+ int loop_count = (count + 1) >> 1;
+ do {
+ const int16x8_t mv = MvProjectionCompoundClip(
+ temporal_mvs, temporal_reference_offsets, offsets);
+ ForceInteger(mv, candidate_mvs);
+ temporal_mvs += 2;
+ temporal_reference_offsets += 2;
+ candidate_mvs += 2;
+ } while (--loop_count != 0);
+}
+
+void MvProjectionCompoundHighPrecision_NEON(
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+ // A non-zero check on |reference_offsets| would almost always pass, so it
+ // is omitted. To help the compiler, make a local copy of
+ // |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One extra element may be computed when |count| is odd.
+ int loop_count = (count + 1) >> 1;
+ do {
+ const int16x8_t mv = MvProjectionCompoundClip(
+ temporal_mvs, temporal_reference_offsets, offsets);
+ vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
+ temporal_mvs += 2;
+ temporal_reference_offsets += 2;
+ candidate_mvs += 2;
+ } while (--loop_count != 0);
+}
+
+void MvProjectionSingleLowPrecision_NEON(
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offset, const int count,
+ MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+ // Up to three extra elements may be computed when |count| is not a multiple
+ // of 4.
+ int loop_count = (count + 3) >> 2;
+ int16x4_t lookup = vdup_n_s16(0);
+ do {
+ const int16x8_t mv = MvProjectionSingleClip(
+ temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+ LowPrecision(mv, candidate_mvs);
+ temporal_mvs += 4;
+ temporal_reference_offsets += 4;
+ candidate_mvs += 4;
+ } while (--loop_count != 0);
+}
+
+void MvProjectionSingleForceInteger_NEON(
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offset, const int count,
+ MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+ // Up to three extra elements may be computed when |count| is not a multiple
+ // of 4.
+ int loop_count = (count + 3) >> 2;
+ int16x4_t lookup = vdup_n_s16(0);
+ do {
+ const int16x8_t mv = MvProjectionSingleClip(
+ temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+ ForceInteger(mv, candidate_mvs);
+ temporal_mvs += 4;
+ temporal_reference_offsets += 4;
+ candidate_mvs += 4;
+ } while (--loop_count != 0);
+}
+
+void MvProjectionSingleHighPrecision_NEON(
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offset, const int count,
+ MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+ // Up to three extra elements may be computed when |count| is not a multiple
+ // of 4.
+ int loop_count = (count + 3) >> 2;
+ int16x4_t lookup = vdup_n_s16(0);
+ do {
+ const int16x8_t mv = MvProjectionSingleClip(
+ temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+ vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
+ temporal_mvs += 4;
+ temporal_reference_offsets += 4;
+ candidate_mvs += 4;
+ } while (--loop_count != 0);
+}
+
+} // namespace
+
+void MotionVectorSearchInit_NEON() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON;
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void MotionVectorSearchInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+#include "src/dsp/obmc.inc"
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+inline void WriteObmcLine4(uint8_t* LIBGAV1_RESTRICT const pred,
+ const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
+ const uint8x8_t pred_mask,
+ const uint8x8_t obmc_pred_mask) {
+ const uint8x8_t pred_val = Load4(pred);
+ const uint8x8_t obmc_pred_val = Load4(obmc_pred);
+ const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+ const uint8x8_t result =
+ vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ StoreLo4(pred, result);
+}
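+
+// All OBMC blends in this file compute, per pixel and with rounding,
+//   result = (pred * mask + obmc_pred * (64 - mask) + 32) >> 6
+// where mask comes from kObmcMask and 64 - mask is its complement.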
+
+inline void WriteObmcLine8(uint8_t* LIBGAV1_RESTRICT const pred,
+ const uint8x8_t obmc_pred_val,
+ const uint8x8_t pred_mask,
+ const uint8x8_t obmc_pred_mask) {
+ const uint8x8_t pred_val = vld1_u8(pred);
+ const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+ const uint8x8_t result =
+ vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ vst1_u8(pred, result);
+}
+
+inline void OverlapBlendFromLeft2xH_NEON(
+ uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+ const ptrdiff_t obmc_prediction_stride) {
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ const uint8x8_t pred_mask = Load2(kObmcMask);
+ const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ uint8x8_t pred_val = vdup_n_u8(0);
+ uint8x8_t obmc_pred_val = vdup_n_u8(0);
+ int y = 0;
+ do {
+ pred_val = Load2<0>(pred, pred_val);
+ const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+ obmc_pred_val = Load2<0>(obmc_pred, obmc_pred_val);
+ const uint8x8_t result =
+ vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ Store2<0>(pred, result);
+
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y != height);
+}
+
+inline void OverlapBlendFromLeft4xH_NEON(
+ uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+ const ptrdiff_t obmc_prediction_stride) {
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ const uint8x8_t pred_mask = Load4(kObmcMask + 2);
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ int y = 0;
+ do {
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ y += 2;
+ } while (y != height);
+}
+
+inline void OverlapBlendFromLeft8xH_NEON(
+ uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) {
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6);
+ constexpr int obmc_prediction_stride = 8;
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ int y = 0;
+ do {
+ const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
+ WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+
+ WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ pred += prediction_stride;
+
+ obmc_pred += obmc_prediction_stride << 1;
+ y += 2;
+ } while (y != height);
+}
+
+void OverlapBlendFromLeft_NEON(
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint8_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+ assert(width >= 2);
+ assert(height >= 4);
+
+ if (width == 2) {
+ OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 8) {
+ OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred);
+ return;
+ }
+ const uint8x16_t mask_inverter = vdupq_n_u8(64);
+ const uint8_t* mask = kObmcMask + width - 2;
+ int x = 0;
+ do {
+ pred = static_cast<uint8_t*>(prediction) + x;
+ obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
+ const uint8x16_t pred_mask = vld1q_u8(mask + x);
+ // 64 - mask
+ const uint8x16_t obmc_pred_mask = vsubq_u8(mask_inverter, pred_mask);
+ int y = 0;
+ do {
+ const uint8x16_t pred_val = vld1q_u8(pred);
+ const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
+ const uint16x8_t weighted_pred_lo =
+ vmull_u8(vget_low_u8(pred_mask), vget_low_u8(pred_val));
+ const uint8x8_t result_lo =
+ vrshrn_n_u16(vmlal_u8(weighted_pred_lo, vget_low_u8(obmc_pred_mask),
+ vget_low_u8(obmc_pred_val)),
+ 6);
+ const uint16x8_t weighted_pred_hi =
+ vmull_u8(vget_high_u8(pred_mask), vget_high_u8(pred_val));
+ const uint8x8_t result_hi =
+ vrshrn_n_u16(vmlal_u8(weighted_pred_hi, vget_high_u8(obmc_pred_mask),
+ vget_high_u8(obmc_pred_val)),
+ 6);
+ vst1q_u8(pred, vcombine_u8(result_lo, result_hi));
+
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y < height);
+ x += 16;
+ } while (x < width);
+}
+
+inline void OverlapBlendFromTop4x4_NEON(
+ uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+ const ptrdiff_t obmc_prediction_stride, const int height) {
+ uint8x8_t pred_mask = vdup_n_u8(kObmcMask[height - 2]);
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ if (height == 2) {
+ return;
+ }
+
+ pred_mask = vdup_n_u8(kObmcMask[3]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(kObmcMask[4]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+}
+
+inline void OverlapBlendFromTop4xH_NEON(
+ uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+ const ptrdiff_t obmc_prediction_stride) {
+ if (height < 8) {
+ OverlapBlendFromTop4x4_NEON(pred, prediction_stride, obmc_pred,
+ obmc_prediction_stride, height);
+ return;
+ }
+ const uint8_t* mask = kObmcMask + height - 2;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ int y = 0;
+ // Compute 6 lines for height 8, or 12 lines for height 16. The remaining
+ // lines are unchanged as the corresponding mask value is 64.
+ do {
+ uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+ uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 1]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 2]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 3]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 4]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vdup_n_u8(mask[y + 5]);
+ obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ // Advance past the six mask values consumed above.
+ y += 6;
+ } while (y < height - 4);
+}
+
+inline void OverlapBlendFromTop8xH_NEON(
+ uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) {
+ constexpr int obmc_prediction_stride = 8;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ const uint8_t* mask = kObmcMask + height - 2;
+ const int compute_height = height - (height >> 2);
+ int y = 0;
+ do {
+ const uint8x8_t pred_mask0 = vdup_n_u8(mask[y]);
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask0 = vsub_u8(mask_inverter, pred_mask0);
+ const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
+
+ WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask0,
+ obmc_pred_mask0);
+ pred += prediction_stride;
+ ++y;
+
+ const uint8x8_t pred_mask1 = vdup_n_u8(mask[y]);
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask1 = vsub_u8(mask_inverter, pred_mask1);
+ WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask1,
+ obmc_pred_mask1);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ } while (++y < compute_height);
+}
+
+void OverlapBlendFromTop_NEON(
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint8_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+ assert(width >= 4);
+ assert(height >= 2);
+
+ if (width == 4) {
+ OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+
+ if (width == 8) {
+ OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred);
+ return;
+ }
+
+ const uint8_t* mask = kObmcMask + height - 2;
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ // Stop once the mask value reaches 64, which leaves |pred| unchanged; the
+ // 4xH path handles this implicitly.
+ const int compute_height = height - (height >> 2);
+ int y = 0;
+ do {
+ const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+ int x = 0;
+ do {
+ const uint8x16_t pred_val = vld1q_u8(pred + x);
+ const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred + x);
+ const uint16x8_t weighted_pred_lo =
+ vmull_u8(pred_mask, vget_low_u8(pred_val));
+ const uint8x8_t result_lo =
+ vrshrn_n_u16(vmlal_u8(weighted_pred_lo, obmc_pred_mask,
+ vget_low_u8(obmc_pred_val)),
+ 6);
+ const uint16x8_t weighted_pred_hi =
+ vmull_u8(pred_mask, vget_high_u8(pred_val));
+ const uint8x8_t result_hi =
+ vrshrn_n_u16(vmlal_u8(weighted_pred_hi, obmc_pred_mask,
+ vget_high_u8(obmc_pred_val)),
+ 6);
+ vst1q_u8(pred + x, vcombine_u8(result_lo, result_hi));
+
+ x += 16;
+ } while (x < width);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y < compute_height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON;
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// This is a flat array of masks for each block dimension from 2 to 32. The
+// starting index for each length is length-2. The value 64 leaves the result
+// equal to |pred| and may be ignored if convenient. Vector loads may overread
+// values meant for larger sizes, but these values will be unused.
+constexpr uint16_t kObmcMask[62] = {
+ // Obmc Mask 2
+ 45, 64,
+ // Obmc Mask 4
+ 39, 50, 59, 64,
+ // Obmc Mask 8
+ 36, 42, 48, 53, 57, 61, 64, 64,
+ // Obmc Mask 16
+ 34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64,
+ // Obmc Mask 32
+ 33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58,
+ 59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64};
+
+inline uint16x4_t BlendObmc2Or4(uint16_t* const pred,
+ const uint16x4_t obmc_pred_val,
+ const uint16x4_t pred_mask,
+ const uint16x4_t obmc_pred_mask) {
+ const uint16x4_t pred_val = vld1_u16(pred);
+ const uint16x4_t weighted_pred = vmul_u16(pred_mask, pred_val);
+ const uint16x4_t result =
+ vrshr_n_u16(vmla_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ return result;
+}
+
+inline uint16x8_t BlendObmc8(uint16_t* LIBGAV1_RESTRICT const pred,
+ const uint16_t* LIBGAV1_RESTRICT const obmc_pred,
+ const uint16x8_t pred_mask,
+ const uint16x8_t obmc_pred_mask) {
+ const uint16x8_t pred_val = vld1q_u16(pred);
+ const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
+ const uint16x8_t weighted_pred = vmulq_u16(pred_mask, pred_val);
+ const uint16x8_t result =
+ vrshrq_n_u16(vmlaq_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ return result;
+}
+
+inline void OverlapBlendFromLeft2xH_NEON(
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
+ constexpr int obmc_prediction_stride = 2;
+ const uint16x4_t mask_inverter = vdup_n_u16(64);
+ // The upper two lanes are unused.
+ const uint16x4_t pred_mask = vld1_u16(kObmcMask);
+ const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
+ int y = 0;
+ do {
+ const uint16x4_t obmc_pred_0 = vld1_u16(obmc_pred);
+ const uint16x4_t result_0 =
+ BlendObmc2Or4(pred, obmc_pred_0, pred_mask, obmc_pred_mask);
+ Store2<0>(pred, result_0);
+
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ const uint16x4_t obmc_pred_1 = vld1_u16(obmc_pred);
+ const uint16x4_t result_1 =
+ BlendObmc2Or4(pred, obmc_pred_1, pred_mask, obmc_pred_mask);
+ Store2<0>(pred, result_1);
+
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ y += 2;
+ } while (y != height);
+}
+
+inline void OverlapBlendFromLeft4xH_NEON(
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
+ constexpr int obmc_prediction_stride = 4;
+ const uint16x4_t mask_inverter = vdup_n_u16(64);
+ const uint16x4_t pred_mask = vld1_u16(kObmcMask + 2);
+ // 64 - mask
+ const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
+ int y = 0;
+ do {
+ const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
+ const uint16x4_t result_0 = BlendObmc2Or4(pred, vget_low_u16(obmc_pred_val),
+ pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result_0);
+ pred = AddByteStride(pred, prediction_stride);
+
+ const uint16x4_t result_1 = BlendObmc2Or4(
+ pred, vget_high_u16(obmc_pred_val), pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result_1);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
+
+ y += 2;
+ } while (y != height);
+}
+
+void OverlapBlendFromLeft_NEON(
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint16_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+ assert(width >= 2);
+ assert(height >= 4);
+
+ if (width == 2) {
+ OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred);
+ return;
+ }
+ const uint16x8_t mask_inverter = vdupq_n_u16(64);
+ const uint16_t* mask = kObmcMask + width - 2;
+ int x = 0;
+ do {
+ uint16_t* pred_x = pred + x;
+ const uint16_t* obmc_pred_x = obmc_pred + x;
+ const uint16x8_t pred_mask = vld1q_u16(mask + x);
+ // 64 - mask
+ const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+ int y = 0;
+ do {
+ const uint16x8_t result =
+ BlendObmc8(pred_x, obmc_pred_x, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred_x, result);
+
+ pred_x = AddByteStride(pred_x, prediction_stride);
+ obmc_pred_x = AddByteStride(obmc_pred_x, obmc_prediction_stride);
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+template <int lane>
+inline uint16x4_t BlendObmcFromTop4(uint16_t* const pred,
+ const uint16x4_t obmc_pred_val,
+ const uint16x8_t pred_mask,
+ const uint16x8_t obmc_pred_mask) {
+ const uint16x4_t pred_val = vld1_u16(pred);
+ const uint16x4_t weighted_pred = VMulLaneQU16<lane>(pred_val, pred_mask);
+ const uint16x4_t result = vrshr_n_u16(
+ VMlaLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
+ return result;
+}
+
+template <int lane>
+inline uint16x8_t BlendObmcFromTop8(
+ uint16_t* LIBGAV1_RESTRICT const pred,
+ const uint16_t* LIBGAV1_RESTRICT const obmc_pred,
+ const uint16x8_t pred_mask, const uint16x8_t obmc_pred_mask) {
+ const uint16x8_t pred_val = vld1q_u16(pred);
+ const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
+ const uint16x8_t weighted_pred = VMulQLaneQU16<lane>(pred_val, pred_mask);
+ const uint16x8_t result = vrshrq_n_u16(
+ VMlaQLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
+ return result;
+}
+
+inline void OverlapBlendFromTop4x2Or4_NEON(
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) {
+ constexpr int obmc_prediction_stride = 4;
+ const uint16x8_t pred_mask = vld1q_u16(&kObmcMask[height - 2]);
+ const uint16x8_t mask_inverter = vdupq_n_u16(64);
+ const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+ const uint16x8_t obmc_pred_val_0 = vld1q_u16(obmc_pred);
+ uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val_0),
+ pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+
+ if (height == 2) {
+ // Mask value is 64, meaning |pred| is unchanged.
+ return;
+ }
+
+ result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val_0), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
+
+ const uint16x4_t obmc_pred_val_2 = vld1_u16(obmc_pred);
+ result =
+ BlendObmcFromTop4<2>(pred, obmc_pred_val_2, pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result);
+}
+
+inline void OverlapBlendFromTop4xH_NEON(
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
+ if (height < 8) {
+ OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, height);
+ return;
+ }
+ constexpr int obmc_prediction_stride = 4;
+ const uint16_t* mask = kObmcMask + height - 2;
+ const uint16x8_t mask_inverter = vdupq_n_u16(64);
+ int y = 0;
+ // Compute 6 lines for height 8, or 12 lines for height 16. The remaining
+ // lines are unchanged as the corresponding mask value is 64.
+ do {
+ const uint16x8_t pred_mask = vld1q_u16(&mask[y]);
+ const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+ // Load obmc row 0, 1.
+ uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
+ uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val),
+ pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+
+ result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
+
+ // Load obmc row 2, 3.
+ obmc_pred_val = vld1q_u16(obmc_pred);
+ result = BlendObmcFromTop4<2>(pred, vget_low_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+
+ result = BlendObmcFromTop4<3>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
+
+ // Load obmc row 4, 5.
+ obmc_pred_val = vld1q_u16(obmc_pred);
+ result = BlendObmcFromTop4<4>(pred, vget_low_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+
+ result = BlendObmcFromTop4<5>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
+
+ // Advance past the six mask values consumed above.
+ y += 6;
+ } while (y < height - 4);
+}
+
+inline void OverlapBlendFromTop8xH_NEON(
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) {
+ const uint16_t* mask = kObmcMask + height - 2;
+ const uint16x8_t mask_inverter = vdupq_n_u16(64);
+ uint16x8_t pred_mask = vld1q_u16(mask);
+ uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+ uint16x8_t result =
+ BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ if (height == 2) return;
+
+ constexpr int obmc_prediction_stride = 8;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ if (height == 4) return;
+
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+
+ if (height == 8) return;
+
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vld1q_u16(&mask[8]);
+ obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+
+ result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+
+ if (height == 16) return;
+
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ pred_mask = vld1q_u16(&mask[16]);
+ obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+
+ result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride;
+
+ result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred, result);
+}
+
+void OverlapBlendFromTop_NEON(
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint16_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+ assert(width >= 4);
+ assert(height >= 2);
+
+ if (width == 4) {
+ OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred);
+ return;
+ }
+
+ if (width == 8) {
+ OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, height);
+ return;
+ }
+
+ const uint16_t* mask = kObmcMask + height - 2;
+ const uint16x8_t mask_inverter = vdupq_n_u16(64);
+ const uint16x8_t pred_mask = vld1q_u16(mask);
+ // 64 - mask
+ const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+#define OBMC_ROW_FROM_TOP(n) \
+ do { \
+ int x = 0; \
+ do { \
+ const uint16x8_t result = BlendObmcFromTop8<n>( \
+ pred + x, obmc_pred + x, pred_mask, obmc_pred_mask); \
+ vst1q_u16(pred + x, result); \
+ \
+ x += 8; \
+ } while (x < width); \
+ } while (false)
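+
+// Each OBMC_ROW_FROM_TOP(n) expansion blends one full row of |width| pixels
+// using lane |n| of |pred_mask| and |obmc_pred_mask|.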
+
+ // Compute 1 row.
+ if (height == 2) {
+ OBMC_ROW_FROM_TOP(0);
+ return;
+ }
+
+ // Compute 3 rows.
+ if (height == 4) {
+ OBMC_ROW_FROM_TOP(0);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(1);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(2);
+ return;
+ }
+
+ // Compute 6 rows.
+ if (height == 8) {
+ OBMC_ROW_FROM_TOP(0);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(1);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(2);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(3);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(4);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(5);
+ return;
+ }
+
+ // Compute 12 rows.
+ if (height == 16) {
+ OBMC_ROW_FROM_TOP(0);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(1);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(2);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(3);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(4);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(5);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(6);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(7);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+
+ const uint16x8_t pred_mask = vld1q_u16(&mask[8]);
+ // 64 - mask
+ const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+ OBMC_ROW_FROM_TOP(0);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(1);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(2);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(3);
+ return;
+ }
+
+ // Stop when mask value becomes 64. This is a multiple of 8 for height 32
+ // and 64.
+ const int compute_height = height - (height >> 2);
+ int y = 0;
+ do {
+ const uint16x8_t pred_mask = vld1q_u16(&mask[y]);
+ // 64 - mask
+ const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+ OBMC_ROW_FROM_TOP(0);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(1);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(2);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(3);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(4);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(5);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(6);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+ OBMC_ROW_FROM_TOP(7);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
+
+ y += 8;
+ } while (y < compute_height);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON;
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void ObmcInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void ObmcInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend. This function is not thread-safe.
+void ObmcInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If NEON is enabled, signal the NEON implementation should be used.
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_ObmcVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ObmcHorizontal LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
+void SuperResCoefficients_NEON(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint8_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 3);
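+  // Each iteration computes the 8-tap filters for the next 8 output pixels.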
+ do {
+ uint8x8_t filter[8];
+ uint8x16_t d[kSuperResFilterTaps / 2];
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ filter[i] =
+ vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+ kSuperResExtraBits]);
+ }
+ Transpose8x8(filter, d);
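+    // After the transpose, each 16-byte vector d[i] holds taps 2i and 2i+1
+    // for all 8 pixels, matching the paired loads in SuperRes() below.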
+ vst1q_u8(dst, d[0]);
+ dst += 16;
+ vst1q_u8(dst, d[1]);
+ dst += 16;
+ vst1q_u8(dst, d[2]);
+ dst += 16;
+ vst1q_u8(dst, d[3]);
+ dst += 16;
+ } while (--x != 0);
+}
+
+// Maximum sum of positive taps: 171 = 7 + 86 + 71 + 7
+// Maximum sum: 255*171 == 0xAA55
+// The sum is clipped to [0, 255], so accumulating all positive taps and then
+// subtracting all negative taps with saturation is sufficient.
+// 0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
+ const uint8_t** coefficients) {
+ uint8x16_t f[kSuperResFilterTaps / 2];
+ for (int i = 0; i < kSuperResFilterTaps / 2; ++i, *coefficients += 16) {
+ f[i] = vld1q_u8(*coefficients);
+ }
+ uint16x8_t res = vmull_u8(src[1], vget_high_u8(f[0]));
+ res = vmlal_u8(res, src[3], vget_high_u8(f[1]));
+ res = vmlal_u8(res, src[4], vget_low_u8(f[2]));
+ res = vmlal_u8(res, src[6], vget_low_u8(f[3]));
+ uint16x8_t temp = vmull_u8(src[0], vget_low_u8(f[0]));
+ temp = vmlal_u8(temp, src[2], vget_low_u8(f[1]));
+ temp = vmlal_u8(temp, src[5], vget_high_u8(f[2]));
+ temp = vmlal_u8(temp, src[7], vget_high_u8(f[3]));
+ res = vqsubq_u16(res, temp);
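+  // The rounding narrow shifts by kFilterBits and saturates to [0, 255].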
+ return vqrshrn_n_u16(res, kFilterBits);
+}
+
+void SuperRes_NEON(const void* LIBGAV1_RESTRICT const coefficients,
+ void* LIBGAV1_RESTRICT const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
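+  // Start half the filter length before the first pixel; these reads land in
+  // the frame's extended left border.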
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint8_t*>(coefficients);
+ uint8_t* dst_ptr = dst;
+#if LIBGAV1_MSAN
+ // Initialize the padding area to prevent msan warnings.
+ const int super_res_right_border = kSuperResHorizontalPadding;
+#else
+ const int super_res_right_border = kSuperResHorizontalBorder;
+#endif
+ ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, super_res_right_border);
+ int subpixel_x = initial_subpixel_x;
+ uint8x8_t sr[8];
+ uint8x16_t s[8];
+ int x = RightShiftWithCeiling(upscaled_width, 4);
+    // The code below calculates up to 15 extra upscaled pixels, which will
+    // over-read up to 15 downscaled pixels at the end of each row.
+    // kSuperResHorizontalPadding accounts for this.
+ do {
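+      // Each output pixel gathers 8 consecutive source samples starting at
+      // its integer source position (subpixel_x >> kSuperResScaleBits).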
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
+ }
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ const uint8x8_t s_hi = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
+ s[i] = vcombine_u8(sr[i], s_hi);
+ }
+ Transpose8x16(s);
+      // Do not use a loop for the following 8 instructions, since the
+      // compiler will generate redundant code.
+ sr[0] = vget_low_u8(s[0]);
+ sr[1] = vget_low_u8(s[1]);
+ sr[2] = vget_low_u8(s[2]);
+ sr[3] = vget_low_u8(s[3]);
+ sr[4] = vget_low_u8(s[4]);
+ sr[5] = vget_low_u8(s[5]);
+ sr[6] = vget_low_u8(s[6]);
+ sr[7] = vget_low_u8(s[7]);
+ const uint8x8_t d0 = SuperRes(sr, &filter);
+      // Do not use a loop for the following 8 instructions, since the
+      // compiler will generate redundant code.
+ sr[0] = vget_high_u8(s[0]);
+ sr[1] = vget_high_u8(s[1]);
+ sr[2] = vget_high_u8(s[2]);
+ sr[3] = vget_high_u8(s[3]);
+ sr[4] = vget_high_u8(s[4]);
+ sr[5] = vget_high_u8(s[5]);
+ sr[6] = vget_high_u8(s[6]);
+ sr[7] = vget_high_u8(s[7]);
+ const uint8x8_t d1 = SuperRes(sr, &filter);
+ vst1q_u8(dst_ptr, vcombine_u8(d0, d1));
+ dst_ptr += 16;
+ } while (--x != 0);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+void Init8bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+ dsp->super_res_coefficients = SuperResCoefficients_NEON;
+ dsp->super_res = SuperRes_NEON;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void SuperResCoefficients_NEON(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint16_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ uint16x8_t filter[8];
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ const uint8x8_t filter_8 =
+ vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+ kSuperResExtraBits]);
+ // uint8_t -> uint16_t
+ filter[i] = vmovl_u8(filter_8);
+ }
+
+ Transpose8x8(filter);
+
+ vst1q_u16(dst, filter[0]);
+ dst += 8;
+ vst1q_u16(dst, filter[1]);
+ dst += 8;
+ vst1q_u16(dst, filter[2]);
+ dst += 8;
+ vst1q_u16(dst, filter[3]);
+ dst += 8;
+ vst1q_u16(dst, filter[4]);
+ dst += 8;
+ vst1q_u16(dst, filter[5]);
+ dst += 8;
+ vst1q_u16(dst, filter[6]);
+ dst += 8;
+ vst1q_u16(dst, filter[7]);
+ dst += 8;
+ } while (--x != 0);
+}
+
+// The sum is clipped to [0, (1 << bitdepth) - 1]. Accumulating all positive
+// taps and then subtracting all negative taps with saturation clips at zero.
+// 0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps],
+ const uint16_t** coefficients, int bitdepth) {
+ uint16x8_t f[kSuperResFilterTaps];
+ for (int i = 0; i < kSuperResFilterTaps; ++i, *coefficients += 8) {
+ f[i] = vld1q_u16(*coefficients);
+ }
+
+ uint32x4_t res_lo = vmull_u16(vget_low_u16(src[1]), vget_low_u16(f[1]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[3]), vget_low_u16(f[3]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[4]), vget_low_u16(f[4]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[6]), vget_low_u16(f[6]));
+
+ uint32x4_t temp_lo = vmull_u16(vget_low_u16(src[0]), vget_low_u16(f[0]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[2]), vget_low_u16(f[2]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[5]), vget_low_u16(f[5]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[7]), vget_low_u16(f[7]));
+
+ res_lo = vqsubq_u32(res_lo, temp_lo);
+
+ uint32x4_t res_hi = vmull_u16(vget_high_u16(src[1]), vget_high_u16(f[1]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[3]), vget_high_u16(f[3]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[4]), vget_high_u16(f[4]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[6]), vget_high_u16(f[6]));
+
+ uint32x4_t temp_hi = vmull_u16(vget_high_u16(src[0]), vget_high_u16(f[0]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[2]), vget_high_u16(f[2]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[5]), vget_high_u16(f[5]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[7]), vget_high_u16(f[7]));
+
+ res_hi = vqsubq_u32(res_hi, temp_hi);
+
+ const uint16x8_t res = vcombine_u16(vqrshrn_n_u32(res_lo, kFilterBits),
+ vqrshrn_n_u32(res_hi, kFilterBits));
+
+ // Clip the result at (1 << bd) - 1.
+ return vminq_u16(res, vdupq_n_u16((1 << bitdepth) - 1));
+}
+
+template <int bitdepth>
+void SuperRes_NEON(const void* LIBGAV1_RESTRICT const coefficients,
+ void* LIBGAV1_RESTRICT const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint16_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint16_t*>(coefficients);
+ uint16_t* dst_ptr = dst;
+#if LIBGAV1_MSAN
+ // Initialize the padding area to prevent msan warnings.
+ const int super_res_right_border = kSuperResHorizontalPadding;
+#else
+ const int super_res_right_border = kSuperResHorizontalBorder;
+#endif
+ ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, super_res_right_border);
+ int subpixel_x = initial_subpixel_x;
+ uint16x8_t sr[8];
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+    // The code below calculates up to 7 extra upscaled pixels, which will
+    // over-read up to 7 downscaled pixels at the end of each row.
+    // kSuperResHorizontalBorder accounts for this.
+ do {
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]);
+ }
+
+ Transpose8x8(sr);
+
+ const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth);
+ vst1q_u16(dst_ptr, d0);
+ dst_ptr += 8;
+ } while (--x != 0);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->super_res_coefficients = SuperResCoefficients_NEON;
+ dsp->super_res = SuperRes_NEON<10>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void SuperResInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void SuperResInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res. This function is not thread-safe.
+void SuperResInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+constexpr int kFirstPassOffset = 1 << 14;
+constexpr int kOffsetRemoval =
+ (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
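+// Each value produced by the horizontal pass carries a residual bias of
+// kFirstPassOffset >> kInterRoundBitsHorizontal (the 128 * 128 centering term
+// cancels out). The vertical taps sum to 128, so the vertical pass
+// accumulates 128 copies of that bias; kOffsetRemoval removes them in one
+// subtraction.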
+
+// Applies the horizontal filter to one source row and stores the result in
+// |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
+// |intermediate_result| two-dimensional array.
+//
+// src_row_centered contains 16 "centered" samples of a source row. (We center
+// the samples by subtracting 128 from each.)
+void HorizontalFilter(const int sx4, const int16_t alpha,
+ const int8x16_t src_row_centered,
+ int16_t intermediate_result_row[8]) {
+ int sx = sx4 - MultiplyBy4(alpha);
+ int8x8_t filter[8];
+ for (auto& f : filter) {
+ const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = vld1_s8(kWarpedFilters8[offset]);
+ sx += alpha;
+ }
+ Transpose8x8(filter);
+ // Add kFirstPassOffset to ensure |sum| stays within uint16_t.
+ // Add 128 (offset) * 128 (filter sum) (also 1 << 14) to account for the
+ // centering of the source samples. These combined are 1 << 15 or -32768.
+ int16x8_t sum =
+ vdupq_n_s16(static_cast<int16_t>(kFirstPassOffset + 128 * 128));
+ // Unrolled k = 0..7 loop. We need to manually unroll the loop because the
+ // third argument (an index value) to vextq_s8() must be a constant
+ // (immediate). src_row_window is a sliding window of length 8 into
+ // src_row_centered.
+ // k = 0.
+ int8x8_t src_row_window = vget_low_s8(src_row_centered);
+ sum = vmlal_s8(sum, filter[0], src_row_window);
+ // k = 1.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 1));
+ sum = vmlal_s8(sum, filter[1], src_row_window);
+ // k = 2.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 2));
+ sum = vmlal_s8(sum, filter[2], src_row_window);
+ // k = 3.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 3));
+ sum = vmlal_s8(sum, filter[3], src_row_window);
+ // k = 4.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 4));
+ sum = vmlal_s8(sum, filter[4], src_row_window);
+ // k = 5.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 5));
+ sum = vmlal_s8(sum, filter[5], src_row_window);
+ // k = 6.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 6));
+ sum = vmlal_s8(sum, filter[6], src_row_window);
+ // k = 7.
+ src_row_window = vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 7));
+ sum = vmlal_s8(sum, filter[7], src_row_window);
+ // End of unrolled k = 0..7 loop.
+ // Due to the offset |sum| is guaranteed to be unsigned.
+ uint16x8_t sum_unsigned = vreinterpretq_u16_s16(sum);
+ sum_unsigned = vrshrq_n_u16(sum_unsigned, kInterRoundBitsHorizontal);
+ // After the shift |sum_unsigned| will fit into int16_t.
+ vst1q_s16(intermediate_result_row, vreinterpretq_s16_u16(sum_unsigned));
+}
+
+template <bool is_compound>
+void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
+ const ptrdiff_t source_stride, const int source_width,
+ const int source_height,
+ const int* LIBGAV1_RESTRICT const warp_params,
+ const int subsampling_x, const int subsampling_y,
+ const int block_start_x, const int block_start_y,
+ const int block_width, const int block_height,
+ const int16_t alpha, const int16_t beta, const int16_t gamma,
+ const int16_t delta, void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+ union {
+    // |intermediate_result| is the output of the horizontal filtering and
+ // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
+ // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
+ // type so that we can multiply it by kWarpedFilters (which has signed
+ // values) using vmlal_s16().
+ int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
+ // In the simple special cases where the samples in each row are all the
+ // same, store one sample per row in a column vector.
+ int16_t intermediate_result_column[15];
+ };
+
+ const auto* const src = static_cast<const uint8_t*>(source);
+ using DestType =
+ typename std::conditional<is_compound, int16_t, uint8_t>::type;
+ auto* dst = static_cast<DestType*>(dest);
+
+ assert(block_width >= 8);
+ assert(block_height >= 8);
+
+  // The warp process is applied to each 8x8 block.
+ int start_y = block_start_y;
+ do {
+ int start_x = block_start_x;
+ do {
+ const int src_x = (start_x + 4) << subsampling_x;
+ const int src_y = (start_y + 4) << subsampling_y;
+ const WarpFilterParams filter_params = GetWarpFilterParams(
+ src_x, src_y, subsampling_x, subsampling_y, warp_params);
+ // A prediction block may fall outside the frame's boundaries. If a
+ // prediction block is calculated using only samples outside the frame's
+ // boundary, the filtering can be simplified. We can divide the plane
+ // into several regions and handle them differently.
+ //
+ // | |
+ // 1 | 3 | 1
+ // | |
+ // -------+-----------+-------
+ // |***********|
+ // 2 |*****4*****| 2
+ // |***********|
+ // -------+-----------+-------
+ // | |
+ // 1 | 3 | 1
+ // | |
+ //
+ // At the center, region 4 represents the frame and is the general case.
+ //
+ // In regions 1 and 2, the prediction block is outside the frame's
+ // boundary horizontally. Therefore the horizontal filtering can be
+      // simplified. Furthermore, in region 1 (at the four corners), the
+ // prediction is outside the frame's boundary both horizontally and
+ // vertically, so we get a constant prediction block.
+ //
+ // In region 3, the prediction block is outside the frame's boundary
+ // vertically. Unfortunately because we apply the horizontal filters
+ // first, by the time we apply the vertical filters, they no longer see
+ // simple inputs. So the only simplification is that all the rows are
+ // the same, but we still need to apply all the horizontal and vertical
+ // filters.
+
+ // Check for two simple special cases, where the horizontal filter can
+ // be significantly simplified.
+ //
+ // In general, for each row, the horizontal filter is calculated as
+ // follows:
+ // for (int x = -4; x < 4; ++x) {
+ // const int offset = ...;
+ // int sum = first_pass_offset;
+ // for (int k = 0; k < 8; ++k) {
+ // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+ // sum += kWarpedFilters[offset][k] * src_row[column];
+ // }
+ // ...
+ // }
+ // The column index before clipping, ix4 + x + k - 3, varies in the range
+ // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+ // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+ // border index (source_width - 1 or 0, respectively). Then for each x,
+ // the inner for loop of the horizontal filter is reduced to multiplying
+ // the border pixel by the sum of the filter coefficients.
+ if (filter_params.ix4 - 7 >= source_width - 1 ||
+ filter_params.ix4 + 7 <= 0) {
+ // Regions 1 and 2.
+ // Points to the left or right border of the first row of |src|.
+ const uint8_t* first_row_border =
+ (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
+ // Region 1.
+ // Every sample used to calculate the prediction block has the same
+ // value. So the whole prediction block has the same value.
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t row_border_pixel =
+ first_row_border[row * source_stride];
+
+ DestType* dst_row = dst + start_x - block_start_x;
+ for (int y = 0; y < 8; ++y) {
+ if (is_compound) {
+ const int16x8_t sum =
+ vdupq_n_s16(row_border_pixel << (kInterRoundBitsVertical -
+ kRoundBitsVertical));
+ vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+ } else {
+ memset(dst_row, row_border_pixel, 8);
+ }
+ dst_row += dest_stride;
+ }
+ // End of region 1. Continue the |start_x| do-while loop.
+ start_x += 8;
+ continue;
+ }
+
+ // Region 2.
+ // Horizontal filter.
+ // The input values in this region are generated by extending the border
+ // which makes them identical in the horizontal direction. This
+ // computation could be inlined in the vertical pass but most
+ // implementations will need a transpose of some sort.
+ // It is not necessary to use the offset values here because the
+ // horizontal pass is a simple shift and the vertical pass will always
+ // require using 32 bits.
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = filter_params.iy4 + y;
+ int sum = first_row_border[row * source_stride];
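+          // Every tap reads this same sample, so the filter reduces to the
+          // tap sum (1 << kFilterBits) and the rounding part of the
+          // horizontal shift would add nothing.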
+ sum <<= (kFilterBits - kInterRoundBitsHorizontal);
+ intermediate_result_column[y + 7] = sum;
+ }
+ // Vertical filter.
+ DestType* dst_row = dst + start_x - block_start_x;
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+#if defined(__aarch64__)
+ const int16x8_t intermediate =
+ vld1q_s16(&intermediate_result_column[y]);
+ int16_t tmp[8];
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ const int16x8_t filter = vld1q_s16(kWarpedFilters[offset]);
+ const int32x4_t product_low =
+ vmull_s16(vget_low_s16(filter), vget_low_s16(intermediate));
+ const int32x4_t product_high =
+ vmull_s16(vget_high_s16(filter), vget_high_s16(intermediate));
+ // vaddvq_s32 is only available on __aarch64__.
+ const int32_t sum =
+ vaddvq_s32(product_low) + vaddvq_s32(product_high);
+ const int16_t sum_descale =
+ RightShiftWithRounding(sum, kRoundBitsVertical);
+ if (is_compound) {
+ dst_row[x] = sum_descale;
+ } else {
+ tmp[x] = sum_descale;
+ }
+ sy += gamma;
+ }
+ if (!is_compound) {
+ const int16x8_t sum = vld1q_s16(tmp);
+ vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+ }
+#else // !defined(__aarch64__)
+ int16x8_t filter[8];
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ filter[x] = vld1q_s16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8(filter);
+ int32x4_t sum_low = vdupq_n_s32(0);
+ int32x4_t sum_high = sum_low;
+ for (int k = 0; k < 8; ++k) {
+ const int16_t intermediate = intermediate_result_column[y + k];
+ sum_low =
+ vmlal_n_s16(sum_low, vget_low_s16(filter[k]), intermediate);
+ sum_high =
+ vmlal_n_s16(sum_high, vget_high_s16(filter[k]), intermediate);
+ }
+ const int16x8_t sum =
+ vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+ vrshrn_n_s32(sum_high, kRoundBitsVertical));
+ if (is_compound) {
+ vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+ } else {
+ vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+ }
+#endif // defined(__aarch64__)
+ dst_row += dest_stride;
+ sy4 += delta;
+ }
+ // End of region 2. Continue the |start_x| do-while loop.
+ start_x += 8;
+ continue;
+ }
+
+ // Regions 3 and 4.
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
+ // Region 3.
+ // Horizontal filter.
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const uint8x16_t src_row_v = vld1q_u8(&src_row[filter_params.ix4 - 7]);
+ // Convert src_row_v to int8 (subtract 128).
+ const int8x16_t src_row_centered =
+ vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ HorizontalFilter(sx4, alpha, src_row_centered,
+ intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+ } else {
+ // Region 4.
+ // Horizontal filter.
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = filter_params.iy4 + y;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const uint8x16_t src_row_v =
+ vld1q_u8(&src_row[filter_params.ix4 - 7]);
+ // Convert src_row_v to int8 (subtract 128).
+ const int8x16_t src_row_centered =
+ vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
+ HorizontalFilter(sx4, alpha, src_row_centered,
+ intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+ }
+
+ // Regions 3 and 4.
+ // Vertical filter.
+ DestType* dst_row = dst + start_x - block_start_x;
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ int16x8_t filter[8];
+ for (auto& f : filter) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = vld1q_s16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8(filter);
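+        // Seed the accumulators with -kOffsetRemoval to cancel the bias added
+        // by HorizontalFilter() (see kOffsetRemoval above).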
+ int32x4_t sum_low = vdupq_n_s32(-kOffsetRemoval);
+ int32x4_t sum_high = sum_low;
+ for (int k = 0; k < 8; ++k) {
+ const int16x8_t intermediate = vld1q_s16(intermediate_result[y + k]);
+ sum_low = vmlal_s16(sum_low, vget_low_s16(filter[k]),
+ vget_low_s16(intermediate));
+ sum_high = vmlal_s16(sum_high, vget_high_s16(filter[k]),
+ vget_high_s16(intermediate));
+ }
+ const int16x8_t sum =
+ vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+ vrshrn_n_s32(sum_high, kRoundBitsVertical));
+ if (is_compound) {
+ vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+ } else {
+ vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+ }
+ dst_row += dest_stride;
+ sy4 += delta;
+ }
+ start_x += 8;
+ } while (start_x < block_start_x + block_width);
+ dst += 8 * dest_stride;
+ start_y += 8;
+ } while (start_y < block_start_y + block_height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->warp = Warp_NEON</*is_compound=*/false>;
+ dsp->warp_compound = Warp_NEON</*is_compound=*/true>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+LIBGAV1_ALWAYS_INLINE uint16x8x2_t LoadSrcRow(uint16_t const* ptr) {
+ uint16x8x2_t x;
+ // Clang/gcc uses ldp here.
+ x.val[0] = vld1q_u16(ptr);
+ x.val[1] = vld1q_u16(ptr + 8);
+ return x;
+}
+
+LIBGAV1_ALWAYS_INLINE void HorizontalFilter(
+ const int sx4, const int16_t alpha, const uint16x8x2_t src_row,
+ int16_t intermediate_result_row[8]) {
+ int sx = sx4 - MultiplyBy4(alpha);
+ int8x8_t filter8[8];
+ for (auto& f : filter8) {
+ const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = vld1_s8(kWarpedFilters8[offset]);
+ sx += alpha;
+ }
+
+ Transpose8x8(filter8);
+
+ int16x8_t filter[8];
+ for (int i = 0; i < 8; ++i) {
+ filter[i] = vmovl_s8(filter8[i]);
+ }
+
+ int32x4x2_t sum;
+ int16x8_t src_row_window;
+ // k = 0.
+ src_row_window = vreinterpretq_s16_u16(src_row.val[0]);
+ sum.val[0] = vmull_s16(vget_low_s16(filter[0]), vget_low_s16(src_row_window));
+ sum.val[1] = VMullHighS16(filter[0], src_row_window);
+ // k = 1.
+ src_row_window =
+ vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 1));
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[1]),
+ vget_low_s16(src_row_window));
+ sum.val[1] = VMlalHighS16(sum.val[1], filter[1], src_row_window);
+ // k = 2.
+ src_row_window =
+ vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 2));
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[2]),
+ vget_low_s16(src_row_window));
+ sum.val[1] = VMlalHighS16(sum.val[1], filter[2], src_row_window);
+ // k = 3.
+ src_row_window =
+ vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 3));
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[3]),
+ vget_low_s16(src_row_window));
+ sum.val[1] = VMlalHighS16(sum.val[1], filter[3], src_row_window);
+ // k = 4.
+ src_row_window =
+ vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 4));
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[4]),
+ vget_low_s16(src_row_window));
+ sum.val[1] = VMlalHighS16(sum.val[1], filter[4], src_row_window);
+ // k = 5.
+ src_row_window =
+ vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 5));
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[5]),
+ vget_low_s16(src_row_window));
+ sum.val[1] = VMlalHighS16(sum.val[1], filter[5], src_row_window);
+ // k = 6.
+ src_row_window =
+ vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 6));
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[6]),
+ vget_low_s16(src_row_window));
+ sum.val[1] = VMlalHighS16(sum.val[1], filter[6], src_row_window);
+ // k = 7.
+ src_row_window =
+ vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 7));
+ sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[7]),
+ vget_low_s16(src_row_window));
+ sum.val[1] = VMlalHighS16(sum.val[1], filter[7], src_row_window);
+ // End of unrolled k = 0..7 loop.
+
+ vst1_s16(intermediate_result_row,
+ vrshrn_n_s32(sum.val[0], kInterRoundBitsHorizontal));
+ vst1_s16(intermediate_result_row + 4,
+ vrshrn_n_s32(sum.val[1], kInterRoundBitsHorizontal));
+}
+
+template <bool is_compound>
+void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
+ const ptrdiff_t source_stride, const int source_width,
+ const int source_height,
+ const int* LIBGAV1_RESTRICT const warp_params,
+ const int subsampling_x, const int subsampling_y,
+ const int block_start_x, const int block_start_y,
+ const int block_width, const int block_height,
+ const int16_t alpha, const int16_t beta, const int16_t gamma,
+ const int16_t delta, void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+ union {
+    // |intermediate_result| is the output of the horizontal filtering and
+ // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
+ // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
+ // type so that we can multiply it by kWarpedFilters (which has signed
+ // values) using vmlal_s16().
+ int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
+ // In the simple special cases where the samples in each row are all the
+ // same, store one sample per row in a column vector.
+ int16_t intermediate_result_column[15];
+ };
+
+ const auto* const src = static_cast<const uint16_t*>(source);
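+  // |source_stride| and, for pixel output, |dest_stride| are given in bytes;
+  // halve them to index uint16_t samples. The compound output is int16_t with
+  // its stride already in elements, so it is used as-is.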
+ const ptrdiff_t src_stride = source_stride >> 1;
+ using DestType =
+ typename std::conditional<is_compound, int16_t, uint16_t>::type;
+ auto* dst = static_cast<DestType*>(dest);
+ const ptrdiff_t dst_stride = is_compound ? dest_stride : dest_stride >> 1;
+ assert(block_width >= 8);
+ assert(block_height >= 8);
+
+  // The warp process is applied to each 8x8 block.
+ int start_y = block_start_y;
+ do {
+ int start_x = block_start_x;
+ do {
+ const int src_x = (start_x + 4) << subsampling_x;
+ const int src_y = (start_y + 4) << subsampling_y;
+ const WarpFilterParams filter_params = GetWarpFilterParams(
+ src_x, src_y, subsampling_x, subsampling_y, warp_params);
+ // A prediction block may fall outside the frame's boundaries. If a
+ // prediction block is calculated using only samples outside the frame's
+ // boundary, the filtering can be simplified. We can divide the plane
+ // into several regions and handle them differently.
+ //
+ // | |
+ // 1 | 3 | 1
+ // | |
+ // -------+-----------+-------
+ // |***********|
+ // 2 |*****4*****| 2
+ // |***********|
+ // -------+-----------+-------
+ // | |
+ // 1 | 3 | 1
+ // | |
+ //
+ // At the center, region 4 represents the frame and is the general case.
+ //
+ // In regions 1 and 2, the prediction block is outside the frame's
+ // boundary horizontally. Therefore the horizontal filtering can be
+      // simplified. Furthermore, in region 1 (at the four corners), the
+ // prediction is outside the frame's boundary both horizontally and
+ // vertically, so we get a constant prediction block.
+ //
+ // In region 3, the prediction block is outside the frame's boundary
+ // vertically. Unfortunately because we apply the horizontal filters
+ // first, by the time we apply the vertical filters, they no longer see
+ // simple inputs. So the only simplification is that all the rows are
+ // the same, but we still need to apply all the horizontal and vertical
+ // filters.
+
+ // Check for two simple special cases, where the horizontal filter can
+ // be significantly simplified.
+ //
+ // In general, for each row, the horizontal filter is calculated as
+ // follows:
+ // for (int x = -4; x < 4; ++x) {
+ // const int offset = ...;
+ // int sum = first_pass_offset;
+ // for (int k = 0; k < 8; ++k) {
+ // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+ // sum += kWarpedFilters[offset][k] * src_row[column];
+ // }
+ // ...
+ // }
+ // The column index before clipping, ix4 + x + k - 3, varies in the range
+ // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+ // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+ // border index (source_width - 1 or 0, respectively). Then for each x,
+ // the inner for loop of the horizontal filter is reduced to multiplying
+ // the border pixel by the sum of the filter coefficients.
+ if (filter_params.ix4 - 7 >= source_width - 1 ||
+ filter_params.ix4 + 7 <= 0) {
+ // Regions 1 and 2.
+ // Points to the left or right border of the first row of |src|.
+ const uint16_t* first_row_border =
+ (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
+ // Region 1.
+ // Every sample used to calculate the prediction block has the same
+ // value. So the whole prediction block has the same value.
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint16_t row_border_pixel = first_row_border[row * src_stride];
+
+ DestType* dst_row = dst + start_x - block_start_x;
+ for (int y = 0; y < 8; ++y) {
+ if (is_compound) {
+ const int16x8_t sum =
+ vdupq_n_s16(row_border_pixel << (kInterRoundBitsVertical -
+ kRoundBitsVertical));
+ vst1q_s16(reinterpret_cast<int16_t*>(dst_row),
+ vaddq_s16(sum, vdupq_n_s16(kCompoundOffset)));
+ } else {
+ vst1q_u16(reinterpret_cast<uint16_t*>(dst_row),
+ vdupq_n_u16(row_border_pixel));
+ }
+ dst_row += dst_stride;
+ }
+ // End of region 1. Continue the |start_x| do-while loop.
+ start_x += 8;
+ continue;
+ }
+
+ // Region 2.
+ // Horizontal filter.
+ // The input values in this region are generated by extending the border
+ // which makes them identical in the horizontal direction. This
+ // computation could be inlined in the vertical pass but most
+ // implementations will need a transpose of some sort.
+ // It is not necessary to use the offset values here because the
+ // horizontal pass is a simple shift and the vertical pass will always
+ // require using 32 bits.
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = filter_params.iy4 + y;
+ int sum = first_row_border[row * src_stride];
+ sum <<= (kFilterBits - kInterRoundBitsHorizontal);
+ intermediate_result_column[y + 7] = sum;
+ }
+ // Vertical filter.
+ DestType* dst_row = dst + start_x - block_start_x;
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+#if defined(__aarch64__)
+ const int16x8_t intermediate =
+ vld1q_s16(&intermediate_result_column[y]);
+ int16_t tmp[8];
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ const int16x8_t filter = vld1q_s16(kWarpedFilters[offset]);
+ const int32x4_t product_low =
+ vmull_s16(vget_low_s16(filter), vget_low_s16(intermediate));
+ const int32x4_t product_high =
+ vmull_s16(vget_high_s16(filter), vget_high_s16(intermediate));
+ // vaddvq_s32 is only available on __aarch64__.
+ const int32_t sum =
+ vaddvq_s32(product_low) + vaddvq_s32(product_high);
+ const int16_t sum_descale =
+ RightShiftWithRounding(sum, kRoundBitsVertical);
+ if (is_compound) {
+ dst_row[x] = sum_descale + kCompoundOffset;
+ } else {
+ tmp[x] = sum_descale;
+ }
+ sy += gamma;
+ }
+ if (!is_compound) {
+ const uint16x8_t v_max_bitdepth =
+ vdupq_n_u16((1 << kBitdepth10) - 1);
+ const int16x8_t sum = vld1q_s16(tmp);
+ const uint16x8_t d0 =
+ vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum, vdupq_n_s16(0))),
+ v_max_bitdepth);
+ vst1q_u16(reinterpret_cast<uint16_t*>(dst_row), d0);
+ }
+#else // !defined(__aarch64__)
+ int16x8_t filter[8];
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ filter[x] = vld1q_s16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8(filter);
+ int32x4_t sum_low = vdupq_n_s32(0);
+ int32x4_t sum_high = sum_low;
+ for (int k = 0; k < 8; ++k) {
+ const int16_t intermediate = intermediate_result_column[y + k];
+ sum_low =
+ vmlal_n_s16(sum_low, vget_low_s16(filter[k]), intermediate);
+ sum_high =
+ vmlal_n_s16(sum_high, vget_high_s16(filter[k]), intermediate);
+ }
+ if (is_compound) {
+ const int16x8_t sum =
+ vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+ vrshrn_n_s32(sum_high, kRoundBitsVertical));
+ vst1q_s16(reinterpret_cast<int16_t*>(dst_row),
+ vaddq_s16(sum, vdupq_n_s16(kCompoundOffset)));
+ } else {
+ const uint16x4_t v_max_bitdepth =
+ vdup_n_u16((1 << kBitdepth10) - 1);
+ const uint16x4_t d0 = vmin_u16(
+ vqrshrun_n_s32(sum_low, kRoundBitsVertical), v_max_bitdepth);
+ const uint16x4_t d1 = vmin_u16(
+ vqrshrun_n_s32(sum_high, kRoundBitsVertical), v_max_bitdepth);
+ vst1_u16(reinterpret_cast<uint16_t*>(dst_row), d0);
+ vst1_u16(reinterpret_cast<uint16_t*>(dst_row + 4), d1);
+ }
+#endif // defined(__aarch64__)
+ dst_row += dst_stride;
+ sy4 += delta;
+ }
+ // End of region 2. Continue the |start_x| do-while loop.
+ start_x += 8;
+ continue;
+ }
+
+ // Regions 3 and 4.
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
+ // Region 3.
+ // Horizontal filter.
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint16_t* const src_row = src + row * src_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 pixels before src_row[0] or up to 14
+ // pixels after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 pixels that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding pixel after the right border of the last source row.
+ const uint16x8x2_t src_row_v =
+ LoadSrcRow(&src_row[filter_params.ix4 - 7]);
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+ } else {
+ // Region 4.
+ // Horizontal filter.
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = filter_params.iy4 + y;
+ const uint16_t* const src_row = src + row * src_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+          // NOTE: This may read up to 13 pixels before src_row[0] or up to
+          // 14 pixels after src_row[source_width - 1]. We assume the source
+ // frame has left and right borders of at least 13 pixels that extend
+ // the frame boundary pixels. We also assume there is at least one
+ // extra padding pixel after the right border of the last source row.
+ const uint16x8x2_t src_row_v =
+ LoadSrcRow(&src_row[filter_params.ix4 - 7]);
+ HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+ }
+
+ // Regions 3 and 4.
+ // Vertical filter.
+ DestType* dst_row = dst + start_x - block_start_x;
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ int16x8_t filter[8];
+ for (auto& f : filter) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = vld1q_s16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8(filter);
+ int32x4_t sum_low = vdupq_n_s32(0);
+ int32x4_t sum_high = sum_low;
+ for (int k = 0; k < 8; ++k) {
+ const int16x8_t intermediate = vld1q_s16(intermediate_result[y + k]);
+ sum_low = vmlal_s16(sum_low, vget_low_s16(filter[k]),
+ vget_low_s16(intermediate));
+ sum_high = vmlal_s16(sum_high, vget_high_s16(filter[k]),
+ vget_high_s16(intermediate));
+ }
+ if (is_compound) {
+ const int16x8_t sum =
+ vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+ vrshrn_n_s32(sum_high, kRoundBitsVertical));
+ vst1q_s16(reinterpret_cast<int16_t*>(dst_row),
+ vaddq_s16(sum, vdupq_n_s16(kCompoundOffset)));
+ } else {
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+ const uint16x4_t d0 = vmin_u16(
+ vqrshrun_n_s32(sum_low, kRoundBitsVertical), v_max_bitdepth);
+ const uint16x4_t d1 = vmin_u16(
+ vqrshrun_n_s32(sum_high, kRoundBitsVertical), v_max_bitdepth);
+ vst1_u16(reinterpret_cast<uint16_t*>(dst_row), d0);
+ vst1_u16(reinterpret_cast<uint16_t*>(dst_row + 4), d1);
+ }
+ dst_row += dst_stride;
+ sy4 += delta;
+ }
+ start_x += 8;
+ } while (start_x < block_start_x + block_width);
+ dst += 8 * dst_stride;
+ start_y += 8;
+ } while (start_y < block_start_y + block_height);
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->warp = Warp_NEON</*is_compound=*/false>;
+ dsp->warp_compound = Warp_NEON</*is_compound=*/true>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void WarpInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void WarpInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::warp. This function is not thread-safe.
+void WarpInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_Warp LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WarpCompound LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/arm/weight_mask_neon.h"
+
+#include "src/dsp/weight_mask.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline int16x8x2_t LoadPred(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1) {
+ const int16x8x2_t pred = {vld1q_s16(prediction_0), vld1q_s16(prediction_1)};
+ return pred;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+inline uint16x8x2_t LoadPred(const uint16_t* LIBGAV1_RESTRICT prediction_0,
+ const uint16_t* LIBGAV1_RESTRICT prediction_1) {
+ const uint16x8x2_t pred = {vld1q_u16(prediction_0), vld1q_u16(prediction_1)};
+ return pred;
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth>
+inline uint16x8_t AbsolutePredDifference(const int16x8x2_t pred) {
+ static_assert(bitdepth == 8, "");
+ constexpr int rounding_bits = bitdepth - 8 + ((bitdepth == 12) ? 2 : 4);
+ return vrshrq_n_u16(
+ vreinterpretq_u16_s16(vabdq_s16(pred.val[0], pred.val[1])),
+ rounding_bits);
+}
+
+template <int bitdepth>
+inline uint16x8_t AbsolutePredDifference(const uint16x8x2_t pred) {
+ constexpr int rounding_bits = bitdepth - 8 + ((bitdepth == 12) ? 2 : 4);
+ return vrshrq_n_u16(vabdq_u16(pred.val[0], pred.val[1]), rounding_bits);
+}
+
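+// Computes an 8-wide row of the weight mask; effectively
+//   mask = min(38 + (|pred_0 - pred_1| >> (rounding_bits + 4)), 64)
+// with saturating intermediates, where rounding_bits is 4 for 8bpp and 6 for
+// 10bpp. When |mask_is_inverse| is true, 64 - mask is stored instead.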
+template <bool mask_is_inverse, int bitdepth>
+inline void WeightMask8_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask) {
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ using PredTypeVecx2 =
+ typename std::conditional<bitdepth == 8, int16x8x2_t, uint16x8x2_t>::type;
+ const PredTypeVecx2 pred =
+ LoadPred(static_cast<const PredType*>(prediction_0),
+ static_cast<const PredType*>(prediction_1));
+ const uint16x8_t difference = AbsolutePredDifference<bitdepth>(pred);
+ const uint8x8_t difference_offset = vdup_n_u8(38);
+ const uint8x8_t mask_ceiling = vdup_n_u8(64);
+ const uint8x8_t adjusted_difference =
+ vqadd_u8(vqshrn_n_u16(difference, 4), difference_offset);
+ const uint8x8_t mask_value = vmin_u8(adjusted_difference, mask_ceiling);
+ if (mask_is_inverse) {
+ const uint8x8_t inverted_mask_value = vsub_u8(mask_ceiling, mask_value);
+ vst1_u8(mask, inverted_mask_value);
+ } else {
+ vst1_u8(mask, mask_value);
+ }
+}
+
+#define WEIGHT8_WITHOUT_STRIDE \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0, pred_1, mask)
+
+#define WEIGHT8_AND_STRIDE \
+ WEIGHT8_WITHOUT_STRIDE; \
+ pred_0 += 8; \
+ pred_1 += 8; \
+ mask += mask_stride
+
+// |pred_0| and |pred_1| are cast as int16_t* for the sake of pointer math. They
+// are uint16_t* for 10bpp and 12bpp, and this is handled in WeightMask8_NEON.
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask8x8_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ } while (++y < 7);
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask8x16_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask8x32_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0, pred_1, mask); \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 8, pred_1 + 8, mask + 8)
+
+#define WEIGHT16_AND_STRIDE \
+ WEIGHT16_WITHOUT_STRIDE; \
+ pred_0 += 16; \
+ pred_1 += 16; \
+ mask += mask_stride
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask16x8_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ } while (++y < 7);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask16x16_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask16x32_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask16x64_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0, pred_1, mask); \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 8, pred_1 + 8, \
+ mask + 8); \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 16, pred_1 + 16, \
+ mask + 16); \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 24, pred_1 + 24, \
+ mask + 24)
+
+#define WEIGHT32_AND_STRIDE \
+ WEIGHT32_WITHOUT_STRIDE; \
+ pred_0 += 32; \
+ pred_1 += 32; \
+ mask += mask_stride
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask32x8_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask32x16_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask32x32_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask32x64_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0, pred_1, mask); \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 8, pred_1 + 8, \
+ mask + 8); \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 16, pred_1 + 16, \
+ mask + 16); \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 24, pred_1 + 24, \
+ mask + 24); \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 32, pred_1 + 32, \
+ mask + 32); \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 40, pred_1 + 40, \
+ mask + 40); \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 48, pred_1 + 48, \
+ mask + 48); \
+ WeightMask8_NEON<mask_is_inverse, bitdepth>(pred_0 + 56, pred_1 + 56, \
+ mask + 56)
+
+#define WEIGHT64_AND_STRIDE \
+ WEIGHT64_WITHOUT_STRIDE; \
+ pred_0 += 64; \
+ pred_1 += 64; \
+ mask += mask_stride
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask64x16_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask64x32_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask64x64_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask64x128_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 42);
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask128x64_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
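+ // Each 128-wide row is processed as two 64-wide halves: after the first
+ // half |mask| advances by 64, and after the second half it advances by
+ // |mask_stride| - 64 to land at the start of the next row.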
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (++y3 < 21);
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse, int bitdepth>
+void WeightMask128x128_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (++y3 < 42);
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+#undef WEIGHT8_WITHOUT_STRIDE
+#undef WEIGHT8_AND_STRIDE
+#undef WEIGHT16_WITHOUT_STRIDE
+#undef WEIGHT16_AND_STRIDE
+#undef WEIGHT32_WITHOUT_STRIDE
+#undef WEIGHT32_AND_STRIDE
+#undef WEIGHT64_WITHOUT_STRIDE
+#undef WEIGHT64_AND_STRIDE
+
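+// Binds both the regular ([0]) and inverse ([1]) mask variants for a block
+// size into the dsp table.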
+#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
+ dsp->weight_mask[w_index][h_index][0] = \
+ WeightMask##width##x##height##_NEON<0, 8>; \
+ dsp->weight_mask[w_index][h_index][1] = \
+ WeightMask##width##x##height##_NEON<1, 8>
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ INIT_WEIGHT_MASK_8BPP(8, 8, 0, 0);
+ INIT_WEIGHT_MASK_8BPP(8, 16, 0, 1);
+ INIT_WEIGHT_MASK_8BPP(8, 32, 0, 2);
+ INIT_WEIGHT_MASK_8BPP(16, 8, 1, 0);
+ INIT_WEIGHT_MASK_8BPP(16, 16, 1, 1);
+ INIT_WEIGHT_MASK_8BPP(16, 32, 1, 2);
+ INIT_WEIGHT_MASK_8BPP(16, 64, 1, 3);
+ INIT_WEIGHT_MASK_8BPP(32, 8, 2, 0);
+ INIT_WEIGHT_MASK_8BPP(32, 16, 2, 1);
+ INIT_WEIGHT_MASK_8BPP(32, 32, 2, 2);
+ INIT_WEIGHT_MASK_8BPP(32, 64, 2, 3);
+ INIT_WEIGHT_MASK_8BPP(64, 16, 3, 1);
+ INIT_WEIGHT_MASK_8BPP(64, 32, 3, 2);
+ INIT_WEIGHT_MASK_8BPP(64, 64, 3, 3);
+ INIT_WEIGHT_MASK_8BPP(64, 128, 3, 4);
+ INIT_WEIGHT_MASK_8BPP(128, 64, 4, 3);
+ INIT_WEIGHT_MASK_8BPP(128, 128, 4, 4);
+}
+#undef INIT_WEIGHT_MASK_8BPP
+
+} // namespace
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \
+ dsp->weight_mask[w_index][h_index][0] = \
+ WeightMask##width##x##height##_NEON<0, 10>; \
+ dsp->weight_mask[w_index][h_index][1] = \
+ WeightMask##width##x##height##_NEON<1, 10>
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ INIT_WEIGHT_MASK_10BPP(8, 8, 0, 0);
+ INIT_WEIGHT_MASK_10BPP(8, 16, 0, 1);
+ INIT_WEIGHT_MASK_10BPP(8, 32, 0, 2);
+ INIT_WEIGHT_MASK_10BPP(16, 8, 1, 0);
+ INIT_WEIGHT_MASK_10BPP(16, 16, 1, 1);
+ INIT_WEIGHT_MASK_10BPP(16, 32, 1, 2);
+ INIT_WEIGHT_MASK_10BPP(16, 64, 1, 3);
+ INIT_WEIGHT_MASK_10BPP(32, 8, 2, 0);
+ INIT_WEIGHT_MASK_10BPP(32, 16, 2, 1);
+ INIT_WEIGHT_MASK_10BPP(32, 32, 2, 2);
+ INIT_WEIGHT_MASK_10BPP(32, 64, 2, 3);
+ INIT_WEIGHT_MASK_10BPP(64, 16, 3, 1);
+ INIT_WEIGHT_MASK_10BPP(64, 32, 3, 2);
+ INIT_WEIGHT_MASK_10BPP(64, 64, 3, 3);
+ INIT_WEIGHT_MASK_10BPP(64, 128, 3, 4);
+ INIT_WEIGHT_MASK_10BPP(128, 64, 4, 3);
+ INIT_WEIGHT_MASK_10BPP(128, 128, 4, 4);
+}
+#undef INIT_WEIGHT_MASK_10BPP
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void WeightMaskInit_NEON() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void WeightMaskInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::weight_mask. This function is not thread-safe.
+void WeightMaskInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_WeightMask_8x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_8x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_8x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_16x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_16x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_16x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_16x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_32x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_32x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_32x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_32x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_64x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_64x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_64x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_64x128 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_128x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_WeightMask_128x128 LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void AverageBlend_C(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1, const int width,
+ const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ // 7.11.3.2 Rounding variables derivation process
+ // 2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
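+ // That is, 14 - (3 + 7) = 4 for 8bpp/10bpp and 14 - (5 + 7) = 2 for 12bpp.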
+ constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+ const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+ auto* dst = static_cast<Pixel*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ // See warp.cc and convolve.cc for detailed prediction ranges.
+ int res = pred_0[x] + pred_1[x];
+ res -= (bitdepth == 8) ? 0 : kCompoundOffset + kCompoundOffset;
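+ // The extra +1 in the shift below folds in the divide-by-2 average of
+ // the two predictions.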
+ dst[x] = static_cast<Pixel>(
+ Clip3(RightShiftWithRounding(res, inter_post_round_bits + 1), 0,
+ (1 << bitdepth) - 1));
+ } while (++x < width);
+
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (++y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->average_blend = AverageBlend_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_AverageBlend
+ dsp->average_blend = AverageBlend_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->average_blend = AverageBlend_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_AverageBlend
+ dsp->average_blend = AverageBlend_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->average_blend = AverageBlend_C<12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_AverageBlend
+ dsp->average_blend = AverageBlend_C<12, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+} // namespace
+
+void AverageBlendInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
+#define LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting values can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/average_blend_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important, as each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/average_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+
+#include <cassert>
+#include <cstdint>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 5e8;
+constexpr char kAverageBlend[] = "AverageBlend";
+// average_blend is applied to compound prediction values. This implies a range
+// far exceeding that of pixel values.
+// The ranges include kCompoundOffset in 10bpp and 12bpp.
+// See src/dsp/convolve.cc and src/dsp/warp.cc.
+constexpr int kCompoundPredictionRange[3][2] = {
+ // 8bpp
+ {-5132, 9212},
+ // 10bpp
+ {3988, 61532},
+ // 12bpp
+ {3974, 61559},
+};
+
+template <int bitdepth, typename Pixel>
+class AverageBlendTest : public testing::TestWithParam<BlockSize>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ AverageBlendTest() = default;
+ ~AverageBlendTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ AverageBlendInit_C();
+ DistanceWeightedBlendInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_func_ = dsp->average_blend;
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_func_ = nullptr;
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ AverageBlendInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ AverageBlendInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ func_ = dsp->average_blend;
+ dist_blend_func_ = dsp->distance_weighted_blend;
+ }
+
+ protected:
+ void Test(const char* digest, int num_tests, bool debug);
+
+ private:
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ static constexpr int kDestStride = kMaxSuperBlockSizeInPixels;
+ const int width_ = kBlockWidthPixels[GetParam()];
+ const int height_ = kBlockHeightPixels[GetParam()];
+ alignas(kMaxAlignment) PredType
+ source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+ alignas(kMaxAlignment) PredType
+ source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+ Pixel dest_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {};
+ Pixel reference_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] =
+ {};
+ dsp::AverageBlendFunc base_func_;
+ dsp::AverageBlendFunc func_;
+ dsp::DistanceWeightedBlendFunc dist_blend_func_;
+};
+
+template <int bitdepth, typename Pixel>
+void AverageBlendTest<bitdepth, Pixel>::Test(const char* digest, int num_tests,
+ bool debug) {
+ if (func_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ PredType* src_1 = source1_;
+ PredType* src_2 = source2_;
+ for (int y = 0; y < height_; ++y) {
+ for (int x = 0; x < width_; ++x) {
+ constexpr int bitdepth_index = (bitdepth - 8) >> 1;
+ const int min_val = kCompoundPredictionRange[bitdepth_index][0];
+ const int max_val = kCompoundPredictionRange[bitdepth_index][1];
+ src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ }
+ src_1 += width_;
+ src_2 += width_;
+ }
+ absl::Duration elapsed_time;
+ for (int i = 0; i < num_tests; ++i) {
+ const absl::Time start = absl::Now();
+ func_(source1_, source2_, width_, height_, dest_,
+ sizeof(dest_[0]) * kDestStride);
+ elapsed_time += absl::Now() - start;
+ }
+ if (debug) {
+ if (base_func_ != nullptr) {
+ base_func_(source1_, source2_, width_, height_, reference_,
+ sizeof(reference_[0]) * kDestStride);
+ } else {
+ // Use dist_blend_func_ as the base for C tests.
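+ // With equal weights of 8 the distance weighted blend computes the same
+ // average, so it serves as a reference.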
+ const int8_t weight = 8;
+ dist_blend_func_(source1_, source2_, weight, weight, width_, height_,
+ reference_, sizeof(reference_[0]) * kDestStride);
+ }
+ EXPECT_TRUE(test_utils::CompareBlocks(dest_, reference_, width_, height_,
+ kDestStride, kDestStride, false));
+ }
+
+ test_utils::CheckMd5Digest(kAverageBlend, ToString(GetParam()), digest, dest_,
+ sizeof(dest_[0]) * kDestStride * height_,
+ elapsed_time);
+}
+
+const BlockSize kTestParam[] = {
+ kBlock4x4, kBlock4x8, kBlock4x16, kBlock8x4, kBlock8x8,
+ kBlock8x16, kBlock8x32, kBlock16x4, kBlock16x8, kBlock16x16,
+ kBlock16x32, kBlock16x64, kBlock32x8, kBlock32x16, kBlock32x32,
+ kBlock32x64, kBlock64x16, kBlock64x32, kBlock64x64, kBlock64x128,
+ kBlock128x64, kBlock128x128,
+};
+
+using AverageBlendTest8bpp = AverageBlendTest<8, uint8_t>;
+
+const char* GetAverageBlendDigest8bpp(const BlockSize block_size) {
+ static const char* const kDigests[kMaxBlockSizes] = {
+ // 4xN
+ "152bcc35946900b1ed16369b3e7a81b7",
+ "c23e9b5698f7384eaae30a3908118b77",
+ "f2da31d940f62490c368c03d32d3ede8",
+ // 8xN
+ "73c95485ef956e1d9ab914e88e6a202b",
+ "d90d3abd368e58c513070a88b34649ba",
+ "77f7d53d0edeffb3537afffd9ff33a4a",
+ "460b9b1e6b83f65f013cfcaf67ec0122",
+ // 16xN
+ "96454a56de940174ff92e9bb686d6d38",
+ "a50e268e93b48ae39cc2a47d377410e2",
+ "65c8502ff6d78065d466f9911ed6bb3e",
+ "bc2c873b9f5d74b396e1df705e87f699",
+ "b4dae656484b2d255d1e09b7f34e12c1",
+ // 32xN
+ "7e1e5db92b22a96e5226a23de883d766",
+ "ca40d46d89773e7f858b15fcecd43cc0",
+ "bfdc894707323f4dc43d1326309f8368",
+ "f4733417621719b7feba3166ec0da5b9",
+ // 64xN
+ "378fa0594d22f01c8e8931c2a908d7c4",
+ "db38fe2e082bd4a09acb3bb1d52ee11e",
+ "3ad44401cc731215c46c9b7d96f7e4ae",
+ "6c43267be5ed03d204a05fe36090f870",
+ // 128xN
+ "c8cfe46ebf166c1cbf08e8804206aadb",
+ "b0557b5156d2334c8ce4a7ee12f9d6b4",
+ };
+ assert(block_size < kMaxBlockSizes);
+ return kDigests[block_size];
+}
+
+TEST_P(AverageBlendTest8bpp, Blending) {
+ Test(GetAverageBlendDigest8bpp(GetParam()), 1, false);
+}
+
+TEST_P(AverageBlendTest8bpp, DISABLED_Speed) {
+ Test(GetAverageBlendDigest8bpp(GetParam()),
+ kNumSpeedTests /
+ (kBlockHeightPixels[GetParam()] * kBlockWidthPixels[GetParam()]),
+ false);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest8bpp,
+ testing::ValuesIn(kTestParam));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, AverageBlendTest8bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest8bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using AverageBlendTest10bpp = AverageBlendTest<10, uint16_t>;
+
+const char* GetAverageBlendDigest10bpp(const BlockSize block_size) {
+ static const char* const kDigests[kMaxBlockSizes] = {
+ // 4xN
+ "98c0671c092b4288adcaaa17362cc4a3",
+ "7083f3def8bfb63ab3a985ef5616a923",
+ "a7211ee2eaa6f88e08875b377d17b0f1",
+ // 8xN
+ "11f9ab881700f2ef0f82d8d4662868c6",
+ "3bee144b9ea6f4288b860c24f88a22f3",
+ "27113bd17bf95034f100e9046c7b59d2",
+ "c42886a5e16e23a81e43833d34467558",
+ // 16xN
+ "b0ac2eb0a7a6596d6d1339074c7f8771",
+ "24c9e079b9a8647a6ee03f5441f2cdd9",
+ "dd05777751ccdb4356856c90e1176e53",
+ "27b1d69d035b1525c013b7373cfe3875",
+ "08c46403afe19e6b008ccc8f56633da9",
+ // 32xN
+ "36d434db11298aba76166df06e9b8125",
+ "efd24dd7b555786bff1a482e51170ea3",
+ "3b37ddac87de443cd18784f02c2d1dd5",
+ "80d8070939a743a20689a65bf5dc0a68",
+ // 64xN
+ "88e747246237c6408d0bd4cc3ecc8396",
+ "af1fe8c52487c9f2951c3ea516828abb",
+ "ea6f18ff56b053748c18032b7e048e83",
+ "af0cb87fe27d24c2e0afd2c90a8533a6",
+ // 128xN
+ "16a83b19911d6dc7278a694b8baa9901",
+ "bd22e77ce6fa727267ff63eeb4dcb19c",
+ };
+ assert(block_size < kMaxBlockSizes);
+ return kDigests[block_size];
+}
+
+TEST_P(AverageBlendTest10bpp, Blending) {
+ Test(GetAverageBlendDigest10bpp(GetParam()), 1, false);
+}
+
+TEST_P(AverageBlendTest10bpp, DISABLED_Speed) {
+ Test(GetAverageBlendDigest10bpp(GetParam()),
+ kNumSpeedTests /
+ (kBlockHeightPixels[GetParam()] * kBlockWidthPixels[GetParam()]) /
+ 2,
+ false);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest10bpp,
+ testing::ValuesIn(kTestParam));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, AverageBlendTest10bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest10bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using AverageBlendTest12bpp = AverageBlendTest<12, uint16_t>;
+
+const char* GetAverageBlendDigest12bpp(const BlockSize block_size) {
+ static const char* const kDigests[kMaxBlockSizes] = {
+ // 4xN
+ "8f5ad8fba61a0f1cb6b77f5460c241be",
+ "3a9d017848fdb4162315c689b4449ac6",
+ "bb97029fff021b168b98b209dcee5123",
+ // 8xN
+ "a7ff1b199965b8856499ae3f1b2c48eb",
+ "05220c72835fc4662d261183df0a57cf",
+ "97de8c325f1475c44e1afc44183e55ad",
+ "60d820c46cad14d9d934da238bb79707",
+ // 16xN
+ "f3e4863121819bc28f7c1f453898650c",
+ "5f5f68d21269d7df546c848921e8f2cd",
+ "17efe0b0fce1f8d4c7bc6eacf769063e",
+ "3da591e201f44511cdd6c465692ace1e",
+ "5a0ca6c88664d2e918a032b5fcf66070",
+ // 32xN
+ "efe236bee8a9fef90b99d8012006f985",
+ "d6ff3aacbbbadff6d0ccb0873fb9fa2a",
+ "38801f7361052873423d57b574aabddc",
+ "55c76772ecdc1721e92ca04d2fc7c089",
+ // 64xN
+ "4261ecdde34eedc4e5066a93e0f64881",
+ "fe82e012efab872672193316d670fd82",
+ "6c698bc2d4acf4444a64ac55ae9641de",
+ "98626e25101cff69019d1b7e6e439404",
+ // 128xN
+ "fe0f3c89dd39786df1c952a2470d680d",
+ "af7e166fc3d8c9ce85789acf3467ed9d",
+ };
+ assert(block_size < kMaxBlockSizes);
+ return kDigests[block_size];
+}
+
+TEST_P(AverageBlendTest12bpp, Blending) {
+ Test(GetAverageBlendDigest12bpp(GetParam()), 1, false);
+}
+
+TEST_P(AverageBlendTest12bpp, DISABLED_Speed) {
+ Test(GetAverageBlendDigest12bpp(GetParam()),
+ kNumSpeedTests /
+ (kBlockHeightPixels[GetParam()] * kBlockWidthPixels[GetParam()]) /
+ 2,
+ false);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest12bpp,
+ testing::ValuesIn(kTestParam));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const BlockSize param) {
+ return os << ToString(param);
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Silence unused function warnings when CdefDirection_C is obviated.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_CdefDirection) || \
+ (LIBGAV1_MAX_BITDEPTH >= 10 && \
+ !defined(LIBGAV1_Dsp10bpp_CdefDirection)) || \
+ (LIBGAV1_MAX_BITDEPTH == 12 && !defined(LIBGAV1_Dsp12bpp_CdefDirection))
+constexpr int16_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, 120, 105};
+
+int32_t Square(int32_t x) { return x * x; }
+
+template <int bitdepth, typename Pixel>
+void CdefDirection_C(const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride,
+ uint8_t* LIBGAV1_RESTRICT const direction,
+ int* LIBGAV1_RESTRICT const variance) {
+ assert(direction != nullptr);
+ assert(variance != nullptr);
+ const auto* src = static_cast<const Pixel*>(source);
+ stride /= sizeof(Pixel);
+ int32_t cost[8] = {};
+ // |partial| does not have to be int32_t for 8bpp. int16_t will suffice. We
+ // use int32_t to keep it simple since |cost| will have to be int32_t.
+ int32_t partial[8][15] = {};
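+ // Each |partial[k]| accumulates line sums of the 8x8 block along one of
+ // eight directions; the direction whose line sums carry the most squared
+ // energy (normalized for line length via kDivisionTable) is selected.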
+ for (int i = 0; i < 8; ++i) {
+ for (int j = 0; j < 8; ++j) {
+ const int x = (src[j] >> (bitdepth - 8)) - 128;
+ partial[0][i + j] += x;
+ partial[1][i + j / 2] += x;
+ partial[2][i] += x;
+ partial[3][3 + i - j / 2] += x;
+ partial[4][7 + i - j] += x;
+ partial[5][3 - i / 2 + j] += x;
+ partial[6][j] += x;
+ partial[7][i / 2 + j] += x;
+ }
+ src += stride;
+ }
+ for (int i = 0; i < 8; ++i) {
+ cost[2] += Square(partial[2][i]);
+ cost[6] += Square(partial[6][i]);
+ }
+ cost[2] *= kDivisionTable[7];
+ cost[6] *= kDivisionTable[7];
+ for (int i = 0; i < 7; ++i) {
+ cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+ kDivisionTable[i];
+ cost[4] += (Square(partial[4][i]) + Square(partial[4][14 - i])) *
+ kDivisionTable[i];
+ }
+ cost[0] += Square(partial[0][7]) * kDivisionTable[7];
+ cost[4] += Square(partial[4][7]) * kDivisionTable[7];
+ for (int i = 1; i < 8; i += 2) {
+ for (int j = 0; j < 5; ++j) {
+ cost[i] += Square(partial[i][3 + j]);
+ }
+ cost[i] *= kDivisionTable[7];
+ for (int j = 0; j < 3; ++j) {
+ cost[i] += (Square(partial[i][j]) + Square(partial[i][10 - j])) *
+ kDivisionTable[2 * j + 1];
+ }
+ }
+ int32_t best_cost = 0;
+ *direction = 0;
+ for (int i = 0; i < 8; ++i) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ *direction = i;
+ }
+ }
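+ // |variance| measures how strongly the best direction dominates the
+ // orthogonal direction ((*direction + 4) & 7).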
+ *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+ // !defined(LIBGAV1_Dsp8bpp_CdefDirection) ||
+ // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+ // !defined(LIBGAV1_Dsp10bpp_CdefDirection)) ||
+ // (LIBGAV1_MAX_BITDEPTH == 12 &&
+ // !defined(LIBGAV1_Dsp12bpp_CdefDirection))
+
+// Silence unused function warnings when CdefFilter_C is obviated.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_CdefFilters) || \
+ (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefFilters)) || \
+ (LIBGAV1_MAX_BITDEPTH == 12 && !defined(LIBGAV1_Dsp12bpp_CdefFilters))
+
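+// For example, with threshold 4 and damping 3 (adjusted to
+// 3 - FloorLog2(4) = 1), Constrain(10, 4, 3) yields
+// Clip3(4 - (10 >> 1), 0, 10) = 0 while Constrain(3, 4, 3) yields
+// Clip3(4 - 1, 0, 3) = 3: large differences are suppressed and small ones
+// pass through.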
+int Constrain(int diff, int threshold, int damping) {
+ assert(threshold != 0);
+ damping = std::max(0, damping - FloorLog2(threshold));
+ const int sign = (diff < 0) ? -1 : 1;
+ return sign *
+ Clip3(threshold - (std::abs(diff) >> damping), 0, std::abs(diff));
+}
+
+// Filters the source block. It does not check whether the candidate pixel is
+// inside the frame; instead, it requires the source input to be padded with a
+// constant large value (kCdefLargeValue) at the frame boundary.
+template <int block_width, int bitdepth, typename Pixel,
+ bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_C(const uint16_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride, const int block_height,
+ const int primary_strength, const int secondary_strength,
+ const int damping, const int direction,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ static_assert(block_width == 4 || block_width == 8, "Invalid CDEF width.");
+ static_assert(enable_primary || enable_secondary, "");
+ assert(block_height == 4 || block_height == 8);
+ assert(direction >= 0 && direction <= 7);
+ constexpr int coeff_shift = bitdepth - 8;
+ // Section 5.9.19. CDEF params syntax.
+ assert(primary_strength >= 0 && primary_strength <= 15 << coeff_shift);
+ assert(secondary_strength >= 0 && secondary_strength <= 4 << coeff_shift &&
+ secondary_strength != 3 << coeff_shift);
+ assert(primary_strength != 0 || secondary_strength != 0);
+ // damping is decreased by 1 for chroma.
+ assert((damping >= 3 && damping <= 6 + coeff_shift) ||
+ (damping >= 2 && damping <= 5 + coeff_shift));
+ // When only one of primary_strength and secondary_strength is non-zero, the
+ // number of pixels inspected (4 for primary, 8 for secondary) and the taps
+ // used keep the sum within the amount it is descaled by (16), so we can skip
+ // tracking and clipping to the minimum and maximum value observed.
+ constexpr bool clipping_required = enable_primary && enable_secondary;
+ static constexpr int kCdefSecondaryTaps[2] = {kCdefSecondaryTap0,
+ kCdefSecondaryTap1};
+ auto* dst = static_cast<Pixel*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+ int y = block_height;
+ do {
+ int x = 0;
+ do {
+ int16_t sum = 0;
+ const uint16_t pixel_value = src[x];
+ uint16_t max_value = pixel_value;
+ uint16_t min_value = pixel_value;
+ for (int k = 0; k < 2; ++k) {
+ static constexpr int signs[] = {-1, 1};
+ for (const int& sign : signs) {
+ if (enable_primary) {
+ const int dy = sign * kCdefDirections[direction][k][0];
+ const int dx = sign * kCdefDirections[direction][k][1];
+ const uint16_t value = src[dy * src_stride + dx + x];
+ // Note: a SIMD implementation can skip this condition check because
+ // Constrain() returns 0 when value == kCdefLargeValue.
+ if (value != kCdefLargeValue) {
+ sum += Constrain(value - pixel_value, primary_strength, damping) *
+ kCdefPrimaryTaps[(primary_strength >> coeff_shift) & 1][k];
+ if (clipping_required) {
+ max_value = std::max(value, max_value);
+ min_value = std::min(value, min_value);
+ }
+ }
+ }
+
+ if (enable_secondary) {
+ static constexpr int offsets[] = {-2, 2};
+ for (const int& offset : offsets) {
+ const int dy = sign * kCdefDirections[direction + offset][k][0];
+ const int dx = sign * kCdefDirections[direction + offset][k][1];
+ const uint16_t value = src[dy * src_stride + dx + x];
+ // Note: as above, a SIMD implementation can skip this condition check.
+ if (value != kCdefLargeValue) {
+ sum += Constrain(value - pixel_value, secondary_strength,
+ damping) *
+ kCdefSecondaryTaps[k];
+ if (clipping_required) {
+ max_value = std::max(value, max_value);
+ min_value = std::min(value, min_value);
+ }
+ }
+ }
+ }
+ }
+ }
+
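+ // Round |sum| / 16, with ties rounded away from zero.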
+ const int offset = (8 + sum - (sum < 0)) >> 4;
+ if (clipping_required) {
+ dst[x] = static_cast<Pixel>(
+ Clip3(pixel_value + offset, min_value, max_value));
+ } else {
+ dst[x] = static_cast<Pixel>(pixel_value + offset);
+ }
+ } while (++x < block_width);
+
+ src += src_stride;
+ dst += dst_stride;
+ } while (--y != 0);
+}
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+ // !defined(LIBGAV1_Dsp8bpp_CdefFilters) ||
+ // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+ // !defined(LIBGAV1_Dsp10bpp_CdefFilters)) ||
+ // (LIBGAV1_MAX_BITDEPTH == 12 &&
+ // !defined(LIBGAV1_Dsp12bpp_CdefFilters))
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->cdef_direction = CdefDirection_C<8, uint8_t>;
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 8, uint8_t>;
+ dsp->cdef_filters[0][1] = CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 8, uint8_t>;
+ dsp->cdef_filters[1][1] = CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/false>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+ dsp->cdef_direction = CdefDirection_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 8, uint8_t>;
+ dsp->cdef_filters[0][1] = CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 8, uint8_t>;
+ dsp->cdef_filters[1][1] = CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->cdef_direction = CdefDirection_C<10, uint16_t>;
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 10, uint16_t>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 10, uint16_t>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/false>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_CdefDirection
+ dsp->cdef_direction = CdefDirection_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_CdefFilters
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 10, uint16_t>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 10, uint16_t>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->cdef_direction = CdefDirection_C<12, uint16_t>;
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 12, uint16_t>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 12, uint16_t>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/false>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_CdefDirection
+ dsp->cdef_direction = CdefDirection_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_CdefFilters
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 12, uint16_t>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 12, uint16_t>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+void CdefInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_CDEF_H_
+#define LIBGAV1_SRC_DSP_CDEF_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting values can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/cdef_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important, as each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/cdef_avx2.h"
+#include "src/dsp/x86/cdef_sse4.h"
+// clang-format on
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+enum {
+ kCdefSecondaryTap0 = 2,
+ kCdefSecondaryTap1 = 1,
+};
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_CDEF_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants used for cdef implementations.
+// This file is included inside an anonymous namespace in the files where
+// these are necessary.
+
+const int8_t (*const kCdefDirections)[2][2] = kCdefDirectionsPadded + 2;
+
+// Mirror values and pad to 16 elements.
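+// The base values are 840 / n for n = 1 .. 8.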
+alignas(16) constexpr uint32_t kCdefDivisionTable[] = {
+ 840, 420, 280, 210, 168, 140, 120, 105,
+ 120, 140, 168, 210, 280, 420, 840, 0};
+
+// Used when calculating odd |cost[x]| values to mask off unwanted elements.
+// Holds elements 1 3 5 X 5 3 1 X
+alignas(16) constexpr uint32_t kCdefDivisionTableOdd[] = {420, 210, 140, 0,
+ 140, 210, 420, 0};
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr char kCdef[] = "Cdef";
+constexpr char kCdefDirectionName[] = "Cdef Direction";
+constexpr char kCdefFilterName[] = "Cdef Filtering";
+constexpr int kTestBufferStride = 8;
+constexpr int kTestBufferSize = 64;
+constexpr int kSourceStride = kMaxSuperBlockSizeInPixels + 2 * 8;
+constexpr int kSourceBufferSize =
+ (kMaxSuperBlockSizeInPixels + 2 * 3) * kSourceStride;
+constexpr int kNumSpeedTests = 5000;
+
+const char* GetDirectionDigest(const int bitdepth, const int num_runs) {
+ static const char* const kDigest[3][2] = {
+ {"de78c820a1fec7e81385aa0a615dbf8c", "7bfc543244f932a542691480dc4541b2"},
+ {"b54236de5d25e16c0f8678d9784cb85e", "559144cf183f3c69cb0e5d98cbf532ff"},
+ {"5532919a157c4f937da9e822bdb105f7", "dd9dfca6dfca83777d942e693c17627a"}};
+ const int bitdepth_index = (bitdepth - 8) / 2;
+ const int run_index = (num_runs == 1) ? 0 : 1;
+ return kDigest[bitdepth_index][run_index];
+}
+
+// The 'int' parameter is unused but required to allow for instantiations of C,
+// NEON, etc.
+template <int bitdepth, typename Pixel>
+class CdefDirectionTest : public testing::TestWithParam<int> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ CdefDirectionTest() = default;
+ CdefDirectionTest(const CdefDirectionTest&) = delete;
+ CdefDirectionTest& operator=(const CdefDirectionTest&) = delete;
+ ~CdefDirectionTest() override = default;
+
+ protected:
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ CdefInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_cdef_direction_ = nullptr;
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ CdefInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "AVX2/")) {
+ if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+ CdefInit_AVX2();
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ CdefInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ cur_cdef_direction_ = dsp->cdef_direction;
+ }
+
+ void TestRandomValues(int num_runs);
+
+ Pixel buffer_[kTestBufferSize];
+ int strength_;
+ int size_;
+
+ CdefDirectionFunc base_cdef_direction_;
+ CdefDirectionFunc cur_cdef_direction_;
+};
+
+template <int bitdepth, typename Pixel>
+void CdefDirectionTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+ if (cur_cdef_direction_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ absl::Duration elapsed_time;
+ libvpx_test::MD5 actual_digest;
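+ // Sweep a base |level| with up to |bits| of added noise so that flat,
+ // noisy, and near-saturated blocks are all exercised.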
+ for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+ for (int level = 0; level < (1 << bitdepth); level += 1 + (bitdepth - 8)) {
+ for (int bits = 0; bits <= bitdepth; ++bits) {
+ for (auto& pixel : buffer_) {
+ pixel = Clip3((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+ (1 << bitdepth) - 1);
+ }
+ int output[2] = {};
+ const absl::Time start = absl::Now();
+ cur_cdef_direction_(buffer_, kTestBufferStride * sizeof(Pixel),
+ reinterpret_cast<uint8_t*>(&output[0]), &output[1]);
+ elapsed_time += absl::Now() - start;
+ actual_digest.Add(reinterpret_cast<const uint8_t*>(output),
+ sizeof(output));
+ }
+ }
+ }
+ test_utils::CheckMd5Digest(kCdef, kCdefDirectionName,
+ GetDirectionDigest(bitdepth, num_runs),
+ actual_digest.Get(), elapsed_time);
+}
+
+using CdefDirectionTest8bpp = CdefDirectionTest<8, uint8_t>;
+
+TEST_P(CdefDirectionTest8bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefDirectionTest8bpp, DISABLED_Speed) {
+ TestRandomValues(kNumSpeedTests / 100);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest8bpp, testing::Values(0));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CdefDirectionTest8bpp, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CdefDirectionTest8bpp, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, CdefDirectionTest8bpp, testing::Values(0));
+#endif // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using CdefDirectionTest10bpp = CdefDirectionTest<10, uint16_t>;
+
+TEST_P(CdefDirectionTest10bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefDirectionTest10bpp, DISABLED_Speed) {
+ TestRandomValues(kNumSpeedTests / 100);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest10bpp, testing::Values(0));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CdefDirectionTest10bpp, testing::Values(0));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using CdefDirectionTest12bpp = CdefDirectionTest<12, uint16_t>;
+
+TEST_P(CdefDirectionTest12bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefDirectionTest12bpp, DISABLED_Speed) {
+ TestRandomValues(kNumSpeedTests / 100);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest12bpp, testing::Values(0));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+const char* GetDigest8bpp(int id) {
+ static const char* const kDigest[] = {
+ "b6fe1a1f5bbb23e35197160ce57d90bd", "8aed39871b19184f1d381b145779bc33",
+ "82653dd66072e8ebd967083a0413ab03", "421c048396bc66ffaa6aafa016c7bc54",
+ "1f70ba51091e8c6034c3f0974af241c3", "8f700997452a24091136ca58890a5be4",
+ "9e3dea21ee4246172121f0420eccd899", "0848bdeffa74145758ef47992e1035c4",
+ "0bb55818de986e9d988b0c1cc6883887", "9b558a7eefc934f90cd09ca26b998bfd",
+ "3a38670f8c5f0c61cc47c9c79da728d2", "ed18fe91180e78008ccb98e9019bed69",
+ "2aa4bbcb6fb088ad42bde76be014dff0", "88f746f0d6c079ab8e9ecc7ff67524c7",
+ "7cffa948f5ddbccc7c6b07d15ca9eb69", "5e22c1c89735965dda935d1249129548",
+ "e765133d133b94e1578c8c5616248a96", "da95d47cad74eb4a075893ca98e658ab",
+ };
+ return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+ static const char* const kDigest[] = {
+ "0a9630b39974850998db653b07e09ab4", "97a924661d931b23ee57893da617ae70",
+ "0d79516b9a491ce5112eb00bbae5eb80", "d5801fd96029a7509cf66dde61e8e2d8",
+ "5bf5c0ea5a85e9b6c1e6991619c34ebc", "e2f1c08a8b3cd93b3a85511493a0ee31",
+ "45c047d2be5e2dcf6094937780a3f88a", "346caf437c1ad85862de72a622e29845",
+ "0e9cb69d24d9badbe956da779d912b05", "81803dcb00971237b3fe6372564a842f",
+ "17681ad2ed4a2456d70760852af6c6fd", "5312f8049a08a5f9b1708fda936f7a55",
+ "3f0f522f3a33e4ff2a97bdc1e614c5c4", "3818a50be7fe16aa0c636a7392d1eceb",
+ "c6849b8cd77a076dc7e3c26e8cd55b9e", "223c0dd685bbc74aec1d088356708433",
+ "90992957cb8103222aa2fb43c6cd2fc4", "a4ba6edcefe4130851c4c2607b147f95",
+ };
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+ static const char* const kDigest[] = {
+ "a32569989c42fd4254979f70c1c65f5a", "dc389048217633e2dd64126376be7d25",
+ "3b0e8dae294895330f349863b1773c39", "9741fe8d27d109cb99b7a9cdc030f52a",
+ "ab70f3729b52287c6432ba7624280a68", "c1e5cf39cbc8030b82e09633c6c67d42",
+ "d5120a196164ff5a0ad7aa8c02e9b064", "1133759f3aee3a362a0ab668f6faf843",
+ "feb0ab7f515665f79fce213e8cd2fb10", "e86ea55c2d6d5cc69716535bd455c99f",
+ "e463da1b9d089b6ee82c041794257fd7", "27800e4af0cceeaf0a95c96275a7befe",
+ "f42e426481db00582b327eb2971bca96", "6127ff289833dde0270000d8240f36b7",
+ "cc5dbaf70e2fef7729a8e2ea9937fbcf", "51850b4e3e2a3919e110376fcb6318d3",
+ "d5ac7ac25eb1b5aee293b2a2ec9de775", "64ecc00b2e24a2f07df833fb50ce09c3",
+ };
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+struct CdefTestParam {
+ CdefTestParam(int subsampling_x, int subsampling_y, int rows4x4,
+ int columns4x4)
+ : subsampling_x(subsampling_x),
+ subsampling_y(subsampling_y),
+ rows4x4(rows4x4),
+ columns4x4(columns4x4) {}
+ int subsampling_x;
+ int subsampling_y;
+ int rows4x4;
+ int columns4x4;
+};
+
+std::ostream& operator<<(std::ostream& os, const CdefTestParam& param) {
+ return os << "subsampling(x/y): " << param.subsampling_x << "/"
+ << param.subsampling_y << ", (rows,columns)4x4: " << param.rows4x4
+ << ", " << param.columns4x4;
+}
+
+// TODO(b/154245961): rework the parameters for this test to match
+// CdefFilteringFuncs. It should cover 4x4, 8x4, 8x8 blocks and
+// primary/secondary strength combinations for both Y and UV.
+template <int bitdepth, typename Pixel>
+class CdefFilteringTest : public testing::TestWithParam<CdefTestParam> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ CdefFilteringTest() = default;
+ CdefFilteringTest(const CdefFilteringTest&) = delete;
+ CdefFilteringTest& operator=(const CdefFilteringTest&) = delete;
+ ~CdefFilteringTest() override = default;
+
+ protected:
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ CdefInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
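+ // The C functions are already installed by CdefInit_C() above; no
+ // substitutions are needed.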
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ CdefInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ CdefInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "AVX2/")) {
+ if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+ CdefInit_AVX2();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ memcpy(cur_cdef_filter_, dsp->cdef_filters, sizeof(cur_cdef_filter_));
+ }
+
+ void TestRandomValues(int num_runs);
+
+ uint16_t source_[kSourceBufferSize];
+ Pixel dest_[kMaxPlanes][kTestBufferSize];
+ int primary_strength_;
+ int secondary_strength_;
+ int damping_;
+ int direction_;
+ CdefTestParam param_ = GetParam();
+
+ CdefFilteringFuncs cur_cdef_filter_;
+};
+
+template <int bitdepth, typename Pixel>
+void CdefFilteringTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
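+ // Each parameter combination owns a block of 3 digests, one per plane:
+ // base ids 0/3 cover 4:4:4 with large/small blocks, 6/9 cover 4:2:2 and
+ // 12/15 cover 4:2:0; |plane| is added below to index within the block.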
+ const int id = static_cast<int>(param_.rows4x4 < 4) * 3 +
+ (param_.subsampling_x + param_.subsampling_y) * 6;
+ absl::Duration elapsed_time[kMaxPlanes];
+ for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+ for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+ const int subsampling_x = (plane == kPlaneY) ? 0 : param_.subsampling_x;
+ const int subsampling_y = (plane == kPlaneY) ? 0 : param_.subsampling_y;
+ const int block_width = 8 >> subsampling_x;
+ const int block_height = 8 >> subsampling_y;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() +
+ id + plane);
+ const int offset = 2 * kSourceStride + 2;
+ // Fill the boundaries with a large value so that cdef excludes them from
+ // the calculation.
+ const int plane_width = MultiplyBy4(param_.columns4x4) >> subsampling_x;
+ const int plane_height = MultiplyBy4(param_.rows4x4) >> subsampling_y;
+ for (int y = 0; y < plane_height; ++y) {
+ for (int x = 0; x < plane_width; ++x) {
+ source_[y * kSourceStride + x + offset] =
+ rnd.Rand16() & ((1 << bitdepth) - 1);
+ }
+ }
+ for (int y = 0; y < 2; ++y) {
+ Memset(&source_[y * kSourceStride], kCdefLargeValue, kSourceStride);
+ Memset(&source_[(y + plane_height + 2) * kSourceStride],
+ kCdefLargeValue, kSourceStride);
+ }
+ for (int y = 0; y < plane_height; ++y) {
+ Memset(&source_[y * kSourceStride + offset - 2], kCdefLargeValue, 2);
+ Memset(&source_[y * kSourceStride + offset + plane_width],
+ kCdefLargeValue, 2);
+ }
+ do {
+ int strength = rnd.Rand16() & 15;
+ if (strength == 3) ++strength;
+ primary_strength_ = strength << (bitdepth - 8);
+ } while (primary_strength_ == 0);
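+ // Secondary strength cannot take the value 3: the bitstream codes it as
+ // one of {0, 1, 2, 4}, hence the adjustment below.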
+ do {
+ int strength = rnd.Rand16() & 3;
+ if (strength == 3) ++strength;
+ secondary_strength_ = strength << (bitdepth - 8);
+ } while (secondary_strength_ == 0);
+ damping_ = (rnd.Rand16() & 3) + 3;
+ direction_ = (rnd.Rand16() & 7);
+
+ memset(dest_[plane], 0, sizeof(dest_[plane]));
+ const absl::Time start = absl::Now();
+ const int width_index = block_width >> 3;
+ if (cur_cdef_filter_[width_index][0] == nullptr) return;
+ cur_cdef_filter_[width_index][0](
+ source_ + offset, kSourceStride, block_height, primary_strength_,
+ secondary_strength_, damping_, direction_, dest_[plane],
+ kTestBufferStride * sizeof(dest_[0][0]));
+ elapsed_time[plane] += absl::Now() - start;
+ }
+ }
+
+ for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+ const char* expected_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ expected_digest = GetDigest8bpp(id + plane);
+ break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+ expected_digest = GetDigest10bpp(id + plane);
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ expected_digest = GetDigest12bpp(id + plane);
+ break;
+#endif
+ }
+ ASSERT_NE(expected_digest, nullptr);
+ test_utils::CheckMd5Digest(kCdef, kCdefFilterName, expected_digest,
+ reinterpret_cast<uint8_t*>(dest_[plane]),
+ sizeof(dest_[plane]), elapsed_time[plane]);
+ }
+}
+
+// Do not test single blocks with any subsampling. 2xH and Wx2 blocks are not
+// supported.
+const CdefTestParam cdef_test_param[] = {
+ CdefTestParam(0, 0, 4, 4), CdefTestParam(0, 0, 2, 2),
+ CdefTestParam(1, 0, 4, 4), CdefTestParam(1, 0, 2, 2),
+ CdefTestParam(1, 1, 4, 4), CdefTestParam(1, 1, 2, 2),
+};
+
+using CdefFilteringTest8bpp = CdefFilteringTest<8, uint8_t>;
+
+TEST_P(CdefFilteringTest8bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefFilteringTest8bpp, DISABLED_Speed) {
+ TestRandomValues(kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest8bpp,
+ testing::ValuesIn(cdef_test_param));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CdefFilteringTest8bpp,
+ testing::ValuesIn(cdef_test_param));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CdefFilteringTest8bpp,
+ testing::ValuesIn(cdef_test_param));
+#endif
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, CdefFilteringTest8bpp,
+ testing::ValuesIn(cdef_test_param));
+#endif // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using CdefFilteringTest10bpp = CdefFilteringTest<10, uint16_t>;
+
+TEST_P(CdefFilteringTest10bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefFilteringTest10bpp, DISABLED_Speed) {
+ TestRandomValues(kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest10bpp,
+ testing::ValuesIn(cdef_test_param));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CdefFilteringTest10bpp,
+ testing::ValuesIn(cdef_test_param));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using CdefFilteringTest12bpp = CdefFilteringTest<12, uint16_t>;
+
+TEST_P(CdefFilteringTest12bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefFilteringTest12bpp, DISABLED_Speed) {
+ TestRandomValues(kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest12bpp,
+ testing::ValuesIn(cdef_test_param));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_COMMON_H_
+#define LIBGAV1_SRC_DSP_COMMON_H_
+
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+enum { kSgrStride = kRestorationUnitWidth + 32 }; // anonymous enum
+
+// Self-guided projection filter.
+struct SgrProjInfo {
+ int index;
+ int multiplier[2];
+};
+
+struct WienerInfo {
+ static const int kVertical = 0;
+ static const int kHorizontal = 1;
+ int16_t number_leading_zero_coefficients[2];
+ alignas(kMaxAlignment) int16_t filter[2][(kWienerFilterTaps + 1) / 2];
+};
+
+struct RestorationUnitInfo : public MaxAlignedAllocable {
+ LoopRestorationType type;
+ SgrProjInfo sgr_proj_info;
+ WienerInfo wiener_info;
+};
+
+struct SgrBuffer {
+ alignas(kMaxAlignment) uint16_t sum3[4 * kSgrStride];
+ alignas(kMaxAlignment) uint16_t sum5[5 * kSgrStride];
+ alignas(kMaxAlignment) uint32_t square_sum3[4 * kSgrStride];
+ alignas(kMaxAlignment) uint32_t square_sum5[5 * kSgrStride];
+ alignas(kMaxAlignment) uint16_t ma343[4 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint16_t ma444[3 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint16_t ma565[2 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint32_t b343[4 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint32_t b444[3 * kRestorationUnitWidth];
+ alignas(kMaxAlignment) uint32_t b565[2 * kRestorationUnitWidth];
+ // The following 2 buffers are only used by the C functions. Since SgrBuffer
+ // is smaller than |wiener_buffer| in RestorationBuffer, which is a union,
+ // it's OK to always keep the following 2 buffers.
+ alignas(kMaxAlignment) uint8_t ma[kSgrStride]; // [0, 255]
+ // b is less than 2^16 for 8-bit. However, making it a template slows down the
+ // C function by 5%. So b is fixed to 32-bit.
+ alignas(kMaxAlignment) uint32_t b[kSgrStride];
+};
+
+union RestorationBuffer {
+ // For self-guided filter.
+ SgrBuffer sgr_buffer;
+ // For wiener filter.
+ // The array |intermediate| in Section 7.17.4, the intermediate results
+ // between the horizontal and vertical filters.
+ alignas(kMaxAlignment) int16_t
+ wiener_buffer[(kRestorationUnitHeight + kWienerFilterTaps - 1) *
+ kRestorationUnitWidth];
+};
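+
+// A compile-time restatement of the comment in SgrBuffer above, assuming the
+// default constants from src/utils/constants.h: |wiener_buffer| is the larger
+// member and therefore determines the union's size.
+static_assert(sizeof(SgrBuffer) <=
+ sizeof(int16_t) * (kRestorationUnitHeight + kWienerFilterTaps - 1) *
+ kRestorationUnitWidth,
+ "SgrBuffer is expected to be no larger than |wiener_buffer|.");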
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_COMMON_H_
--- /dev/null
+// Copyright 2023 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "absl/strings/match.h"
+#include "gtest/gtest.h"
+#include "src/dsp/x86/common_avx2_test.h"
+#include "src/dsp/x86/common_sse4_test.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+class CommonDspTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->name();
+ if (absl::StartsWith(test_case, "SSE41")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ } else if (absl::StartsWith(test_case, "AVX2")) {
+ if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ }
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CommonDspTest);
+
+#if LIBGAV1_ENABLE_AVX2
+TEST_F(CommonDspTest, AVX2RightShiftWithRoundingS16) {
+ AVX2RightShiftWithRoundingS16Test();
+}
+#endif // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_ENABLE_SSE4_1
+TEST_F(CommonDspTest, SSE41RightShiftWithRoundingS16) {
+ SSE41RightShiftWithRoundingS16Test();
+}
+#endif // LIBGAV1_ENABLE_SSE4_1
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/constants.h"
+
+#include <cstdint>
+
+namespace libgav1 {
+
+// Each set of 7 taps is padded with a 0 to easily align and pack into the high
+// and low 8 bytes. This way, we can load 16 at a time to fit mulhi and mullo.
+alignas(16) const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = {
+ {{-6, 10, 0, 0, 0, 12, 0, 0},
+ {-5, 2, 10, 0, 0, 9, 0, 0},
+ {-3, 1, 1, 10, 0, 7, 0, 0},
+ {-3, 1, 1, 2, 10, 5, 0, 0},
+ {-4, 6, 0, 0, 0, 2, 12, 0},
+ {-3, 2, 6, 0, 0, 2, 9, 0},
+ {-3, 2, 2, 6, 0, 2, 7, 0},
+ {-3, 1, 2, 2, 6, 3, 5, 0}},
+ {{-10, 16, 0, 0, 0, 10, 0, 0},
+ {-6, 0, 16, 0, 0, 6, 0, 0},
+ {-4, 0, 0, 16, 0, 4, 0, 0},
+ {-2, 0, 0, 0, 16, 2, 0, 0},
+ {-10, 16, 0, 0, 0, 0, 10, 0},
+ {-6, 0, 16, 0, 0, 0, 6, 0},
+ {-4, 0, 0, 16, 0, 0, 4, 0},
+ {-2, 0, 0, 0, 16, 0, 2, 0}},
+ {{-8, 8, 0, 0, 0, 16, 0, 0},
+ {-8, 0, 8, 0, 0, 16, 0, 0},
+ {-8, 0, 0, 8, 0, 16, 0, 0},
+ {-8, 0, 0, 0, 8, 16, 0, 0},
+ {-4, 4, 0, 0, 0, 0, 16, 0},
+ {-4, 0, 4, 0, 0, 0, 16, 0},
+ {-4, 0, 0, 4, 0, 0, 16, 0},
+ {-4, 0, 0, 0, 4, 0, 16, 0}},
+ {{-2, 8, 0, 0, 0, 10, 0, 0},
+ {-1, 3, 8, 0, 0, 6, 0, 0},
+ {-1, 2, 3, 8, 0, 4, 0, 0},
+ {0, 1, 2, 3, 8, 2, 0, 0},
+ {-1, 4, 0, 0, 0, 3, 10, 0},
+ {-1, 3, 4, 0, 0, 4, 6, 0},
+ {-1, 2, 3, 4, 0, 4, 4, 0},
+ {-1, 2, 2, 3, 4, 3, 3, 0}},
+ {{-12, 14, 0, 0, 0, 14, 0, 0},
+ {-10, 0, 14, 0, 0, 12, 0, 0},
+ {-9, 0, 0, 14, 0, 11, 0, 0},
+ {-8, 0, 0, 0, 14, 10, 0, 0},
+ {-10, 12, 0, 0, 0, 0, 14, 0},
+ {-9, 1, 12, 0, 0, 0, 12, 0},
+ {-8, 0, 0, 12, 0, 1, 11, 0},
+ {-7, 0, 0, 1, 12, 1, 9, 0}}};
+
+// A lookup table replacing the calculation of the variable s in Section 7.17.3
+// (Box filter process). The first index is sgr_proj_index (the lr_sgr_set
+// syntax element in the Spec, saved in the sgr_proj_info.index field of a
+// RestorationUnitInfo struct). The second index is pass (0 or 1).
+//
+// const uint8_t scale = kSgrProjParams[sgr_proj_index][pass * 2 + 1];
+// const uint32_t n2_with_scale = n * n * scale;
+// const uint32_t s =
+// ((1 << kSgrProjScaleBits) + (n2_with_scale >> 1)) / n2_with_scale;
+// 0 is an invalid value, corresponding to radius = 0, where the filter is
+// skipped.
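+// For example, the first entry below comes from radius 2 with scale 12 in
+// pass 0 (n = 25, s = ((1 << 20) + 3750) / 7500 = 140) and radius 1 with
+// scale 4 in pass 1 (n = 9, s = ((1 << 20) + 162) / 324 = 3236).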
+const uint16_t kSgrScaleParameter[16][2] = {
+ {140, 3236}, {112, 2158}, {93, 1618}, {80, 1438}, {70, 1295}, {58, 1177},
+ {47, 1079}, {37, 996}, {30, 925}, {25, 863}, {0, 2589}, {0, 1618},
+ {0, 1177}, {0, 925}, {56, 0}, {22, 0},
+};
+
+const uint8_t kCdefPrimaryTaps[2][2] = {{4, 2}, {3, 3}};
+
+// This is Cdef_Directions (section 7.15.3) with 2 padding entries at the
+// beginning and end of the table. The cdef direction range is [0, 7] and the
+// first index is offset +/-2. This removes the need to constrain the first
+// index to the same range using, e.g., & 7.
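+// For example, Cdef_Directions[(direction + offset) & 7] with offset in
+// [-2, 2] becomes kCdefDirectionsPadded[direction + 2 + offset].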
+const int8_t kCdefDirectionsPadded[12][2][2] = {
+ {{1, 0}, {2, 0}}, // Padding: Cdef_Directions[6]
+ {{1, 0}, {2, -1}}, // Padding: Cdef_Directions[7]
+ {{-1, 1}, {-2, 2}}, // Begin Cdef_Directions
+ {{0, 1}, {-1, 2}}, //
+ {{0, 1}, {0, 2}}, //
+ {{0, 1}, {1, 2}}, //
+ {{1, 1}, {2, 2}}, //
+ {{1, 0}, {2, 1}}, //
+ {{1, 0}, {2, 0}}, //
+ {{1, 0}, {2, -1}}, // End Cdef_Directions
+ {{-1, 1}, {-2, 2}}, // Padding: Cdef_Directions[0]
+ {{0, 1}, {-1, 2}}, // Padding: Cdef_Directions[1]
+};
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_CONSTANTS_H_
+#define LIBGAV1_SRC_DSP_CONSTANTS_H_
+
+// This file contains DSP related constants that have a direct relationship with
+// a DSP component.
+
+#include <cstdint>
+
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+enum {
+ kCflLumaBufferStride = 32,
+}; // anonymous enum
+
+extern const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8];
+
+// Values in this enum can be derived as the sum of subsampling_x and
+// subsampling_y (since the subsampling_x == 0 && subsampling_y == 1 case is
+// never allowed by the bitstream).
+enum SubsamplingType : uint8_t {
+ kSubsamplingType444, // subsampling_x = 0, subsampling_y = 0.
+ kSubsamplingType422, // subsampling_x = 1, subsampling_y = 0.
+ kSubsamplingType420, // subsampling_x = 1, subsampling_y = 1.
+ kNumSubsamplingTypes
+};
+
+extern const uint16_t kSgrScaleParameter[16][2];
+
+extern const uint8_t kCdefPrimaryTaps[2][2];
+
+extern const int8_t kCdefDirectionsPadded[12][2][2];
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_CONSTANTS_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kHorizontalOffset = 3;
+constexpr int kVerticalOffset = 3;
+
+// Compound prediction output ranges from ConvolveTest.ShowRange.
+// In some cases, the horizontal or vertical filter will be omitted. This table
+// shows the general case, where the downscaled horizontal output is input to
+// the vertical filter via the |intermediate_result| array. The final output is
+// either Pixel or compound values, depending on the |is_compound| variable.
+// Bitdepth: 8 Input range: [ 0, 255]
+// Horizontal upscaled range: [ -7140, 23460]
+// Horizontal downscaled range: [ -1785, 5865]
+// Vertical upscaled range: [ -328440, 589560]
+// Pixel output range: [ 0, 255]
+// Compound output range: [ -5132, 9212]
+//
+// Bitdepth: 10 Input range: [ 0, 1023]
+// Horizontal upscaled range: [ -28644, 94116]
+// Horizontal downscaled range: [ -7161, 23529]
+// Vertical upscaled range: [-1317624, 2365176]
+// Pixel output range: [ 0, 1023]
+// Compound output range: [ 3988, 61532]
+//
+// Bitdepth: 12 Input range: [ 0, 4095]
+// Horizontal upscaled range: [ -114660, 376740]
+// Horizontal downscaled range: [ -7166, 23546]
+// Vertical upscaled range: [-1318560, 2366880]
+// Pixel output range: [ 0, 4095]
+// Compound output range: [ 3974, 61559]
+
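+// In the scale functions below, |subpixel_x| and |step_x| (and the y
+// counterparts) carry kScaleSubPixelBits = 10 fractional bits: a step of 1024
+// advances the source position by one full pixel per output pixel. The 4-bit
+// filter phase is taken from the top fractional bits, hence
+// (p >> 6) & kSubPixelMask.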
+template <int bitdepth, typename Pixel>
+void ConvolveScale2D_C(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index, const int subpixel_x,
+ const int subpixel_y, const int step_x, const int step_y,
+ const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction,
+ const ptrdiff_t pred_stride) {
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical =
+ (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ kSubPixelTaps;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (2 * kMaxSuperBlockSizeInPixels + 8)];
+ const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+ const int max_pixel_value = (1 << bitdepth) - 1;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ // Note: assume the input src is already aligned to the correct start
+ // position.
+ int y = 0;
+ do {
+ int p = subpixel_x;
+ int x = 0;
+ do {
+ int sum = 0;
+ const Pixel* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src_x[k];
+ }
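+ // |kHalfSubPixelFilters| sums to 64 rather than 128, so one fewer
+ // rounding bit is needed.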
+ intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ p += step_x;
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += intermediate_stride;
+ } while (++y < intermediate_height);
+
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
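+ // 1023 == (1 << kScaleSubPixelBits) - 1: start from the fractional part
+ // of |subpixel_y|.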
+ int p = subpixel_y & 1023;
+ y = 0;
+ do {
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum +=
+ kHalfSubPixelFilters[filter_index][filter_id][k] *
+ intermediate[((p >> kScaleSubPixelBits) + k) * intermediate_stride +
+ x];
+ }
+ dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
+ max_pixel_value);
+ } while (++x < width);
+
+ dest += dest_stride;
+ p += step_y;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundScale2D_C(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int vertical_filter_index, const int subpixel_x, const int subpixel_y,
+ const int step_x, const int step_y, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical = kInterRoundBitsCompoundVertical;
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ kSubPixelTaps;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (2 * kMaxSuperBlockSizeInPixels + 8)];
+ const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<uint16_t*>(prediction);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ // Note: assume the input src is already aligned to the correct start
+ // position.
+ int y = 0;
+ do {
+ int p = subpixel_x;
+ int x = 0;
+ do {
+ int sum = 0;
+ const Pixel* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src_x[k];
+ }
+ intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ p += step_x;
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += intermediate_stride;
+ } while (++y < intermediate_height);
+
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
+ int p = subpixel_y & 1023;
+ y = 0;
+ do {
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum +=
+ kHalfSubPixelFilters[filter_index][filter_id][k] *
+ intermediate[((p >> kScaleSubPixelBits) + k) * intermediate_stride +
+ x];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
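+ // For 10/12bpp, bias the compound intermediate by kCompoundOffset so it
+ // is non-negative and fits in the uint16_t prediction buffer; the bias
+ // is removed when the two predictions are blended.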
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dest[x] = sum;
+ } while (++x < width);
+
+ dest += pred_stride;
+ p += step_y;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompound2D_C(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* LIBGAV1_RESTRICT prediction,
+ const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical = kInterRoundBitsCompoundVertical;
+ const int intermediate_height = height + kSubPixelTaps - 1;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ const auto* src = static_cast<const Pixel*>(reference) -
+ kVerticalOffset * src_stride - kHorizontalOffset;
+ auto* dest = static_cast<uint16_t*>(prediction);
+
+ // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+ assert(horizontal_filter_id != 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
+ }
+ intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += intermediate_stride;
+ } while (++y < intermediate_height);
+
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
+ // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+ assert(vertical_filter_id != 0);
+ y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+ intermediate[k * intermediate_stride + x];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dest[x] = sum;
+ } while (++x < width);
+
+ dest += pred_stride;
+ intermediate += intermediate_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called in single prediction mode, when both horizontal and vertical
+// filtering are required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void Convolve2D_C(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id, const int vertical_filter_id,
+ const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction,
+ const ptrdiff_t pred_stride) {
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical =
+ (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+ const int intermediate_height = height + kSubPixelTaps - 1;
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+ const int max_pixel_value = (1 << bitdepth) - 1;
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [4, 5].
+ // Similarly for height.
+ int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ const auto* src = static_cast<const Pixel*>(reference) -
+ kVerticalOffset * src_stride - kHorizontalOffset;
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+ assert(horizontal_filter_id != 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
+ }
+ intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += intermediate_stride;
+ } while (++y < intermediate_height);
+
+ // Vertical filter.
+ filter_index = GetFilterIndex(vertical_filter_index, height);
+ intermediate = intermediate_result;
+ // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+ assert(vertical_filter_id != 0);
+ y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+ intermediate[k * intermediate_stride + x];
+ }
+ dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
+ max_pixel_value);
+ } while (++x < width);
+
+ dest += dest_stride;
+ intermediate += intermediate_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is called in single prediction mode, when only horizontal filtering is
+// required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void ConvolveHorizontal_C(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int /*vertical_filter_index*/,
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* LIBGAV1_RESTRICT prediction,
+ const ptrdiff_t pred_stride) {
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int bits = kFilterBits - kRoundBitsHorizontal;
+ const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ const int max_pixel_value = (1 << bitdepth) - 1;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ dest[x] = Clip3(RightShiftWithRounding(sum, bits), 0, max_pixel_value);
+ } while (++x < width);
+
+ src += src_stride;
+ dest += dest_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is called in single prediction mode, when only vertical filtering is
+// required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void ConvolveVertical_C(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int vertical_filter_index,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* LIBGAV1_RESTRICT prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ const auto* src =
+ static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ // Copy filters must call ConvolveCopy().
+ assert(vertical_filter_id != 0);
+
+ const int max_pixel_value = (1 << bitdepth) - 1;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+ src[k * src_stride + x];
+ }
+ dest[x] = Clip3(RightShiftWithRounding(sum, kFilterBits - 1), 0,
+ max_pixel_value);
+ } while (++x < width);
+
+ src += src_stride;
+ dest += dest_stride;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCopy_C(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* LIBGAV1_RESTRICT prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+ int y = 0;
+ do {
+ memcpy(dest, src, width * sizeof(Pixel));
+ src += reference_stride;
+ dest += pred_stride;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundCopy_C(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* LIBGAV1_RESTRICT prediction,
+ const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsVertical =
+ ((bitdepth == 12) ? kInterRoundBitsVertical12bpp
+ : kInterRoundBitsVertical) -
+ kInterRoundBitsCompoundVertical;
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<uint16_t*>(prediction);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
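+ // Once shifted left by kRoundBitsVertical, the bias
+ // (1 << bitdepth) + (1 << (bitdepth - 1)) equals kCompoundOffset for
+ // both 10bpp and 12bpp.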
+ int sum = (bitdepth == 8) ? 0 : ((1 << bitdepth) + (1 << (bitdepth - 1)));
+ sum += src[x];
+ dest[x] = sum << kRoundBitsVertical;
+ } while (++x < width);
+ src += src_stride;
+ dest += pred_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called in compound prediction mode, when only horizontal filtering is
+// required.
+// The output is not clipped to the valid pixel range; it will be blended with
+// another predictor to generate the final prediction of the block.
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundHorizontal_C(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<uint16_t*>(prediction);
+ // Copy filters must call ConvolveCopy().
+ assert(horizontal_filter_id != 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dest[x] = sum;
+ } while (++x < width);
+
+ src += src_stride;
+ dest += pred_stride;
+ } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called in compound prediction mode, when only vertical filtering is
+// required.
+// The output is not clipped to the valid pixel range; it will be blended with
+// another predictor to generate the final prediction of the block.
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundVertical_C(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int vertical_filter_index, const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ const auto* src =
+ static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ // Copy filters must call ConvolveCopy().
+ assert(vertical_filter_id != 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ int sum = 0;
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+ src[k * src_stride + x];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dest[x] = sum;
+ } while (++x < width);
+ src += src_stride;
+ dest += pred_stride;
+ } while (++y < height);
+}
+
+// This function is used when intra block copy is present.
+// It is called in single prediction mode for the U/V planes, where the
+// reference block is from the current frame and both horizontal and vertical
+// filtering are required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+template <int bitdepth, typename Pixel>
+void ConvolveIntraBlockCopy2D_C(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+ assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+ assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ const int intermediate_height = height + 1;
+ uint16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + 1)];
+ uint16_t* intermediate = intermediate_result;
+ // Note: allow vertical access to height + 1. Because this function is only
+ // used for the U/V planes of intra block copy, such access is guaranteed
+ // to be within the prediction block.
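+ // The 2D filtering reduces to averaging each 2x2 neighborhood with
+ // rounding: dest = (src[0][0] + src[0][1] + src[1][0] + src[1][1] + 2) >> 2.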
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ intermediate[x] = src[x] + src[x + 1];
+ } while (++x < width);
+
+ src += src_stride;
+ intermediate += width;
+ } while (++y < intermediate_height);
+
+ intermediate = intermediate_result;
+ y = 0;
+ do {
+ int x = 0;
+ do {
+ dest[x] =
+ RightShiftWithRounding(intermediate[x] + intermediate[x + width], 2);
+ } while (++x < width);
+
+ intermediate += width;
+ dest += dest_stride;
+ } while (++y < height);
+}
+
+// This function is used when intra block copy is present.
+// It is called in single prediction mode for the U/V planes, where the
+// reference block is from the current frame and only horizontal or vertical
+// filtering is required.
+// The output is the single prediction of the block, clipped to the valid
+// pixel range.
+// The filtering of intra block copy is simply the average of current and
+// the next pixel.
+template <int bitdepth, typename Pixel, bool is_horizontal>
+void ConvolveIntraBlockCopy1D_C(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+ assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+ assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+ const auto* src = static_cast<const Pixel*>(reference);
+ const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+ auto* dest = static_cast<Pixel*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+ const ptrdiff_t offset = is_horizontal ? 1 : src_stride;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ dest[x] = RightShiftWithRounding(src[x] + src[x + offset], 1);
+ } while (++x < width);
+
+ src += src_stride;
+ dest += dest_stride;
+ } while (++y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
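+ // The convolve table is indexed as [is_intra_block_copy][is_compound]
+ // [has_vertical_filter][has_horizontal_filter].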
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<8, uint8_t>;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<8, uint8_t>;
+ dsp->convolve[0][0][1][1] = Convolve2D_C<8, uint8_t>;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<8, uint8_t>;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<8, uint8_t>;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<8, uint8_t>;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<8, uint8_t>;
+
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/true>;
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/false>;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<8, uint8_t>;
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_C<8, uint8_t>;
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<8, uint8_t>;
+#else // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCopy
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+ dsp->convolve[0][0][1][1] = Convolve2D_C<8, uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<8, uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<8, uint8_t>;
+#endif
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D
+ dsp->convolve_scale[0] = ConvolveScale2D_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<10, uint16_t>;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<10, uint16_t>;
+ dsp->convolve[0][0][1][1] = Convolve2D_C<10, uint16_t>;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<10, uint16_t>;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<10, uint16_t>;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<10, uint16_t>;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<10, uint16_t>;
+
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/true>;
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/false>;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<10, uint16_t>;
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_C<10, uint16_t>;
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<10, uint16_t>;
+#else // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCopy
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveHorizontal
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveVertical
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Convolve2D
+ dsp->convolve[0][0][1][1] = Convolve2D_C<10, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundCopy
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundHorizontal
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundVertical
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompound2D
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<10, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopy
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockHorizontal
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockVertical
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlock2D
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<10, uint16_t>;
+#endif
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveScale2D
+ dsp->convolve_scale[0] = ConvolveScale2D_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundScale2D
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<12, uint16_t>;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<12, uint16_t>;
+ dsp->convolve[0][0][1][1] = Convolve2D_C<12, uint16_t>;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<12, uint16_t>;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<12, uint16_t>;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<12, uint16_t>;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<12, uint16_t>;
+
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/true>;
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/false>;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<12, uint16_t>;
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_C<12, uint16_t>;
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<12, uint16_t>;
+#else // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCopy
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveHorizontal
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveVertical
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Convolve2D
+ dsp->convolve[0][0][1][1] = Convolve2D_C<12, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundCopy
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundHorizontal
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundVertical
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompound2D
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<12, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockCopy
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockHorizontal
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockVertical
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlock2D
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<12, uint16_t>;
+#endif
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+#ifndef LIBGAV1_Dsp12bpp_ConvolveScale2D
+ dsp->convolve_scale[0] = ConvolveScale2D_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundScale2D
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<12, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+void ConvolveInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_CONVOLVE_H_
+#define LIBGAV1_SRC_DSP_CONVOLVE_H_
+
+#include <cassert>
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/convolve_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.). The
+// order is important, as each header tests for a superior version before
+// setting the base.
+// clang-format off
+#include "src/dsp/x86/convolve_avx2.h"
+#include "src/dsp/x86/convolve_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve and Dsp::convolve_scale. This function is not
+// thread-safe.
+void ConvolveInit_C();
+
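+// Maps a filter index in [0, 5] to its tap count: {6, 6, 8, 2, 4, 4}.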
+inline int GetNumTapsInFilter(const int filter_index) {
+ if (filter_index < 2) {
+ // Despite the names these only use 6 taps.
+ // kInterpolationFilterEightTap
+ // kInterpolationFilterEightTapSmooth
+ return 6;
+ }
+
+ if (filter_index == 2) {
+ // kInterpolationFilterEightTapSharp
+ return 8;
+ }
+
+ if (filter_index == 3) {
+ // kInterpolationFilterBilinear
+ return 2;
+ }
+
+ assert(filter_index > 3);
+ // For small sizes (width/height <= 4) the large filters are replaced with 4
+ // tap options.
+ // If the original filters were |kInterpolationFilterEightTap| or
+ // |kInterpolationFilterEightTapSharp| then it becomes
+ // |kInterpolationFilterSwitchable|.
+ // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+ // tap filter.
+ return 4;
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_CONVOLVE_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants used for convolve implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
+
+constexpr int kIntermediateAllocWidth = kMaxSuperBlockSizeInPixels;
+constexpr int kIntermediateStride = 8;
+constexpr int kHorizontalOffset = 3;
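+// Shifting a sub-pixel position right by this amount leaves the 4-bit filter
+// id, as in (subpixel_x >> kFilterIndexShift) & 0xF.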
+constexpr int kFilterIndexShift = 6;
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// The convolve function will access at most (block_height + 7) rows/columns
+// from the beginning.
+constexpr int kMaxBlockWidth = kMaxSuperBlockSizeInPixels + kSubPixelTaps;
+constexpr int kMaxBlockHeight = kMaxSuperBlockSizeInPixels + kSubPixelTaps;
+
+// Test all the filters in |kSubPixelFilters|. There are 6 different filters but
+// filters [4] and [5] are only reached through GetFilterIndex().
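+// 4 * 16 runs pair each of the 4 directly selectable filter groups
+// (|{vertical,horizontal}_index| cycles modulo 4 every 16 iterations in
+// Test()) with all 16 sub-pixel filter ids.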
+constexpr int kMinimumViableRuns = 4 * 16;
+
+struct ConvolveTestParam {
+ enum BlockSize {
+ kBlockSize2x2,
+ kBlockSize2x4,
+ kBlockSize4x2,
+ kBlockSize4x4,
+ kBlockSize4x8,
+ kBlockSize8x2,
+ kBlockSize8x4,
+ kBlockSize8x8,
+ kBlockSize8x16,
+ kBlockSize16x8,
+ kBlockSize16x16,
+ kBlockSize16x32,
+ kBlockSize32x16,
+ kBlockSize32x32,
+ kBlockSize32x64,
+ kBlockSize64x32,
+ kBlockSize64x64,
+ kBlockSize64x128,
+ kBlockSize128x64,
+ kBlockSize128x128,
+ kNumBlockSizes
+ };
+
+ static constexpr int kBlockWidth[kNumBlockSizes] = {
+ 2, 2, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64, 64, 128, 128};
+ static constexpr int kBlockHeight[kNumBlockSizes] = {
+ 2, 4, 2, 4, 8, 2, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64, 128, 64, 128};
+
+ explicit ConvolveTestParam(BlockSize block_size)
+ : block_size(block_size),
+ width(kBlockWidth[block_size]),
+ height(kBlockHeight[block_size]) {}
+
+ BlockSize block_size;
+ int width;
+ int height;
+};
+
+#if !LIBGAV1_CXX17
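+// Before C++17, odr-used static constexpr data members require out-of-line
+// definitions.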
+constexpr int ConvolveTestParam::kBlockWidth[kNumBlockSizes]; // static.
+constexpr int ConvolveTestParam::kBlockHeight[kNumBlockSizes]; // static.
+#endif
+
+const char* GetConvolveDigest8bpp(int id) {
+ // Entries containing 'XXXXX...' are skipped. See the test for details.
+ static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 16] = {
+ "ae5977a4ceffbac0cde72a04a43a9d57", "6cf5f791fe0d8dcd3526be3c6b814035",
+ "d905dfcad930aded7718587c05b48aaf", "6baf153feff04cc5b7e87c0bb60a905d",
+ "871ed5a69ca31e6444faa720895949bf", "c9cf1deba08dac5972b3b0a43eff8f98",
+ "68e2f90eaa0ab5da7e6f5776993f7eea", "f1f8282fb33c30eb68c0c315b7a4bc01",
+ "9412064b0eebf8123f23d74147d04dff", "cc08936effe309ab9a4fa1bf7e28e24e",
+ "36cbef36fa21b98df03536c918bf752a", "9d0da6321cf5311ea0bdd41271763030",
+ "55a10165ee8a660d7dddacf7de558cdd", "ac7fc9f9ea7213743fae5a023faaaf08",
+ "077e1b7b355c7ab3ca40230ee8efd8ea", "7a3e8de2a1caae206cf3e51a86dfd15a",
+ "1ddf9020f18fa7883355cf8c0881186a", "2377dd167ef2707978bed6f10ffd4e76",
+ "f918e0e4422967c6a7e47298135c7ae9", "b2264e129636368b5496760b39e64b7a",
+ "1168251e6261e2ff1fa69a93226dbd76", "4821befdf63f8c6da6440afeb57f320f",
+ "c30fc44d83821141e84cc4793e127301", "a8293b933d9f2e5d7f922ea40111d643",
+ "354a54861a94e8b027afd9931e61f997", "b384e9e3d81f9f4f9024028fbe451d8b",
+ "eeeb8589c1b31cbb565154736ca939ec", "f49dab626ddd977ed171f79295c24935",
+ "78d2f27e0d4708cb16856d7d40dc16fb", "9d2393ea156a1c2083f5b4207793064b",
+ "a9c62745b95c66fa497a524886af57e2", "2c614ec4463386ec075a0f1dbb587933",
+ "7a8856480d752153370240b066b90f6a", "beaef1dbffadc701fccb7c18a03e3a41",
+ "72b1e700c949d06eaf62d664dafdb5b6", "684f5c3a25a080edaf79add6e9137a8e",
+ "3be970f49e4288988818b087201d54da", "d2b9dba2968894a414756bb510ac389a",
+ "9a3215eb97aedbbddd76c7440837d040", "4e317feac6da46addf0e8b9d8d54304b",
+ "d2f5ca2b7958c332a3fb771f66da01f0", "7aec92c3b65e456b64ae285c12b03b0d",
+ "f72a99ad63f6a88c23724e898b705d21", "07a1f07f114c4a38ba08d2f44e1e1132",
+ "26b9de95edb45b31ac5aa19825831c7a", "4e4677a0623d44237eb8d6a622cdc526",
+ "c1b836a6ce023663b90db0e320389414", "5befcf222152ebc8d779fcc10b95320a",
+ "62adf407fc27d8682ced4dd7b55af14e", "35be0786a072bf2f1286989261bf6580",
+ "90562fc42dc5d879ae74c4909c1dec30", "a1427352f9e413975a0949e2b300c657",
+ "bcbc418bc2beb243e463851cd95335a9", "cb8fedcbecee3947358dc61f95e56530",
+ "0d0154a7d573685285a83a4cf201ac57", "b14bd8068f108905682b83cc15778065",
+ "c96c867d998473197dde9b587be14e3a", "f596c63c7b14cada0174e17124c83942",
+ "eb2822ad8204ed4ecbf0f30fcb210498", "538ce869ffd23b6963e61badfab7712b",
+ "6bbcc075f8b768a02cdc9149f150326d", "4ae70d9db2ec36885394db7d59bdd4f7",
+ "5fee162fe52c11c823db4d5ede370654", "9365186c59ef66d9def40f437022ad93",
+ "0f95fb0276c9c7910937fbdf75f2811d", "356d4003477283e157c8d2b5a79d913c",
+ "b355dab2dbb6f5869018563eece22862", "cf6ff8c43d8059cea6090a23ab66a0ef",
+ "a336f8b7bcf188840ca65c0d0e66518a", "de953f03895923359c6a719e6a537b89",
+ "8463ade9347ed602663e2cec5c4c3fe6", "392de11ffcd5c2ecf3db3480ee135340",
+ "bddd31e3e852712e6244b616622af83d", "30a36245c40d978fc8976b442a8600c3",
+ "93aa662b988b8502e5ea95659eafde59", "70440ba9ee7f9d16d297dbb49e54a56e",
+ "1eb2be4c05b50e427e29c72fa566bff5", "52c0980bae63e8459e82eee7d8af2334",
+ "75e57104d6058cd2bce1d3d8142d273d", "b4c735269ade44419169adbd852d5ddc",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "a7305087fae23de53d21a6909009ff69",
+ "8dcce009395264379c1a51239f4bb22c", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "8dcce009395264379c1a51239f4bb22c", "d90a69e7bae8aa46ed0e1e5f911d7a07",
+ "6ab4dc87be03be1dcc5d956ca819d938", "6ab4dc87be03be1dcc5d956ca819d938",
+ "8f2afdb2f03cd04ffacd421b958caaa0", "710ccecc103033088d898a2b924551fb",
+ "710ccecc103033088d898a2b924551fb", "a4093e3e5902dd659407ce6471635a4e",
+ "375d7f5358d7a088a498b8b3aaecc0d5", "375d7f5358d7a088a498b8b3aaecc0d5",
+ "08867ea5cc38c705ec52af821bc4736a", "2afb540e8063f58d1b03896486c5e89b",
+ "2afb540e8063f58d1b03896486c5e89b", "6ce47b11d2e60c5d183c84ce9f2e46cc",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "a5a1ac658d7ce4a846a32b9fcfaa3475",
+ "2370f4e4a83edf91b7f504bbe4b00e90", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "ae5464066a049622a7a264cdf9394b55", "45368b6db3d1fee739a64b0bc823ea9c",
+ "8dff0f28192d9f8c0bf7fb5405719dd8", "632738ef3ff3021cff45045c41978849",
+ "f7ec43384037e8d6c618e0df826ec029", "a6bc648197781a2dc99c487e66464320",
+ "1112ebd509007154c72c5a485b220b62", "9714c4ce636b6fb0ad05cba246d48c76",
+ "2c93dde8884f09fb5bb5ad6d95cde86d", "a49e6160b5d1b56bc2046963101cd606",
+ "7f084953976111e9f65b57876e7552b1", "0846ec82555b66197c5c45b08240fbcc",
+ "ca7471c126ccd22189e874f0a6e41960", "0802b6318fbd0969a33de8fdfcd07f10",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "3b1ceebf0579fcbbfd6136938c595b91",
+ "ecafabcad1045f15d31ce2f3b13132f2", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "5f211eba020e256a5781b203c5aa1d2e", "3b04497634364dd2cd3f2482b5d4b32f",
+ "a8ac7b5dc65ffb758b0643508a0e744e", "561ed8be43c221a561f8885a0d74c7ef",
+ "8159619fc234598c8c75154d80021fd4", "8f43645dce92cf7594aa4822aa53b17d",
+ "b6ccddb7dfa4eddc87b4eff08b5a3195", "b4e605327b28db573d88844a1a09db8d",
+ "15b00a15d1cc6cc96ca85d00b167e4dd", "7bf911888c11a9fefd604b8b9c82e9a1",
+ "bfb69b4d7d4aed73cfa75a0f55b66440", "034d1d62581bd0d840c4cf1e28227931",
+ "8cba849640e9e2859d509bc81ca94acd", "bc79acf2a0fe419194cdb4529bc7dcc8",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "3bfad931bce82335219e0e29c15f2b21",
+ "68a701313d2247d2b32636ebc1f2a008", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "05afe1f40d37a45a97a5e0aadd5066fb", "9e1f0e0bddb58d15d0925eeaede9b84c",
+ "03313cdaa593a1a7b4869010dcc7b241", "88a50d2b4107ee5b5074b2520183f8ac",
+ "ac50ea9f7306da95a5092709442989cf", "739b17591437edffd36799237b962658",
+ "b8a7eb7dd9c216e240517edfc6489397", "75b755f199dbf4a0e5ebbb86c2bd871d",
+ "31b0017ba1110e3d70b020901bc15564", "0a1aa8f5ecfd11ddba080af0051c576a",
+ "536181ee90de883cc383787aec089221", "29f82b0f3e4113944bd28aacd9b8489a",
+ "ee3e76371240d1f1ff811cea6a7d4f63", "17a20dbbf09feae557d40aa5818fbe76",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "6baf153feff04cc5b7e87c0bb60a905d",
+ "871ed5a69ca31e6444faa720895949bf", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "68e2f90eaa0ab5da7e6f5776993f7eea", "f1f8282fb33c30eb68c0c315b7a4bc01",
+ "9412064b0eebf8123f23d74147d04dff", "cc08936effe309ab9a4fa1bf7e28e24e",
+ "36cbef36fa21b98df03536c918bf752a", "9d0da6321cf5311ea0bdd41271763030",
+ "55a10165ee8a660d7dddacf7de558cdd", "ac7fc9f9ea7213743fae5a023faaaf08",
+ "077e1b7b355c7ab3ca40230ee8efd8ea", "7a3e8de2a1caae206cf3e51a86dfd15a",
+ "1ddf9020f18fa7883355cf8c0881186a", "2377dd167ef2707978bed6f10ffd4e76",
+ "f918e0e4422967c6a7e47298135c7ae9", "b2264e129636368b5496760b39e64b7a",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "4cfad2c437084a93ea76913e21c2dd89",
+ "d372f0c17bce98855d6d59fbee814c3d", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "d99ffd2579eb781c30bc0df7b76ad61e", "4e139e57cbb049a0f4ef816adc48d026",
+ "be53b2507048e7ff50226d15c0b28865", "b73f3c1a10405de89d1f9e812ff73b5a",
+ "c7d51b1f2df49ab83962257e8a5934e5", "159e443d79cc59b11ca4a80aa7aa09be",
+ "6ef14b14882e1465b0482b0e0b16d8ce", "22a8d287b425c870f40c64a50f91ce54",
+ "f1d96db5a2e0a2160df38bd96d28d19b", "637d1e5221422dfe9a6dbcfd7f62ebdd",
+ "f275af4f1f350ffaaf650310cb5dddec", "f81c4d6b001a14584528880fa6988a87",
+ "a5a2f9c2e7759d8a3dec1bc4b56be587", "2317c57ab69a36eb3bf278cf8a8795a3",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "1a0bdfc96a3b9fd904e658f238ab1076",
+ "56d16e54afe205e97527902770e71c71", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "1f7b5b8282ff3cf4d8e8c52d80ef5b4d", "79e9e260a2028c5fe320005c272064b9",
+ "2418ebcdf85551b9ae6e3725f04aae6d", "98bdf907ebacacb734c9eef1ee727c6e",
+ "4dd5672d53c8f359e8f80badaa843dfc", "a1bef519bbf07138e2eec5a91694de46",
+ "df1cb51fe1a937cd7834e973dc5cb814", "317fe65abf81ef3ea07976ef8667baeb",
+ "2da29da97806ae0ee300c5e69c35a4aa", "555475f5d1685638169ab904447e4f13",
+ "b3e3a6234e8045e6182cf90a09f767b2", "849dfeca59074525dea59681a7f88ab4",
+ "39a68af80be11e1682b6f3c4ede33530", "b22d765af176d87e7d3048b4b89b86ad",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "b8a710baa6a9fc784909671d450ecd99",
+ "f9e6a56382d8d12da676d6631bb6ef75", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "3bf8e11e18527b16f0d7c0361d74a52d", "b9ff54c6f1e3b41fc7fc0f3fa0e75cf2",
+ "06ef1504f31af5f173d3317866ca57cb", "635e8ee11cf04d73598549234ad732a0",
+ "fab693410d59ee88aa2895527efc31ac", "3041eb26c23a63a587fbec623919e2d2",
+ "c61d99d5daf575664fb7ad64976f4b03", "822f6c4eb5db760468d822b21f48d94d",
+ "3f6fcb9fae3666e085b9e29002a802fc", "d9b9fecd195736a6049c528d4cb886b5",
+ "fed17fc391e6c3db4aa14ea1d6596c87", "d0d3482d981989e117cbb32fc4550267",
+ "39561688bf6680054edbfae6035316ce", "087c5992ca6f829e1ba4ba5332d67947",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+
+const char* GetConvolveScaleDigest8bpp(int id) {
+ // Entries containing 'XXXXX...' are skipped. See the test for details.
+ static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 2] = {
+ "0291a23f2ac4c40b5d8e957e63769904", "1d48447857472d6455af10d5526f6827",
+ "409b2278d6d372248f1891ca0dd12760", "9e416606a3f82fe5bb3f7182e4f42c2d",
+ "e126563f859ddd5c5ffde6f641168fad", "9bad4f1b7e1865f814b6fd5620816ebd",
+ "50e5e5a57185477cb2af83490c33b47c", "3d2fb301c61d7fbd0e21ac263f7ac552",
+ "5920032c6432c80c6e5e61b684018d13", "07ada64d24339488cdce492e6e0c6b0d",
+ "aaf1589aff6d062a87c627ab9ba20e3e", "91adf91bb24d2c4ea3f882bdf7396e33",
+ "1d17a932a68bb1f199f709e7725fe44b", "07716c63afda034cb386511ea25a63b5",
+ "cca17ef3324c41d189e674a059ef1255", "37d17e70619823a606c0b5f74bf2e33b",
+ "ba8ed5474c187c8e8d7f82a6a29ee860", "27663f037973ebe82ec10252a4d91299",
+ "24c27e187e8d5a2bbfa0fef9046d3eb0", "9854fdc91a48e3bd4639edcc940e5c09",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "a71907c60a9f1f81972a2859ae54a805",
+ "817bc3bf0c77abc4186eac39f2320184", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "4e7182a8b226982e2678abcf5f83325d", "50cef7c6e57544135f102226bb95bed9",
+ "225e054dbcfff05b1c8b0792c731449e", "16eb63f03839159f3af0e08be857170f",
+ "c8e5d111a2e3f4487330a8bd893cb894", "4fd99eaf9c160442aab35b9bdc5d275b",
+ "8b0f61bfb30747d4c9215618ac42557c", "1df78022da202cefb9a8100b114152d9",
+ "378466e1eda63dbc03565b78af8e723f", "28ea721411fbf5fc805035be9a384140",
+ "4fed5d4163a3bfcc6726a42f20410b0a", "55abfca0c820771bd926e4b94f66a499",
+ "6c8b8ef0a78859c768e629e1decc0019", "d0ead286b5ba3841d24dd114efbfef0a",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetConvolveDigest10bpp(int id) {
+ // Entries containing 'XXXXX...' are skipped. See the test for details.
+ static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 16] = {
+ "b1b6903d60501c7bc11e5285beb26a52", "a7855ed75772d7fa815978a202bbcd9f",
+ "bde291a4e8087c085fe8b3632f4d7351", "238980eebc9e63ae3eea2771c7a70f12",
+ "0eac13431bd7d8a573318408a72246d5", "d05a237ed7a9ca877256b71555b1b8e4",
+ "73438155feb62595e3e406921102d748", "5871e0e88a776840d619670fbf107858",
+ "1c6376ce55c9ee9e35d432edb1ffb3b7", "d675e0195c9feca956e637f3f1959f40",
+ "b5681673903ade13d69e295f82fdd009", "3c43020105ae93a301404b4cd6238654",
+ "dd2c5880a94ed3758bfea0b0e8c78286", "4ebb1a7b25a39d8b9868ec8a1243103f",
+ "d34ec07845cd8523651e5f5112984a14", "2ce55308d873f4cd244f16da2b06e06e",
+ "a4bb5d5ff4b25f391265b5231049a09a", "c9106e0c820b03bcdde3aa94efc11a3e",
+ "7ec2eae9e118506da8b33440b399511a", "78de867c8ee947ed6d29055747f26949",
+ "a693b4bd0334a3b98d45e67d3985bb63", "156de3172d9acf3c7f251cd7a18ad461",
+ "e545b8a3ff958f8363c7968cbae96732", "7842b2047356c1417d9d88219707f1a1",
+ "1a487c658d684314d91bb6d961a94672", "94b3e5bcd6b849b66a4571ec3d23f9be",
+ "0635a296be01b7e641de98ee27c33cd2", "82dc120bf8c2043bc5eee81007309ebf",
+ "58c826cad3c14cdf26a649265758c58b", "f166254037c0dfb140f54cd7b08bddfe",
+ "74ab206f14ac5f62653cd3dd71a7916d", "5621caef7cc1d6522903290ccc5c2cb8",
+ "78ec6cf42cce4b1feb65e076c78ca241", "42188e2dbb4e02cd353552ea147ad03f",
+ "f9813870fc27941a7c00a0443d7c2fe7", "20b14a6b5af7aa356963bcaaf23d230d",
+ "9c9c41435697f75fa118b6d6464ee7cb", "38816245ed832ba313fefafcbed1e5c8",
+ "5d34137cc8ddba75347b0fa1d0a91791", "465dcb046a0449b9dfb3e0b297aa3863",
+ "3e787534dff83c22b3033750e448865a", "4c91f676a054d582bcae1ca9adb87a31",
+ "eab5894046a99ad0a1a12c91b0f37bd7", "765b4cfbfc1a4988878c412d53bcb597",
+ "bc63b29ec78c1efec5543885a45bb822", "91d6bdbc62d4bb80c9b371d9704e3c9e",
+ "cecd57396a0033456408f3f3554c6912", "5b37f94ef136c1eb9a6181c19491459c",
+ "716ba3a25b454e44b46caa42622c128c", "9076f58c4ab20f2f06d701a6b53b1c4f",
+ "d3212ab3922f147c3cf126c3b1aa17f6", "b55fea77f0e14a8bf8b6562b766fe91f",
+ "59b578268ff26a1e21c5b4273f73f852", "16761e7c8ba2645718153bed83ae78f6",
+ "a9e9805769fe1baf5c7933793ccca0d8", "553a2c24939dff18ec5833c77f556cfb",
+ "5c1ec75a160c444fa90abf106fa1140e", "2266840f11ac4c066d941ec473b1a54f",
+ "9e194755b2a37b615a517d5f8746dfbb", "bbf86f8174334f0b8d869fd8d58bf92d",
+ "fd1da8d197cb385f7917cd296d67afb9", "a984202c527b757337c605443f376915",
+ "c347f4a58fd784c5e88c1a23e4ff15d2", "29cbaadbff9adf4a3d49bd9900a9dd0b",
+ "c5997b802a6ba1cf5ba1057ddc5baa7e", "4f750f6375524311d260306deb233861",
+ "59f33727e5beeb783a057770bec7b4cd", "0654d72f22306b28d9ae42515845240c",
+ "6c9d7d9e6ef81d76e775a85c53abe209", "a35f435ccc67717a49251a07e62ae204",
+ "c5325015cb0b7c42839ac4aa21803fa0", "f81f31f1585c0f70438c09e829416f20",
+ "ab10b22fb8dd8199040745565b28595d", "0d928d6111f86c60ccefc6c6604d5659",
+ "4ed1a6200912995d4f571bdb7822aa83", "92e31a45513582f386dc9c22a57bbbbd",
+ "6dbf310a9c8d85f76306d6a35545f8af", "80fce29dc82d5857c1ed5ef2aea16835",
+ "14f2c5b9d2cd621c178a39f1ec0c38eb", "da54cfb4530841bda29966cfa05f4879",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "7e3fa9c03bc3dfbdeb67f24c5d9a49cd",
+ "f3454ca93cbb0c8c09b0695d90a0df3d", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "f3454ca93cbb0c8c09b0695d90a0df3d", "1a77d2af4d2b6cf8737cfbcacacdc4e4",
+ "89bec831efea2f88129dedcad06bb3fa", "89bec831efea2f88129dedcad06bb3fa",
+ "dead0fe4030085c22e92d16bb110de9d", "306a2f5dfd675df4ed9af44fd5cac8c0",
+ "306a2f5dfd675df4ed9af44fd5cac8c0", "9d01c946a12f5ef9d9cebd9816e06014",
+ "768f63912e43148c13688d7f23281531", "768f63912e43148c13688d7f23281531",
+ "2e7927158e7b8e40e7269fc909fb584b", "123028e18c2bfb334e34adb5a4f67de4",
+ "123028e18c2bfb334e34adb5a4f67de4", "2c979c2bddef79a760e72a802f83cc76",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "da1a6ff2be03ec8acde4cb1cd519a6f0",
+ "a4ca37cb869a0dbd1c4a2dcc449a8f31", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "1b5d1d4c7be8d5ec00a42a49eecf918f", "98b77e88b0784baaea64c98c8707fe46",
+ "8148788044522edc3c497e1017efe2ce", "acf60abeda98bbea161139b915317423",
+ "262c96b1f2c4f85c86c0e9c77fedff1e", "f35a3d13516440f9168076d9b07c9e98",
+ "13782526fc2726100cb3cf375b3150ed", "13c07441b47b0c1ed80f015ac302d220",
+ "02880fde51ac991ad18d8986f4e5145c", "aa25073115bad49432953254e7dce0bc",
+ "69e3361b7199e10e75685b90fb0df623", "2f8ab35f6e7030e82ca922a68b29af4a",
+ "452f91b01833c57db4e909575a029ff6", "1fabf0655bedb671e4d7287fec8119ba",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "d54206c34785cc3d8a06c2ceac46378c",
+ "85a11892ed884e3e74968435f6b16e64", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "16434230d24b9522ae2680e8c37e1b95", "963dea92f3efbb99137d1de9c56728d3",
+ "b72fb6a9a073c2fe65013af1842dc9b0", "86fa0c299737eb499cbcdce94abe2d33",
+ "6b80af04470b83673d98f46925e678a5", "65baca6167fe5249f7a839ce5b2fd591",
+ "e47ded6c0eec1d5baadd02aff172f2b1", "c0950e609f278efb7050d319a9756bb3",
+ "9051290279237f9fb1389989b142d2dd", "34cdc1be291c95981c98812c5c343a15",
+ "5b64a6911cb7c3d60bb8f961ed9782a2", "7133de9d03a4b07716a12226b5e493e8",
+ "3594eff52d5ed875bd9655ddbf106fae", "90d7e13aa2f9a064493ff2b3b5b12109",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "b1f26ee13df2e14a757416ba8a682278",
+ "996b6c166f9ed25bd07ea6acdf7597ff", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "34895d4c69a6c3303693e6f431bcd5d8", "c9497b00cb1bc3363dd126ffdddadc8e",
+ "1e461869bb2ee9b6069c5e52cf817291", "8d7f1d7ea6a0dcc922ad5d2e77bc74dd",
+ "138855d9bf0ccd0c62ac14c7bff4fd37", "64035142864914d05a48ef8e013631d0",
+ "205904fa3c644433b46e01c11dd2fe40", "291425aaf8206b20e88db8ebf3cf7e7f",
+ "cb6238b8eb6b72980958e6fcceb2f2eb", "626321a6dfac542d0fc70321fac13ff3",
+ "1c6fda7501e0f8bdad972f7857cd9354", "4fd485dadcb570e5a0a5addaf9ba84da",
+ "d3f140aea9e8eabf4e1e5190e0148288", "e4938219593bbed5ae638a93f2f4a580",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "238980eebc9e63ae3eea2771c7a70f12",
+ "0eac13431bd7d8a573318408a72246d5", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "73438155feb62595e3e406921102d748", "5871e0e88a776840d619670fbf107858",
+ "1c6376ce55c9ee9e35d432edb1ffb3b7", "d675e0195c9feca956e637f3f1959f40",
+ "b5681673903ade13d69e295f82fdd009", "3c43020105ae93a301404b4cd6238654",
+ "dd2c5880a94ed3758bfea0b0e8c78286", "4ebb1a7b25a39d8b9868ec8a1243103f",
+ "d34ec07845cd8523651e5f5112984a14", "2ce55308d873f4cd244f16da2b06e06e",
+ "a4bb5d5ff4b25f391265b5231049a09a", "c9106e0c820b03bcdde3aa94efc11a3e",
+ "7ec2eae9e118506da8b33440b399511a", "78de867c8ee947ed6d29055747f26949",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "e552466a4e7ff187251b8914b084d404",
+ "981b7c44b6f7b7ac2acf0cc4096e6bf4", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "a4c75372af36162831cb872e24e1088c", "497271227a70a72f9ad25b415d41563f",
+ "c48bd7e11ec44ba7b2bc8b6a04592439", "0960a9af91250e9faa1eaac32227bf6f",
+ "746c2e0f96ae2246d534d67102be068c", "d6f6db079da9b8909a153c07cc9d0e63",
+ "7c8928a0d769f4264d195f39cb68a772", "db645c96fc8be04015e0eb538afec9ae",
+ "946af3a8f5362def5f4e27cb0fd4e754", "7ad78dfe7bbedf696dd58d9ad01bcfba",
+ "f0fd9c09d454e4ce918faa97e9ac10be", "af6ae5c0eb28417bd251184baf2eaba7",
+ "866f8df540dd3b58ab1339314d139cbd", "72803589b453a29501540aeddc23e6f4",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "aba5d5ef5e96fe418e65d20e506ea834",
+ "d70bf16e2a31e90b7b3cdeaef1494cf9", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "6df80bb7f264f4f285d09a4d61533fae", "c8831118d1004a7cca015a4fca140018",
+ "b7f82c140369067c105c7967c75b6f9e", "130f47aae365aabfec4360fa5b5ff554",
+ "92483ed631de21b685ffe6ccadbbec8f", "cbb6ab31547df6b91cfb48630fdffb48",
+ "1eea5e8a24d6aa11778eb3e5e5e9c9f2", "9e193b6b28ce798c44c744efde19eee9",
+ "885c384d90aaa34acd8303958033c252", "8110ed10e7234851dff3c7e4a51108a2",
+ "6fb9383302eb7e7a13387464d2634e03", "864d51fcc737bc73a3f588b67515039a",
+ "2ecb7890f00234bcb28c1d969f489012", "c4793d431dbf2d88826bb440bf027512",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "972aeba65e8a6d20dd0f95279be2aa75",
+ "34165457282e2af2e9b3f5840e4dec5d", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "b8c5582b9bbb789c45471f93be83b41f", "257bf5467db570974d7cf2356bacf116",
+ "5255dded79f56b0078543b5a1814a668", "ef745100f5f34c8ff841b2b0b57eb33f",
+ "edae8ed67286ca6a31573a541b3deb6f", "01adcd8bf15fbf70df47fbf3a953aa14",
+ "ba539808a8501609ce052a1562a62b25", "ac8e6391200cec2abdebb00744a2ba82",
+ "54b17120f7d71ddb4d70590ecd231cc1", "f6e36446a97611a4db4425df926974b2",
+ "a82f4080699300b659bbe1b5c4463147", "ecedb178f7cad3dc1b921eca67f9efb6",
+ "0609ca0ff3ca90069e8b48829b4b0891", "839e86c681e97359f7819c766000dd1c",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+
+const char* GetConvolveScaleDigest10bpp(int id) {
+ // Entries containing 'XXXXX...' are skipped. See the test for details.
+ static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 2] = {
+ "27e21eb31687f9fbd0a66865fa8d7c8a", "9bff726c8e1d0998451a3b9cf2b3d8c8",
+ "661d74cfef36f12ed8d9b4c3ccb7fe0d", "5fc365fd1fcc9599dd97a885ba0c2eec",
+ "acdba2c82a6268e3c0ae8fc32be1b41f", "a5db60bbeaf56ab030ed21c42d553cf3",
+ "1228bb633f9fd63fdb998b775ca79e98", "07812c97f9f43a2a8ae07329dc488699",
+ "903525fb782119c4dfaf61b98a310c9f", "f38b51cef38b929e317861ccbc73ecd8",
+ "b78b05138e1d5fbf089144c42ce03058", "f2e227664cbf2d821b242a34fcbc9835",
+ "cb992dac70591e7d3663588ae13b9adc", "f2292d33657d939fa85ea5bacdfe39a3",
+ "7049dc742d6d8ad6f5d4309968ff281c", "e4beebde1ac335a4d92e4af94653a2ce",
+ "cc77875f98f54b9b26b5f7d9fcbc828d", "fb623f7b9e1ffcf2ae361599728a5589",
+ "c33847e47a7eda214734084640818df9", "ab3e1aec3d720c0c89c46a8d5b161b44",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "efe4de861dcf0f7458b6208cae7e3584",
+ "814751c55fa84f0fed94ff15fc30fc24", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "31a63fe47297102937acbe7a328588b7", "b804a0a24633243f7da48d7a5f51c0bf",
+ "cb492672b005fc378cccc8c03003cd4a", "1d18732bcf2ea487e84579489cc59a22",
+ "457c4b3ec38a8d6c210584ade1a9fae2", "a3afdd468e6a5238a3dbd2cc21c11c9e",
+ "6ff8a16f21d6e8a9741dacf0734ae563", "3ffa29ef7e54e51f6849c9a3d3c79d03",
+ "af89899b083cf269ac1bd988aeb15b15", "3365d8411c11081fb228436238b9a671",
+ "3ba56d30f5f81d7098f356635a58b9af", "b3013776900c6520bd30f868e8c963b6",
+ "81febaa7342692483040f500ba2e5e2b", "4a51ff1d9a4a68687d590b41aa7835a3",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetConvolveDigest12bpp(int id) {
+ // Entries containing 'XXXXX...' are skipped. See the test for details.
+ static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 16] = {
+ "e25031afae184cc4d186cde7e3d51e33", "6fb55dec2506dae6c229469cdf2e7d83",
+ "9df34d27f5bd040d1ed1455b151cd1ff", "7f6829458f00edb88f78851dd1a08739",
+ "a8bbe9b6b9eaf6f681d91c981b994949", "21f74980b36cb246426f4bc3fe7c08c3",
+ "403c2ccced3b5b141a4c7559c0cd841b", "1c3c4c6cd8a3e79cd95d6038531b47e5",
+ "f18d6950d36619086ac0055bab528cb1", "37d9c5babddf24fe8cb061297297b769",
+ "c111000d4021381f3d18ea0e24a1b5f5", "4e1e4f0a592ff028e35f8187627d6e68",
+ "0ca9ad4614d32da05746f8712a46d861", "8a122ab194e2fdb7089b29be50af8c86",
+ "3c21326e22a80982d1b0ffc09be4beae", "f6c8d1fe2c1fb19604c49c6a49bd26a8",
+ "d3eda9d7aa80e4ea1f18bf565b143e57", "fe21bd1cb8e90466dc727f2223ea7aed",
+ "01efe3df83c715325aaddd4d4ce130ad", "ecaa751360121d3746b72932415fb998",
+ "291e67095399651dc5c8a033390f255f", "66b26828e434faf37ddc57d3e0abb6db",
+ "e9cd69e9ba70864e3d0b175ac0a177d6", "64e4db895a843cb05384f5997b1ba978",
+ "f305161c82de999d2c93eac65f609cfe", "4762b2bd27983ad916ec0a930c0eca6b",
+ "1631495ffae43a927267ebd476015ef1", "b0f22de7b10057e07af71f9bce4615ce",
+ "6fa29dc4be1a46d246a41d66a3d35cb4", "734601c2185bdf30ba9ded8b07003a05",
+ "524e4553d92c69e7e4ed934f7b806c6b", "3709c8950bc5fcc4a2b3ec68fc78bf7e",
+ "69c274d9f8e0fd6790495e9695251f1f", "ee30cc1232c27494ef53edd383568f25",
+ "e525dbeb0b4341952a92270dcfc51730", "b3685c9e783d3402497bbd49d28c7dd7",
+ "d1c9f02dc818e6b974794dfb7749aac8", "bdb9e4961f9aa8c25568d3394e968518",
+ "f5f74555adcad85f3ebd3cb85dc7b770", "737e2a0be806dbd701014f2078be7898",
+ "20a18294e3a9422193aa0a219fd80ede", "7106648ecb9ae24a54d1dbabf2a9e318",
+ "20f39cbd6b5ed87a6ae4f818932325c0", "a99666e3157e32a07c87b01e52091a76",
+ "123e4d533d478c3089c975323c85396b", "d2a8021f7683a0cdf2658418fa90a6fc",
+ "1437e192a3349db8702d5b90eb88dbc1", "fe097fc4aeed7cda0b0f405124efb19d",
+ "1988227c51fa589db1307fd890bb5972", "537e25a6c30b240dc1e3bddd1c3a0a03",
+ "aebe5cffaae448db5a08987a3375a428", "7127ae9bdc63df4459590dc02ca95403",
+ "7ad281903a210f2b1f39f7c40c8df272", "d4b97ba21f7b400ba9f9cd8bb1a576df",
+ "0884a824203aaf72c78f73fdaf2b23a2", "63d60388605c92daee55d517de622a9e",
+ "171ec49a779de1efa69510eefbd09bba", "541cf051579c5a10b9debd3bfdcb7f32",
+ "91c14451ad93ed88e96b5d639ce408de", "3b0313ec0e043d19744bf88c90e875a1",
+ "6adcb3cee91fe3a83b36deb11c5ad6dd", "c6d4bfad24616a88222681992a99d782",
+ "515dc0f2a41730d5c434e4f3c81b02c3", "1c69abdee3b9608a6094034badc2bec0",
+ "1785a0f321d7dd90aa8846961737a767", "dd12c5b8c341f2423d0d5db4f285d199",
+ "5741fb69aae1ca8a0fbe4f1478df88ef", "a4390ceb4e4e9f5cf6a47a9b11a97015",
+ "6778eb25df902092b440c3402e7f0f80", "5ad9d6b36f8898bb55e901c1c0c523da",
+ "73969b6c03bb5a7345a8b968b542668e", "f48192947e66d70f116193a4186d0186",
+ "53f60d0e89d7d994ec6d6131fb7e75ae", "c75f6f8813839ae3cf192baa29039265",
+ "9ff0852ebbad56663250f86ac3a3bf9b", "668938580a770ea7ace8bbf7d349e89f",
+ "5b06bb0a15ac465a250d9b209f05289f", "a2128f5c8692fed7e7c1c7af22ce9f72",
+ "f80f1d7a58869ec794258c0f7df14620", "ed1e03a35924c92ed2fc9808dc3f06f3",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "35ef89c35d2e8e46feb856c554c21c9f",
+ "b98ce33a1bf4fab840b7ef261b30dbc4", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "b98ce33a1bf4fab840b7ef261b30dbc4", "42263fb359c4fdf1c7cdb4980b3e97f2",
+ "1e7071b7db3144188bdcf5d199fe5355", "1e7071b7db3144188bdcf5d199fe5355",
+ "30d367304a87bd25f0ad2ff8e4b5eb41", "4abe6dbb3198219015838dbedf07297a",
+ "4abe6dbb3198219015838dbedf07297a", "acec349a95b5bba98bb830372fa15e73",
+ "a73ad8661256ce2fdf5110425eb260b2", "a73ad8661256ce2fdf5110425eb260b2",
+ "8ff2f049d3f972867f14775188fe589b", "87f5f9a07aea75c325e6d7ff6c96c7c2",
+ "87f5f9a07aea75c325e6d7ff6c96c7c2", "325fcde7d415d7aa4929a3ea013fb9cc",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "05aa29814d5ce35389dbcf20368850da",
+ "fbb89f907a040e70953e3364dbe1feda", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "44ac511baf45032078cc0b45e41dba79", "efb98974adc58d88e122293436bb9184",
+ "7eee18c1a16bcb4e7ef7b27f68ba884f", "b0904c9b118dd9a1f9f034c0ff82d1c1",
+ "54436deb5183dd9669dd4f5feadb3850", "4db1c310b7d9a8bd3e2b5d20fa820e3b",
+ "c40abc6b2d67527f48a287cd7e157428", "48ec3fcf509805f484c8e0948c3469be",
+ "cb7d4a76fa7de52ed2fe889785327b38", "f57983346815fa41e969c195c1c03774",
+ "7dba59b0de2c877666ded6bdaefdcc30", "4837f8ba2f67f17f28a38c5e2a434c73",
+ "09e06fe9dc7ef7818f2a96895235afd4", "002976970ec62b360f956b9c091782d4",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "78673b1084367e97b8dd83990adc5219",
+ "06b5d4a30b9efb6c1d95ef7957f49e76", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "ce460146922cd53970b510d460aa4062", "6fd051938b8efcec9ece042f1edc177a",
+ "f5ff0dcfe3c1a56e3856549d8ded416b", "b69bc2cfc17c6b4313264db96831f0d1",
+ "38a5e65bd71934becfb376eb3b9bc513", "32c1163aa4ca6b6c69d950aba7b06d48",
+ "0c22a6c014c6347983de4ca863f3b53f", "a80c5ee9eb2dfb9a0d56e92eb3f85d91",
+ "a9719722a150a81175427bc161b95d7a", "8237befd456131a488cc5b8b63f4aca5",
+ "51616abcd0beea53a78ffce106b974fc", "6c47b22270f01d27b404da07e1be1202",
+ "356268298d3887edaabd0169a912c94e", "d2b00216e106cb8c5450e2eff1f8481a",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "c2de3a582c79aee811076211c497d2bc",
+ "d1b6d9c73da41def26dd4f85fbd1bde8", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "d8374eb7825081b89f74b05c66bccd63", "d5f7d68c10b5eaf0fba6f93ee26266e6",
+ "94d19cb65f29db65e6656b588f431ade", "5126e95f0249024a6f6d426714bd5b20",
+ "d7d3654b9c2dabe13239875984770acd", "6491afd5d651aab80aa179b579b74341",
+ "037a5de0de89983808f8e8f6dc39110f", "5980073b7685c5c9b2ec027e06be2cbc",
+ "0abb9d035aca426b62ca0f3fab063bab", "fe002a902bb4ec24dfe3ea0fe381a472",
+ "1ac15726df1aa2cd8855162a91893379", "0758c3ac16467605d73c725a697c3dc1",
+ "97d894d85f6ccfa4ff81e0e8fdf03da1", "c3c7b362f063a18244ea542a42d2873c",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "7f6829458f00edb88f78851dd1a08739",
+ "a8bbe9b6b9eaf6f681d91c981b994949", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "403c2ccced3b5b141a4c7559c0cd841b", "1c3c4c6cd8a3e79cd95d6038531b47e5",
+ "f18d6950d36619086ac0055bab528cb1", "37d9c5babddf24fe8cb061297297b769",
+ "c111000d4021381f3d18ea0e24a1b5f5", "4e1e4f0a592ff028e35f8187627d6e68",
+ "0ca9ad4614d32da05746f8712a46d861", "8a122ab194e2fdb7089b29be50af8c86",
+ "3c21326e22a80982d1b0ffc09be4beae", "f6c8d1fe2c1fb19604c49c6a49bd26a8",
+ "d3eda9d7aa80e4ea1f18bf565b143e57", "fe21bd1cb8e90466dc727f2223ea7aed",
+ "01efe3df83c715325aaddd4d4ce130ad", "ecaa751360121d3746b72932415fb998",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "67b2ea94cc4d0b32db3ae3c29eee4d46",
+ "bcfec99ad75988fa1efc1733204f17f2", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "79c222c5796e50119f1921e7bc534a25", "ae3f7c189458f044e9c52399d37a55e2",
+ "fd6dde45bd2937331428de9ef4f8e869", "b384d065423f3d271b85781d76a73218",
+ "466ea0680c06f59e8b3bb293608731fb", "360541ba94f42d115fe687a97a457ffb",
+ "e5a0794d37af40c40a4d2c6d3f7d2aa2", "4eed285651a75614bd60adebbe2e185c",
+ "bbdbf93942282d7b9c4f07591a1764a6", "1288a9ec3e6f79213b6745e6e7568c44",
+ "4ff1310bfd656d69ed5c108a91a9b01a", "3380806b5f67eb3ebce42f8e7c05b256",
+ "09c4bdf0f30aca6812fb55a5ac06b1bd", "722c86ba6bf21f40742ee33b4edc17c4",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "f5303c96d1630f9840eaaba058cd818b",
+ "c20cd45782b2f52c05e4189912047570", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "d6360f96fe15a3ee1e903b0a53dcaaeb", "4b18995cdf2f5d18410d3a732c5932b1",
+ "6f62bf7395c3dfccc1565ba8424f20e8", "c9987ed30491cd28bbc711dd57228247",
+ "8e277ec837cbecf529ae2eb0578fddc1", "c0c132386f23c5f0fba055a12fb79547",
+ "6b5617ab78dd0916690dfa358298b7b3", "394abedca37f60d1a5148a4c975305ed",
+ "bb88881e0e4cf2d88c2d2b38b5833f20", "bef10806be8d58ea8e97870a813b075e",
+ "b4b017d1f792bea69d3b773db7c80c7c", "0660bc63041213a8a4d74724a3bc4291",
+ "5050c8c5388a561691fd414b00c041df", "9ed40c68de6a8008a902d7224f8b620f",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "ec10ce4a674424478a401847f744251d",
+ "bdd897eafc8ef2651a7bba5e523a6ac2", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "2745de4a6b29abb85ee513e22ad362c3", "8aaad384b7cd349b4b968e657ec15523",
+ "fb6c0723432bcd2246d51a90f5fb5826", "f8104ed5921ebd48c6eed16150ffe028",
+ "85c2e236b3e32bf731601237cf0594cd", "8bd6eefff9640766cdf64ab082cb1485",
+ "78b5cd9dde6c6a5900f3040c57172091", "aaa980506bd7bb1d75924a8025698d1a",
+ "90050a411d501f7166f6741832b0c342", "d6ec88b2c38e32511f3359e1d5f9d85b",
+ "96506b8b39274c8fe687ea39761997f1", "3322ea83995c2762fb60db993b401658",
+ "151b6e4ce60392639982fca5a73ac3d3", "d52a1038e135bef233674a843f8c7cb6",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+
+const char* GetConvolveScaleDigest12bpp(int id) {
+ // Entries containing 'XXXXX...' are skipped. See the test for details.
+ static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 2] = {
+ "aea59b7a638f27acad2b90fd2b8c9fee", "be87ba981a0af25611a7d5f0970be9b3",
+ "7c81f1486cd607376d776bf2c6e81dec", "f683ba2a9b353bea35f26c1ed730f3c5",
+ "11e2d70daff1726093cb4fcae33ce0d6", "567575eac0dea2f379019b2d4bafe444",
+ "216479ed580d6e0d7c1d523015394814", "dcabbe5f5709a4b6634d77cc514e863a",
+ "4e888207fe917faeea2b44383ac16caf", "d617c5608fae3b01c507c7e88040fee3",
+ "eeac5d9b3dc005e76f13dfc7483eae48", "8ff0a82811f77303c4516bb8c761336f",
+ "95a7c315aaa208097b6ab006f1d07654", "da63527ee80e6772435cff8321a29a95",
+ "404457f72e7113d1f3797a39319fd3fe", "43cbccfe2663ec11c157319acfe629a5",
+ "1dc5b8dee4542f3d7fcf6b0fa325dfde", "16d4506674f2fcedfcd1e006eb097141",
+ "4fcf329ddb405cd6bbb0a6fb87e29eb3", "de77a781957653ea1750f79995605cdc",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "436f6fdc008d94a94bc6f516f98f402f",
+ "b436bd9036f08ba7e50cfc536911dbbd", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "720a01018856bd83f4d89a9024b14728", "b7e01a3f161007712ce342f59b2c51f2",
+ "922420ebe5dec4f19c259ebdf8a3259a", "979aaba579556207a7bbcc939123c1b2",
+ "89a30898cbaa4d64f9072173e8365864", "0586ff961f2e4228f4e38299fb25ae07",
+ "adb27a03f8b1b50fe2a52b5ca8d4fc28", "4f91cd92aab2e09f4b123251a8d2f219",
+ "620fba0fff163d96a1cd663d1af4a4c5", "bf7a0fa65b1a90ba34c834558fa2c86e",
+ "c21f7d7d16d047a27b871a7bf8476e2d", "a94b17c81f3ce2b47081bd8dd762a2e5",
+ "9078d20f59bc24862af3856acb8c0357", "ee510ce6b3d22de9e4bd7920a26fd69a",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+struct ConvolveTypeParam {
+ ConvolveTypeParam(bool is_intra_block_copy, bool is_compound,
+ bool has_vertical_filter, bool has_horizontal_filter)
+ : is_intra_block_copy(is_intra_block_copy),
+ is_compound(is_compound),
+ has_vertical_filter(has_vertical_filter),
+ has_horizontal_filter(has_horizontal_filter) {}
+ bool is_intra_block_copy;
+ bool is_compound;
+ bool has_vertical_filter;
+ bool has_horizontal_filter;
+};
+
+std::ostream& operator<<(std::ostream& os, const ConvolveTestParam& param) {
+ return os << "BlockSize" << param.width << "x" << param.height;
+}
+
+std::ostream& operator<<(std::ostream& os, const ConvolveTypeParam& param) {
+ return os << "is_intra_block_copy: " << param.is_intra_block_copy
+ << ", is_compound: " << param.is_compound
+ << ", has_(vertical/horizontal)_filter: "
+ << param.has_vertical_filter << "/" << param.has_horizontal_filter;
+}
+
+//------------------------------------------------------------------------------
+template <int bitdepth, typename Pixel>
+class ConvolveTest : public testing::TestWithParam<
+ std::tuple<ConvolveTypeParam, ConvolveTestParam>> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ ConvolveTest() = default;
+ ~ConvolveTest() override = default;
+
+ void SetUp() override {
+ ConvolveInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ GetConvolveFunc(dsp, &base_convolve_func_);
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_convolve_func_ = nullptr;
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ ConvolveInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "AVX2/")) {
+ if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+ ConvolveInit_AVX2();
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ ConvolveInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ ConvolveInit10bpp_NEON();
+#endif
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ GetConvolveFunc(dsp, &cur_convolve_func_);
+
+ // Skip functions that have not been specialized for this particular
+ // architecture.
+ if (cur_convolve_func_ == base_convolve_func_) {
+ cur_convolve_func_ = nullptr;
+ }
+ }
+
+ protected:
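+ // Digest tables hold 16 groups of kNumBlockSizes entries, one group per
+ // combination of the four type flags:
+ //   id = block_size + kNumBlockSizes * (has_horizontal_filter +
+ //        2 * has_vertical_filter + 4 * is_compound +
+ //        8 * is_intra_block_copy)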
+ int GetDigestId() const {
+ int id = param_.block_size;
+ id += param_.kNumBlockSizes *
+ static_cast<int>(type_param_.has_horizontal_filter);
+ id += 2 * param_.kNumBlockSizes *
+ static_cast<int>(type_param_.has_vertical_filter);
+ id += 4 * param_.kNumBlockSizes * static_cast<int>(type_param_.is_compound);
+ id += 8 * param_.kNumBlockSizes *
+ static_cast<int>(type_param_.is_intra_block_copy);
+ return id;
+ }
+
+ void GetConvolveFunc(const Dsp* dsp, ConvolveFunc* func);
+ void SetInputData(bool use_fixed_values, int value);
+ void Check(bool use_fixed_values, const Pixel* src, const Pixel* dest,
+ libvpx_test::MD5* md5_digest);
+ void Check16Bit(bool use_fixed_values, const uint16_t* src,
+ const uint16_t* dest, libvpx_test::MD5* md5_digest);
+ // |num_runs| covers the categories of filters (6) and the number of filters
+ // under each category (16).
+ void Test(bool use_fixed_values, int value,
+ int num_runs = kMinimumViableRuns);
+
+ const ConvolveTypeParam type_param_ = std::get<0>(GetParam());
+ const ConvolveTestParam param_ = std::get<1>(GetParam());
+
+ private:
+ ConvolveFunc base_convolve_func_;
+ ConvolveFunc cur_convolve_func_;
+ // Convolve filters are 7-tap, so they need 3 pixels of padding
+ // (kRestorationHorizontalBorder).
+ Pixel source_[kMaxBlockHeight * kMaxBlockWidth] = {};
+ uint16_t source_16bit_[kMaxBlockHeight * kMaxBlockWidth] = {};
+ uint16_t dest_16bit_[kMaxBlockHeight * kMaxBlockWidth] = {};
+ Pixel dest_clipped_[kMaxBlockHeight * kMaxBlockWidth] = {};
+
+ const int source_stride_ = kMaxBlockWidth;
+ const int source_height_ = kMaxBlockHeight;
+};
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::GetConvolveFunc(const Dsp* const dsp,
+ ConvolveFunc* func) {
+ *func =
+ dsp->convolve[type_param_.is_intra_block_copy][type_param_.is_compound]
+ [type_param_.has_vertical_filter]
+ [type_param_.has_horizontal_filter];
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::SetInputData(bool use_fixed_values,
+ int value) {
+ if (use_fixed_values) {
+ std::fill(source_, source_ + source_height_ * source_stride_, value);
+ } else {
+ const int offset =
+ kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop;
+ const int mask = (1 << bitdepth) - 1;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ const int height = param_.height;
+ const int width = param_.width;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ source_[y * source_stride_ + x + offset] = rnd.Rand16() & mask;
+ }
+ }
+ // Copy border pixels to the left and right borders.
+ for (int y = 0; y < height; ++y) {
+ Memset(&source_[(y + kConvolveBorderLeftTop) * source_stride_],
+ source_[y * source_stride_ + offset], kConvolveBorderLeftTop);
+ Memset(&source_[y * source_stride_ + offset + width],
+ source_[y * source_stride_ + offset + width - 1],
+ kConvolveBorderLeftTop);
+ }
+ // Copy border pixels to the top and bottom borders.
+ for (int y = 0; y < kConvolveBorderLeftTop; ++y) {
+ memcpy(&source_[y * source_stride_],
+ &source_[kConvolveBorderLeftTop * source_stride_],
+ source_stride_ * sizeof(Pixel));
+ memcpy(&source_[(y + kConvolveBorderLeftTop + height) * source_stride_],
+ &source_[(kConvolveBorderLeftTop + height - 1) * source_stride_],
+ source_stride_ * sizeof(Pixel));
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::Check(bool use_fixed_values,
+ const Pixel* src, const Pixel* dest,
+ libvpx_test::MD5* md5_digest) {
+ if (use_fixed_values) {
+ // For fixed values, input and output are identical.
+ const bool success =
+ test_utils::CompareBlocks(src, dest, param_.width, param_.height,
+ kMaxBlockWidth, kMaxBlockWidth, false, false);
+ EXPECT_TRUE(success);
+ } else {
+ // For random input, compare md5.
+ const int offset =
+ kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+ const size_t size = sizeof(dest_clipped_) - offset * sizeof(Pixel);
+ md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::Check16Bit(bool use_fixed_values,
+ const uint16_t* src,
+ const uint16_t* dest,
+ libvpx_test::MD5* md5_digest) {
+ if (use_fixed_values) {
+ // For fixed values, input and output are identical.
+ const bool success =
+ test_utils::CompareBlocks(src, dest, param_.width, param_.height,
+ kMaxBlockWidth, kMaxBlockWidth, false);
+ EXPECT_TRUE(success);
+ } else {
+ // For random input, compare md5.
+ const int offset =
+ kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+ const size_t size = sizeof(dest_16bit_) - offset * sizeof(uint16_t);
+ md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::Test(
+ bool use_fixed_values, int value, int num_runs /*= kMinimumViableRuns*/) {
+ // Testing fixed input is not meaningful for compound convolve.
+ if (type_param_.is_compound && use_fixed_values) return;
+
+ // There should not be any function set for this combination.
+ if (type_param_.is_intra_block_copy && type_param_.is_compound) {
+ ASSERT_EQ(cur_convolve_func_, nullptr);
+ return;
+ }
+
+ // Compound and intra block copy functions are only used for blocks 4x4 or
+ // greater.
+ if (type_param_.is_compound || type_param_.is_intra_block_copy) {
+ if (param_.width < 4 || param_.height < 4) {
+ GTEST_SKIP();
+ }
+ }
+
+ // Skip unspecialized functions.
+ if (cur_convolve_func_ == nullptr) {
+ GTEST_SKIP();
+ }
+
+ SetInputData(use_fixed_values, value);
+ int subpixel_x = 0;
+ int subpixel_y = 0;
+ int vertical_index = 0;
+ int horizontal_index = 0;
+ const int offset =
+ kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+ const Pixel* const src = source_ + offset;
+ const ptrdiff_t src_stride = source_stride_ * sizeof(Pixel);
+ const ptrdiff_t src_stride_16 = source_stride_;
+ const ptrdiff_t dst_stride = kMaxBlockWidth * sizeof(Pixel);
+ // Pack Compound output since we control the predictor buffer.
+ const ptrdiff_t dst_stride_compound = param_.width;
+
+ // Output is always 16 bits regardless of |bitdepth|.
+ uint16_t* dst_16 = dest_16bit_ + offset;
+ // Output depends on |bitdepth|.
+ Pixel* dst_pixel = dest_clipped_ + offset;
+
+ // Collect the first |kMinimumViableRuns| into one md5 buffer.
+ libvpx_test::MD5 md5_digest;
+
+ absl::Duration elapsed_time;
+ for (int i = 0; i < num_runs; ++i) {
+ // Test every filter.
+ // Because of masking, |subpixel_{x,y}| values roll over every 16 iterations.
+ subpixel_x += 1 << 6;
+ subpixel_y += 1 << 6;
+
+ const int horizontal_filter_id = (subpixel_x >> 6) & 0xF;
+ const int vertical_filter_id = (subpixel_y >> 6) & 0xF;
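+ // On iteration |i| both filter ids equal (i + 1) & 0xF, so ids 1..15 and 0
+ // are each visited in turn.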
+
+ // |filter_id| == 0 (copy) must be handled by the appropriate 1D or copy
+ // function.
+ if (horizontal_filter_id == 0 || vertical_filter_id == 0) {
+ continue;
+ }
+
+ // For focused speed testing these can be set to the desired filter. Want
+ // only 8-tap filters? Set |{vertical,horizontal}_index| to 2.
+ vertical_index += static_cast<int>(i % 16 == 0);
+ vertical_index %= 4;
+ horizontal_index += static_cast<int>(i % 16 == 0);
+ horizontal_index %= 4;
+
+ if (type_param_.is_compound) {
+ // Output type is uint16_t.
+ const absl::Time start = absl::Now();
+ cur_convolve_func_(src, src_stride, horizontal_index, vertical_index,
+ horizontal_filter_id, vertical_filter_id, param_.width,
+ param_.height, dst_16, dst_stride_compound);
+ elapsed_time += absl::Now() - start;
+ } else {
+ // Output type is Pixel.
+ const absl::Time start = absl::Now();
+ cur_convolve_func_(src, src_stride, horizontal_index, vertical_index,
+ horizontal_filter_id, vertical_filter_id, param_.width,
+ param_.height, dst_pixel, dst_stride);
+ elapsed_time += absl::Now() - start;
+ }
+
+ // Only check the output for the first set. After that it's just repeated
+ // runs for speed timing.
+ if (i >= kMinimumViableRuns) continue;
+
+ if (type_param_.is_compound) {
+ // Need to copy source to a uint16_t buffer for comparison.
+ Pixel* src_ptr = source_;
+ uint16_t* src_ptr_16 = source_16bit_;
+ for (int y = 0; y < kMaxBlockHeight; ++y) {
+ for (int x = 0; x < kMaxBlockWidth; ++x) {
+ src_ptr_16[x] = src_ptr[x];
+ }
+ src_ptr += src_stride_16;
+ src_ptr_16 += src_stride_16;
+ }
+
+ Check16Bit(use_fixed_values, source_16bit_ + offset, dst_16, &md5_digest);
+ } else {
+ Check(use_fixed_values, src, dst_pixel, &md5_digest);
+ }
+ }
+
+ if (!use_fixed_values) {
+ // md5 sums are only calculated for random input.
+ const char* ref_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ ref_digest = GetConvolveDigest8bpp(GetDigestId());
+ break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+ ref_digest = GetConvolveDigest10bpp(GetDigestId());
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ ref_digest = GetConvolveDigest12bpp(GetDigestId());
+ break;
+#endif
+ }
+ ASSERT_NE(ref_digest, nullptr);
+
+ const char* direction;
+ if (type_param_.has_vertical_filter && type_param_.has_horizontal_filter) {
+ direction = "2D";
+ } else if (type_param_.has_vertical_filter) {
+ direction = "Vertical";
+ } else if (type_param_.has_horizontal_filter) {
+ direction = "Horizontal";
+ } else {
+ direction = "Copy";
+ }
+ const auto elapsed_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time));
+ printf("Mode Convolve%s%s%s[%25s]: %5d us MD5: %s\n",
+ type_param_.is_compound ? "Compound" : "",
+ type_param_.is_intra_block_copy ? "IntraBlockCopy" : "", direction,
+ absl::StrFormat("%dx%d", param_.width, param_.height).c_str(),
+ elapsed_time_us, md5_digest.Get());
+ EXPECT_STREQ(ref_digest, md5_digest.Get());
+ }
+}
+
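+// Computes the worst-case output range of a |kSubPixelTaps|-tap filter for
+// input in [min_input, max_input]: for the maximum, positive taps multiply
+// max_input and negative taps multiply min_input; the minimum mirrors this.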
+void ApplyFilterToSignedInput(const int min_input, const int max_input,
+ const int8_t filter[kSubPixelTaps],
+ int* min_output, int* max_output) {
+ int min = 0, max = 0;
+ for (int i = 0; i < kSubPixelTaps; ++i) {
+ const int tap = filter[i];
+ if (tap > 0) {
+ max += max_input * tap;
+ min += min_input * tap;
+ } else {
+ min += max_input * tap;
+ max += min_input * tap;
+ }
+ }
+ *min_output = min;
+ *max_output = max;
+}
+
+void ApplyFilterToUnsignedInput(const int max_input,
+ const int8_t filter[kSubPixelTaps],
+ int* min_output, int* max_output) {
+ ApplyFilterToSignedInput(0, max_input, filter, min_output, max_output);
+}
+
+// Validate the maximum ranges for different parts of the Convolve process.
+template <int bitdepth>
+void ShowRange() {
+ // Subtract one from the shift bits because the filter is pre-shifted by 1.
+ constexpr int horizontal_bits = (bitdepth == kBitdepth12)
+ ? kInterRoundBitsHorizontal12bpp - 1
+ : kInterRoundBitsHorizontal - 1;
+ constexpr int vertical_bits = (bitdepth == kBitdepth12)
+ ? kInterRoundBitsVertical12bpp - 1
+ : kInterRoundBitsVertical - 1;
+ constexpr int compound_vertical_bits = kInterRoundBitsCompoundVertical - 1;
+
+ constexpr int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset;
+
+ constexpr int max_input = (1 << bitdepth) - 1;
+
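+  // |kHalfSubPixelFilters[2][8]| is the 8-tap sharp filter at the half-pel
+  // position, which produces the widest intermediate range of the sub-pixel
+  // filters.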
+ const int8_t* worst_convolve_filter = kHalfSubPixelFilters[2][8];
+
+ // First pass.
+ printf("Bitdepth: %2d Input range: [%8d, %8d]\n", bitdepth, 0,
+ max_input);
+
+ int min, max;
+ ApplyFilterToUnsignedInput(max_input, worst_convolve_filter, &min, &max);
+
+ if (bitdepth == 8) {
+ // 8bpp can use int16_t for sums.
+ assert(min > INT16_MIN);
+ assert(max < INT16_MAX);
+ } else {
+ // 10bpp and 12bpp require int32_t.
+ assert(min > INT32_MIN);
+ assert(max > INT16_MAX && max < INT32_MAX);
+ }
+
+ printf(" Horizontal upscaled range: [%8d, %8d]\n", min, max);
+
+ const int first_pass_min = RightShiftWithRounding(min, horizontal_bits);
+ const int first_pass_max = RightShiftWithRounding(max, horizontal_bits);
+
+ // All bitdepths can use int16_t for first pass output.
+ assert(first_pass_min > INT16_MIN);
+ assert(first_pass_max < INT16_MAX);
+
+ printf(" Horizontal downscaled range: [%8d, %8d]\n", first_pass_min,
+ first_pass_max);
+
+ // Second pass.
+ ApplyFilterToSignedInput(first_pass_min, first_pass_max,
+ worst_convolve_filter, &min, &max);
+
+ // All bitdepths require int32_t for second pass sums.
+ assert(min < INT16_MIN && min > INT32_MIN);
+ assert(max > INT16_MAX && max < INT32_MAX);
+
+ printf(" Vertical upscaled range: [%8d, %8d]\n", min, max);
+
+ // Second pass non-compound output is clipped to Pixel values.
+ const int second_pass_min =
+ Clip3(RightShiftWithRounding(min, vertical_bits), 0, max_input);
+ const int second_pass_max =
+ Clip3(RightShiftWithRounding(max, vertical_bits), 0, max_input);
+ printf(" Pixel output range: [%8d, %8d]\n", second_pass_min,
+ second_pass_max);
+
+ // Output is Pixel so matches Pixel values.
+ assert(second_pass_min == 0);
+ assert(second_pass_max == max_input);
+
+ const int compound_second_pass_min =
+ RightShiftWithRounding(min, compound_vertical_bits) + compound_offset;
+ const int compound_second_pass_max =
+ RightShiftWithRounding(max, compound_vertical_bits) + compound_offset;
+
+ printf(" Compound output range: [%8d, %8d]\n",
+ compound_second_pass_min, compound_second_pass_max);
+
+ if (bitdepth == 8) {
+ // 8bpp output is int16_t without an offset.
+ assert(compound_second_pass_min > INT16_MIN);
+ assert(compound_second_pass_max < INT16_MAX);
+ } else {
+ // 10bpp and 12bpp use the offset to fit inside uint16_t.
+ assert(compound_second_pass_min > 0);
+ assert(compound_second_pass_max < UINT16_MAX);
+ }
+
+ printf("\n");
+}
+
+TEST(ConvolveTest, ShowRange) {
+ ShowRange<kBitdepth8>();
+ ShowRange<kBitdepth10>();
+ ShowRange<kBitdepth12>();
+}
+
+using ConvolveTest8bpp = ConvolveTest<8, uint8_t>;
+
+TEST_P(ConvolveTest8bpp, FixedValues) {
+ Test(true, 0);
+ Test(true, 1);
+ Test(true, 128);
+ Test(true, 255);
+}
+
+TEST_P(ConvolveTest8bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveTest8bpp, DISABLED_Speed) {
+ const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+ Test(false, 0, num_runs);
+}
+
+//------------------------------------------------------------------------------
+template <int bitdepth, typename Pixel>
+class ConvolveScaleTest
+ : public testing::TestWithParam<
+ std::tuple<bool /*is_compound*/, ConvolveTestParam>> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ ConvolveScaleTest() = default;
+ ~ConvolveScaleTest() override = default;
+
+ void SetUp() override {
+ ConvolveInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_convolve_scale_func_ = dsp->convolve_scale[is_compound_];
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_convolve_scale_func_ = nullptr;
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ ConvolveInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "AVX2/")) {
+ if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+ ConvolveInit_AVX2();
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ ConvolveInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ ConvolveInit10bpp_NEON();
+#endif
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ cur_convolve_scale_func_ = dsp->convolve_scale[is_compound_];
+
+ // Skip functions that have not been specialized for this particular
+ // architecture.
+ if (cur_convolve_scale_func_ == base_convolve_scale_func_) {
+ cur_convolve_scale_func_ = nullptr;
+ }
+ }
+
+ protected:
+ int GetDigestId() const {
+ return param_.block_size +
+ param_.kNumBlockSizes * static_cast<int>(is_compound_);
+ }
+
+ void SetInputData(bool use_fixed_values, int value);
+ void Check(bool use_fixed_values, const Pixel* src, const Pixel* dest,
+ libvpx_test::MD5* md5_digest);
+ void Check16Bit(bool use_fixed_values, const uint16_t* src,
+ const uint16_t* dest, libvpx_test::MD5* md5_digest);
+ // |num_runs| covers the categories of filters (6) and the number of filters
+ // under each category (16).
+ void Test(bool use_fixed_values, int value,
+ int num_runs = kMinimumViableRuns);
+
+ const bool is_compound_ = std::get<0>(GetParam());
+ const ConvolveTestParam param_ = std::get<1>(GetParam());
+
+ private:
+ ConvolveScaleFunc base_convolve_scale_func_;
+ ConvolveScaleFunc cur_convolve_scale_func_;
+ // Convolve filters are 7-tap, so they need 3 pixels of padding
+ // (kRestorationHorizontalBorder).
+ // The source can be up to 2 times the max width/height.
+ Pixel source_[kMaxBlockHeight * kMaxBlockWidth * 4] = {};
+ uint16_t source_16bit_[kMaxBlockHeight * kMaxBlockWidth * 4] = {};
+ uint16_t dest_16bit_[kMaxBlockHeight * kMaxBlockWidth] = {};
+ Pixel dest_clipped_[kMaxBlockHeight * kMaxBlockWidth] = {};
+
+ const int source_stride_ = kMaxBlockWidth * 2;
+ const int source_height_ = kMaxBlockHeight * 2;
+};
+
+template <int bitdepth, typename Pixel>
+void ConvolveScaleTest<bitdepth, Pixel>::SetInputData(bool use_fixed_values,
+ int value) {
+ if (use_fixed_values) {
+ std::fill(source_, source_ + source_height_ * source_stride_, value);
+ } else {
+ const int offset =
+ kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop;
+ const int mask = (1 << bitdepth) - 1;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ const int height = param_.height * 2;
+ const int width = param_.width * 2;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ source_[y * source_stride_ + x + offset] = rnd.Rand16() & mask;
+ }
+ }
+ // Copy border pixels to the left and right borders.
+ for (int y = 0; y < height; ++y) {
+ Memset(&source_[(y + kConvolveBorderLeftTop) * source_stride_],
+ source_[y * source_stride_ + offset], kConvolveBorderLeftTop);
+ Memset(&source_[y * source_stride_ + offset + width],
+ source_[y * source_stride_ + offset + width - 1],
+ kConvolveBorderLeftTop);
+ }
+ // Copy border pixels to the top and bottom borders.
+ for (int y = 0; y < kConvolveBorderLeftTop; ++y) {
+ memcpy(&source_[y * source_stride_],
+ &source_[kConvolveBorderLeftTop * source_stride_],
+ source_stride_ * sizeof(Pixel));
+ memcpy(&source_[(y + kConvolveBorderLeftTop + height) * source_stride_],
+ &source_[(kConvolveBorderLeftTop + height - 1) * source_stride_],
+ source_stride_ * sizeof(Pixel));
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveScaleTest<bitdepth, Pixel>::Check(bool use_fixed_values,
+ const Pixel* src,
+ const Pixel* dest,
+ libvpx_test::MD5* md5_digest) {
+ if (use_fixed_values) {
+ // For fixed values, input and output are identical.
+ const bool success =
+ test_utils::CompareBlocks(src, dest, param_.width, param_.height,
+ kMaxBlockWidth, kMaxBlockWidth, false, false);
+ EXPECT_TRUE(success);
+ } else {
+ // For random input, compare md5.
+ const int offset =
+ kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+ const size_t size = sizeof(dest_clipped_) - offset * sizeof(Pixel);
+ md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveScaleTest<bitdepth, Pixel>::Check16Bit(
+ bool use_fixed_values, const uint16_t* src, const uint16_t* dest,
+ libvpx_test::MD5* md5_digest) {
+ if (use_fixed_values) {
+ // For fixed values, input and output are identical.
+ const bool success =
+ test_utils::CompareBlocks(src, dest, param_.width, param_.height,
+ kMaxBlockWidth, kMaxBlockWidth, false);
+ EXPECT_TRUE(success);
+ } else {
+ // For random input, compare md5.
+ const int offset =
+ kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+ const size_t size = sizeof(dest_16bit_) - offset * sizeof(uint16_t);
+ md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveScaleTest<bitdepth, Pixel>::Test(
+ bool use_fixed_values, int value, int num_runs /*= kMinimumViableRuns*/) {
+ // Testing fixed input is not meaningful for compound convolve.
+ if (is_compound_ && use_fixed_values) return;
+
+ // The compound function is only used for blocks 4x4 or greater.
+ if (is_compound_) {
+ if (param_.width < 4 || param_.height < 4) {
+ GTEST_SKIP();
+ }
+ }
+
+ // Skip unspecialized functions.
+ if (cur_convolve_scale_func_ == nullptr) {
+ GTEST_SKIP();
+ }
+
+ SetInputData(use_fixed_values, value);
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() +
+ GetDigestId());
+ // [1,2048] for |step_[xy]|. This covers a scaling range of 1/1024 to 2x.
+ const int step_x = (rnd.Rand16() & ((1 << 11) - 1)) + 1;
+ const int step_y = (rnd.Rand16() & ((1 << 11) - 1)) + 1;
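+ // A worked reading of the units: |step_[xy]| is in 1/1024ths of a pixel,
+ // so step == 1024 advances one source pixel per output pixel (1:1),
+ // step == 512 gives 2x upscaling and step == 2048 gives 2x downscaling.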
+ int subpixel_x = 0;
+ int subpixel_y = 0;
+ int vertical_index = 0;
+ int horizontal_index = 0;
+ const int offset =
+ kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+ const int offset_scale =
+ kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop;
+ const Pixel* const src_scale = source_ + offset_scale;
+ const ptrdiff_t src_stride = source_stride_ * sizeof(Pixel);
+ const ptrdiff_t dst_stride = kMaxBlockWidth * sizeof(Pixel);
+ // Pack Compound output since we control the predictor buffer.
+ const ptrdiff_t dst_stride_compound = param_.width;
+
+ // Output is always 16 bits regardless of |bitdepth|.
+ uint16_t* dst_16 = dest_16bit_ + offset;
+ // Output depends on |bitdepth|.
+ Pixel* dst_pixel = dest_clipped_ + offset;
+
+ // Collect the first |kMinimumViableRuns| into one md5 buffer.
+ libvpx_test::MD5 md5_digest;
+
+ absl::Duration elapsed_time;
+ for (int i = 0; i < num_runs; ++i) {
+ // Test every filter.
+ // Because of masking, |subpixel_{x,y}| values roll over every 16 iterations.
+ subpixel_x += 1 << 6;
+ subpixel_y += 1 << 6;
+
+ const int horizontal_filter_id = (subpixel_x >> 6) & 0xF;
+ const int vertical_filter_id = (subpixel_y >> 6) & 0xF;
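+ // With the 1 << 6 increments above, the filter ids cycle through
+ // 1, 2, ..., 15, 0 and repeat every 16 iterations; the 0 (copy) case is
+ // skipped just below.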
+
+ // |filter_id| == 0 (copy) must be handled by the appropriate 1D or copy
+ // function.
+ if (horizontal_filter_id == 0 || vertical_filter_id == 0) {
+ continue;
+ }
+
+ // For focused speed testing these can be set to the desired filter. Want
+ // only 8 tap filters? Set |{vertical,horizontal}_index| to 2.
+ vertical_index += static_cast<int>(i % 16 == 0);
+ vertical_index %= 4;
+ horizontal_index += static_cast<int>(i % 16 == 0);
+ horizontal_index %= 4;
+
+ // Output type is uint16_t.
+ const absl::Time start = absl::Now();
+ if (is_compound_) {
+ cur_convolve_scale_func_(
+ source_, src_stride, horizontal_index, vertical_index, 0, 0, step_x,
+ step_y, param_.width, param_.height, dst_16, dst_stride_compound);
+ } else {
+ cur_convolve_scale_func_(
+ source_, src_stride, horizontal_index, vertical_index, 0, 0, step_x,
+ step_y, param_.width, param_.height, dst_pixel, dst_stride);
+ }
+ elapsed_time += absl::Now() - start;
+
+ // Only check the output for the first set. After that it's just repeated
+ // runs for speed timing.
+ if (i >= kMinimumViableRuns) continue;
+
+ // The convolve function does not clip its output; clipping is applied
+ // later in the pipeline. libaom, however, clips inside the convolve
+ // function, so apply clipping here so the tests match libaom.
+ if (is_compound_) {
+ const int single_round_offset = (1 << bitdepth) + (1 << (bitdepth - 1));
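+ // E.g. 256 + 128 = 384 for bitdepth 8 and 1024 + 512 = 1536 for
+ // bitdepth 10.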
+ Pixel* dest_row = dest_clipped_;
+ for (int y = 0; y < kMaxBlockHeight; ++y) {
+ for (int x = 0; x < kMaxBlockWidth; ++x) {
+ dest_row[x] = static_cast<Pixel>(Clip3(
+ dest_16bit_[y * dst_stride_compound + x] - single_round_offset, 0,
+ (1 << bitdepth) - 1));
+ }
+ dest_row += kMaxBlockWidth;
+ }
+ }
+
+ if (is_compound_) {
+ Check16Bit(use_fixed_values, source_16bit_ + offset_scale, dst_16,
+ &md5_digest);
+ } else {
+ Check(use_fixed_values, src_scale, dst_pixel, &md5_digest);
+ }
+ }
+
+ if (!use_fixed_values) {
+ // md5 sums are only calculated for random input.
+ const char* ref_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ ref_digest = GetConvolveScaleDigest8bpp(GetDigestId());
+ break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+ ref_digest = GetConvolveScaleDigest10bpp(GetDigestId());
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ ref_digest = GetConvolveScaleDigest12bpp(GetDigestId());
+ break;
+#endif
+ }
+ ASSERT_NE(ref_digest, nullptr);
+
+ const auto elapsed_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time));
+ printf("Mode Convolve%sScale2D[%25s]: %5d us MD5: %s\n",
+ is_compound_ ? "Compound" : "",
+ absl::StrFormat("%dx%d", param_.width, param_.height).c_str(),
+ elapsed_time_us, md5_digest.Get());
+ EXPECT_STREQ(ref_digest, md5_digest.Get());
+ }
+}
+
+using ConvolveScaleTest8bpp = ConvolveScaleTest<8, uint8_t>;
+
+TEST_P(ConvolveScaleTest8bpp, FixedValues) {
+ Test(true, 0);
+ Test(true, 1);
+ Test(true, 128);
+ Test(true, 255);
+}
+
+TEST_P(ConvolveScaleTest8bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveScaleTest8bpp, DISABLED_Speed) {
+ const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+ Test(false, 0, num_runs);
+}
+
+//------------------------------------------------------------------------------
+const ConvolveTestParam kConvolveParam[] = {
+ ConvolveTestParam(ConvolveTestParam::kBlockSize2x2),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize2x4),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize4x2),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize4x4),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize4x8),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize8x2),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize8x4),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize8x8),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize8x16),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize16x8),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize16x16),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize16x32),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize32x16),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize32x32),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize32x64),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize64x32),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize64x64),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize64x128),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize128x64),
+ ConvolveTestParam(ConvolveTestParam::kBlockSize128x128),
+};
+
+const ConvolveTypeParam kConvolveTypeParam[] = {
+ ConvolveTypeParam(false, false, false, false),
+ ConvolveTypeParam(false, false, false, true),
+ ConvolveTypeParam(false, false, true, false),
+ ConvolveTypeParam(false, false, true, true),
+ ConvolveTypeParam(false, true, false, false),
+ ConvolveTypeParam(false, true, false, true),
+ ConvolveTypeParam(false, true, true, false),
+ ConvolveTypeParam(false, true, true, true),
+ ConvolveTypeParam(true, false, false, false),
+ ConvolveTypeParam(true, false, false, true),
+ ConvolveTypeParam(true, false, true, false),
+ ConvolveTypeParam(true, false, true, true),
+ // This entry is kept to ensure no function exists for |intra_block_copy|
+ // when |is_compound| is true; not all combinations are necessary.
+ ConvolveTypeParam(true, true, false, false),
+};
+
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest8bpp,
+ testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+ testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(C, ConvolveScaleTest8bpp,
+ testing::Combine(testing::Bool(),
+ testing::ValuesIn(kConvolveParam)));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest8bpp,
+ testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+ testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveScaleTest8bpp,
+ testing::Combine(testing::Bool(),
+ testing::ValuesIn(kConvolveParam)));
+#endif // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, ConvolveTest8bpp,
+ testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+ testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(SSE41, ConvolveScaleTest8bpp,
+ testing::Combine(testing::Bool(),
+ testing::ValuesIn(kConvolveParam)));
+#endif // LIBGAV1_ENABLE_SSE4_1
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest8bpp,
+ testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+ testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveScaleTest8bpp,
+ testing::Combine(testing::Bool(),
+ testing::ValuesIn(kConvolveParam)));
+#endif // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using ConvolveTest10bpp = ConvolveTest<10, uint16_t>;
+
+TEST_P(ConvolveTest10bpp, FixedValues) {
+ Test(true, 0);
+ Test(true, 1);
+ Test(true, 128);
+ Test(true, (1 << 10) - 1);
+}
+
+TEST_P(ConvolveTest10bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveTest10bpp, DISABLED_Speed) {
+ const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+ Test(false, 0, num_runs);
+}
+
+using ConvolveScaleTest10bpp = ConvolveScaleTest<10, uint16_t>;
+
+TEST_P(ConvolveScaleTest10bpp, FixedValues) {
+ Test(true, 0);
+ Test(true, 1);
+ Test(true, 128);
+ Test(true, (1 << 10) - 1);
+}
+
+TEST_P(ConvolveScaleTest10bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveScaleTest10bpp, DISABLED_Speed) {
+ const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+ Test(false, 0, num_runs);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest10bpp,
+ testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+ testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(C, ConvolveScaleTest10bpp,
+ testing::Combine(testing::Bool(),
+ testing::ValuesIn(kConvolveParam)));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest10bpp,
+ testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+ testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveScaleTest10bpp,
+ testing::Combine(testing::Bool(),
+ testing::ValuesIn(kConvolveParam)));
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ConvolveTest12bpp = ConvolveTest<12, uint16_t>;
+
+TEST_P(ConvolveTest12bpp, FixedValues) {
+ Test(true, 0);
+ Test(true, 1);
+ Test(true, 128);
+ Test(true, (1 << 12) - 1);
+}
+
+TEST_P(ConvolveTest12bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveTest12bpp, DISABLED_Speed) {
+ const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+ Test(false, 0, num_runs);
+}
+
+using ConvolveScaleTest12bpp = ConvolveScaleTest<12, uint16_t>;
+
+TEST_P(ConvolveScaleTest12bpp, FixedValues) {
+ Test(true, 0);
+ Test(true, 1);
+ Test(true, 128);
+ Test(true, (1 << 12) - 1);
+}
+
+TEST_P(ConvolveScaleTest12bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveScaleTest12bpp, DISABLED_Speed) {
+ const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+ Test(false, 0, num_runs);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest12bpp,
+ testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+ testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(C, ConvolveScaleTest12bpp,
+ testing::Combine(testing::Bool(),
+ testing::ValuesIn(kConvolveParam)));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void DistanceWeightedBlend_C(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const uint8_t weight_0, const uint8_t weight_1,
+ const int width, const int height,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ // 7.11.3.2 Rounding variables derivation process
+ // 2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
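+ // For 8bpp/10bpp this is 2 * 7 - (3 + 7) = 4; for 12bpp it is
+ // 2 * 7 - (5 + 7) = 2.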
+ constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+ const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+ auto* dst = static_cast<Pixel*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ // See warp.cc and convolve.cc for detailed prediction ranges.
+ // weight_0 + weight_1 = 16.
+ int res = pred_0[x] * weight_0 + pred_1[x] * weight_1;
+ res -= (bitdepth == 8) ? 0 : kCompoundOffset * 16;
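+ // The rounding shift below also divides out the weight sum:
+ // weight_0 + weight_1 == 16 == 1 << 4, hence the extra 4 bits.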
+ dst[x] = static_cast<Pixel>(
+ Clip3(RightShiftWithRounding(res, inter_post_round_bits + 4), 0,
+ (1 << bitdepth) - 1));
+ } while (++x < width);
+
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (++y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_DistanceWeightedBlend
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_DistanceWeightedBlend
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<12, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+void DistanceWeightedBlendInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
+#define LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/distance_weighted_blend_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important, as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/distance_weighted_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::distance_weighted_blend. This function is not thread-safe.
+void DistanceWeightedBlendInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+
+#include <cassert>
+#include <cstdint>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 500000;
+
+constexpr int kQuantizedDistanceLookup[4][2] = {
+ {9, 7}, {11, 5}, {12, 4}, {13, 3}};
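+// Each row sums to 16, matching the weight_0 + weight_1 == 16 assumption in
+// the blend implementations.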
+
+template <int bitdepth, typename Pixel>
+class DistanceWeightedBlendTest : public testing::TestWithParam<BlockSize>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ DistanceWeightedBlendTest() = default;
+ ~DistanceWeightedBlendTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ DistanceWeightedBlendInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_func_ = dsp->distance_weighted_blend;
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_func_ = nullptr;
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ DistanceWeightedBlendInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ DistanceWeightedBlendInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ func_ = dsp->distance_weighted_blend;
+ }
+
+ protected:
+ void Test(const char* digest, int num_tests);
+
+ private:
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ static constexpr int kDestStride = kMaxSuperBlockSizeInPixels;
+ const int width_ = kBlockWidthPixels[GetParam()];
+ const int height_ = kBlockHeightPixels[GetParam()];
+ alignas(kMaxAlignment) PredType
+ source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+ alignas(kMaxAlignment) PredType
+ source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+ Pixel dest_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {};
+ Pixel reference_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] =
+ {};
+ dsp::DistanceWeightedBlendFunc base_func_;
+ dsp::DistanceWeightedBlendFunc func_;
+};
+
+template <int bitdepth, typename Pixel>
+void DistanceWeightedBlendTest<bitdepth, Pixel>::Test(const char* digest,
+ int num_tests) {
+ if (func_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ PredType* src_1 = source1_;
+ PredType* src_2 = source2_;
+
+ const int index = rnd.Rand8() & 3;
+ const uint8_t weight_0 = kQuantizedDistanceLookup[index][0];
+ const uint8_t weight_1 = kQuantizedDistanceLookup[index][1];
+ // In libgav1, predictors have an offset which is later subtracted and
+ // clipped in distance weighted blending. Therefore we add the offset
+ // here to match libaom's implementation.
+ for (int y = 0; y < height_; ++y) {
+ for (int x = 0; x < width_; ++x) {
+ // distance_weighted_blend is applied to compound prediction values,
+ // whose range far exceeds that of pixel values. The ranges include
+ // kCompoundOffset in 10bpp and 12bpp.
+ // See src/dsp/convolve.cc and src/dsp/warp.cc.
+ static constexpr int kCompoundPredictionRange[3][2] = {
+ // 8bpp
+ {-5132, 9212},
+ // 10bpp
+ {3988, 61532},
+ // 12bpp
+ {3974, 61559},
+ };
+ constexpr int bitdepth_index = (bitdepth - 8) >> 1;
+ const int min_val = kCompoundPredictionRange[bitdepth_index][0];
+ const int max_val = kCompoundPredictionRange[bitdepth_index][1];
+ src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ }
+ src_1 += width_;
+ src_2 += width_;
+ }
+ absl::Duration elapsed_time;
+ for (int i = 0; i < num_tests; ++i) {
+ const absl::Time start = absl::Now();
+ func_(source1_, source2_, weight_0, weight_1, width_, height_, dest_,
+ sizeof(Pixel) * kDestStride);
+ elapsed_time += absl::Now() - start;
+ }
+
+ test_utils::CheckMd5Digest("DistanceWeightedBlend", ToString(GetParam()),
+ digest, dest_, sizeof(dest_), elapsed_time);
+}
+
+const BlockSize kTestParam[] = {
+ kBlock4x4, kBlock4x8, kBlock4x16, kBlock8x4, kBlock8x8,
+ kBlock8x16, kBlock8x32, kBlock16x4, kBlock16x8, kBlock16x16,
+ kBlock16x32, kBlock16x64, kBlock32x8, kBlock32x16, kBlock32x32,
+ kBlock32x64, kBlock64x16, kBlock64x32, kBlock64x64, kBlock64x128,
+ kBlock128x64, kBlock128x128,
+};
+
+const char* GetDistanceWeightedBlendDigest8bpp(const BlockSize block_size) {
+ static const char* const kDigests[kMaxBlockSizes] = {
+ // 4xN
+ "ebf389f724f8ab46a2cac895e4e073ca",
+ "09acd567b6b12c8cf8eb51d8b86eb4bf",
+ "57bb4d65695d8ec6752f2bd8686b64fd",
+ // 8xN
+ "270905ac76f9a2cba8a552eb0bf7c8c1",
+ "f0801c8574d2c271ef2bbea77a1d7352",
+ "e761b580e3312be33a227492a233ce72",
+ "ff214dab1a7e98e2285961d6421720c6",
+ // 16xN
+ "4f712609a36e817f9752326d58562ff8",
+ "14243f5c5f7c7104160c1f2cef0a0fbc",
+ "3ac3f3161b7c8dd8436b02abfdde104a",
+ "81a00b704e0e41a5dbe6436ac70c098d",
+ "af8fd02017c7acdff788be742d700baa",
+ // 32xN
+ "ee34332c66a6d6ed8ce64031aafe776c",
+ "b5e3d22bd2dbdb624c8b86a1afb5ce6d",
+ "607ffc22098d81b7e37a7bf62f4af5d3",
+ "3823dbf043b4682f56d5ca698e755ea5",
+ // 64xN
+ "4acf556b921956c2bc24659cd5128401",
+ "a298c544c9c3b27924b4c23cc687ea5a",
+ "539e2df267782ce61c70103b23b7d922",
+ "3b0cb2a0b5d384efee4d81401025bec1",
+ // 128xN
+ "8b56b636dd712c2f8d138badb7219991",
+ "8cfc8836908902b8f915639b7bff45b3",
+ };
+ assert(block_size < kMaxBlockSizes);
+ return kDigests[block_size];
+}
+
+using DistanceWeightedBlendTest8bpp = DistanceWeightedBlendTest<8, uint8_t>;
+
+TEST_P(DistanceWeightedBlendTest8bpp, Blending) {
+ Test(GetDistanceWeightedBlendDigest8bpp(GetParam()), 1);
+}
+
+TEST_P(DistanceWeightedBlendTest8bpp, DISABLED_Speed) {
+ Test(GetDistanceWeightedBlendDigest8bpp(GetParam()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, DistanceWeightedBlendTest8bpp,
+ testing::ValuesIn(kTestParam));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest8bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, DistanceWeightedBlendTest8bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDistanceWeightedBlendDigest10bpp(const BlockSize block_size) {
+ static const char* const kDigests[] = {
+ // 4xN
+ "55f594b56e16d5c401274affebbcc3d3",
+ "69df14da4bb33a8f7d7087921008e919",
+ "1b61f33604c54015794198a13bfebf46",
+ // 8xN
+ "825a938185b152f7cf09bf1c0723ce2b",
+ "85ea315c51d979bc9b45834d6b40ec6f",
+ "92ebde208e8c39f7ec6de2de82182dbb",
+ "520f84716db5b43684dbb703806383fe",
+ // 16xN
+ "12ca23e3e2930005a0511646e8c83da4",
+ "6208694a6744f4a3906f58c1add670e3",
+ "a33d63889df989a3bbf84ff236614267",
+ "34830846ecb0572a98bbd192fed02b16",
+ "34bb2f79c0bd7f9a80691b8af597f2a8",
+ // 32xN
+ "fa97f2d0e3143f1f44d3ac018b0d696d",
+ "3df4a22456c9ab6ed346ab1b9750ae7d",
+ "6276a058b35c6131bc0c94a4b4a37ebc",
+ "9ca42da5d2d5eb339df03ae2c7a26914",
+ // 64xN
+ "800e692c520f99223bc24c1ac95a0166",
+ "818b6d20426585ef7fe844015a03aaf5",
+ "fb48691ccfff083e01d74826e88e613f",
+ "0bd350bc5bc604a224d77a5f5a422698",
+ // 128xN
+ "a130840813cd6bd69d09bcf5f8d0180f",
+ "6ece1846bea55e8f8f2ed7fbf73718de",
+ };
+ assert(block_size < kMaxBlockSizes);
+ return kDigests[block_size];
+}
+
+using DistanceWeightedBlendTest10bpp = DistanceWeightedBlendTest<10, uint16_t>;
+
+TEST_P(DistanceWeightedBlendTest10bpp, Blending) {
+ Test(GetDistanceWeightedBlendDigest10bpp(GetParam()), 1);
+}
+
+TEST_P(DistanceWeightedBlendTest10bpp, DISABLED_Speed) {
+ Test(GetDistanceWeightedBlendDigest10bpp(GetParam()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, DistanceWeightedBlendTest10bpp,
+ testing::ValuesIn(kTestParam));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, DistanceWeightedBlendTest10bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest10bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDistanceWeightedBlendDigest12bpp(const BlockSize block_size) {
+ static const char* const kDigests[] = {
+ // 4xN
+ "e30bf8f5f294206ad1dd79bd10a20827",
+ "f0cfb60134562d9c5f2ec6ad106e01ef",
+ "ad0876244e1b769203266a9c75b74afc",
+ // 8xN
+ "5265b954479c15a80f427561c5f36ff4",
+ "7f157457d1671e4ecce7a0884e9e3f76",
+ "d2cef5cf217f2d1f787c8951b7fe7cb2",
+ "6d23059008adbbb84ac941c8b4968f5b",
+ // 16xN
+ "ae521a5656ed3440d1fa950c20d90a79",
+ "935bec0e12b5dd3e0c34b3de8ba51476",
+ "0334bafcdcd7ddddb673ded492bca25a",
+ "c5360f08d0be77c79dc19fb55a0c5fe0",
+ "c2d1e7a4244a8aaaac041aed0cefc148",
+ // 32xN
+ "ce7f3cf78ae4f836cf69763137f7f6a6",
+ "800e52ebb14d5831c047d391cd760f95",
+ "74aa2b412b42165f1967daf3042b4f17",
+ "140d4cc600944b629b1991e88a9fe97c",
+ // 64xN
+ "3d206f93229ee2cea5c5da4e0ac6445a",
+ "3d13028f8fffe79fd35752c0177291ca",
+ "e7a7669acb5979dc7b15a19eed09cd4c",
+ "599368f4971c203fc5fa32989fe8cb44",
+ // 128xN
+ "54b46af2e2c8d2081e26fa0315b4ffd7",
+ "602e769bb2104e78223e68e50e7e86a0",
+ };
+ assert(block_size < kMaxBlockSizes);
+ return kDigests[block_size];
+}
+
+using DistanceWeightedBlendTest12bpp = DistanceWeightedBlendTest<12, uint16_t>;
+
+TEST_P(DistanceWeightedBlendTest12bpp, Blending) {
+ Test(GetDistanceWeightedBlendDigest12bpp(GetParam()), 1);
+}
+
+TEST_P(DistanceWeightedBlendTest12bpp, DISABLED_Speed) {
+ Test(GetDistanceWeightedBlendDigest12bpp(GetParam()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, DistanceWeightedBlendTest12bpp,
+ testing::ValuesIn(kTestParam));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const BlockSize param) {
+ return os << ToString(param);
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/dsp.h"
+
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/dsp/average_blend.h"
+#include "src/dsp/cdef.h"
+#include "src/dsp/convolve.h"
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/dsp/film_grain.h"
+#include "src/dsp/intra_edge.h"
+#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
+#include "src/dsp/intrapred_directional.h"
+#include "src/dsp/intrapred_filter.h"
+#include "src/dsp/intrapred_smooth.h"
+#include "src/dsp/inverse_transform.h"
+#include "src/dsp/loop_filter.h"
+#include "src/dsp/loop_restoration.h"
+#include "src/dsp/mask_blend.h"
+#include "src/dsp/motion_field_projection.h"
+#include "src/dsp/motion_vector_search.h"
+#include "src/dsp/obmc.h"
+#include "src/dsp/super_res.h"
+#include "src/dsp/warp.h"
+#include "src/dsp/weight_mask.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp_internal {
+
+void DspInit_C() {
+ dsp::AverageBlendInit_C();
+ dsp::CdefInit_C();
+ dsp::ConvolveInit_C();
+ dsp::DistanceWeightedBlendInit_C();
+ dsp::FilmGrainInit_C();
+ dsp::IntraEdgeInit_C();
+ dsp::IntraPredCflInit_C();
+ dsp::IntraPredDirectionalInit_C();
+ dsp::IntraPredFilterInit_C();
+ dsp::IntraPredInit_C();
+ dsp::IntraPredSmoothInit_C();
+ dsp::InverseTransformInit_C();
+ dsp::LoopFilterInit_C();
+ dsp::LoopRestorationInit_C();
+ dsp::MaskBlendInit_C();
+ dsp::MotionFieldProjectionInit_C();
+ dsp::MotionVectorSearchInit_C();
+ dsp::ObmcInit_C();
+ dsp::SuperResInit_C();
+ dsp::WarpInit_C();
+ dsp::WeightMaskInit_C();
+}
+
+dsp::Dsp* GetWritableDspTable(int bitdepth) {
+ switch (bitdepth) {
+ case 8: {
+ static dsp::Dsp dsp_8bpp;
+ return &dsp_8bpp;
+ }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ case 10: {
+ static dsp::Dsp dsp_10bpp;
+ return &dsp_10bpp;
+ }
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12: {
+ static dsp::Dsp dsp_12bpp;
+ return &dsp_12bpp;
+ }
+#endif
+ }
+ return nullptr;
+}
+
+} // namespace dsp_internal
+
+namespace dsp {
+
+void DspInit() {
+ static std::once_flag once;
+ std::call_once(once, []() {
+ dsp_internal::DspInit_C();
+#if LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
+ const uint32_t cpu_features = GetCpuInfo();
+#if LIBGAV1_ENABLE_SSE4_1
+ if ((cpu_features & kSSE4_1) != 0) {
+ AverageBlendInit_SSE4_1();
+ CdefInit_SSE4_1();
+ ConvolveInit_SSE4_1();
+ DistanceWeightedBlendInit_SSE4_1();
+ FilmGrainInit_SSE4_1();
+ IntraEdgeInit_SSE4_1();
+ IntraPredCflInit_SSE4_1();
+ IntraPredDirectionalInit_SSE4_1();
+ IntraPredFilterInit_SSE4_1();
+ IntraPredInit_SSE4_1();
+ IntraPredSmoothInit_SSE4_1();
+ InverseTransformInit_SSE4_1();
+ LoopFilterInit_SSE4_1();
+ LoopRestorationInit_SSE4_1();
+ MaskBlendInit_SSE4_1();
+ MotionFieldProjectionInit_SSE4_1();
+ MotionVectorSearchInit_SSE4_1();
+ ObmcInit_SSE4_1();
+ SuperResInit_SSE4_1();
+ WarpInit_SSE4_1();
+ WeightMaskInit_SSE4_1();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_SSE4_1();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ }
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_AVX2
+ if ((cpu_features & kAVX2) != 0) {
+ CdefInit_AVX2();
+ ConvolveInit_AVX2();
+ LoopRestorationInit_AVX2();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_AVX2();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ }
+#endif // LIBGAV1_ENABLE_AVX2
+#endif // LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
+#if LIBGAV1_ENABLE_NEON
+ AverageBlendInit_NEON();
+ CdefInit_NEON();
+ ConvolveInit_NEON();
+ DistanceWeightedBlendInit_NEON();
+ FilmGrainInit_NEON();
+ IntraEdgeInit_NEON();
+ IntraPredCflInit_NEON();
+ IntraPredDirectionalInit_NEON();
+ IntraPredFilterInit_NEON();
+ IntraPredInit_NEON();
+ IntraPredSmoothInit_NEON();
+ InverseTransformInit_NEON();
+ LoopFilterInit_NEON();
+ LoopRestorationInit_NEON();
+ MaskBlendInit_NEON();
+ MotionFieldProjectionInit_NEON();
+ MotionVectorSearchInit_NEON();
+ ObmcInit_NEON();
+ SuperResInit_NEON();
+ WarpInit_NEON();
+ WeightMaskInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ ConvolveInit10bpp_NEON();
+ InverseTransformInit10bpp_NEON();
+ LoopFilterInit10bpp_NEON();
+ LoopRestorationInit10bpp_NEON();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#endif // LIBGAV1_ENABLE_NEON
+ });
+}
+
+const Dsp* GetDspTable(int bitdepth) {
+ return dsp_internal::GetWritableDspTable(bitdepth);
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_DSP_H_
+#define LIBGAV1_SRC_DSP_DSP_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/cpu.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+
+#if !defined(LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS)
+#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 0
+#endif
+
+enum IntraPredictor : uint8_t {
+ kIntraPredictorDcFill,
+ kIntraPredictorDcTop,
+ kIntraPredictorDcLeft,
+ kIntraPredictorDc,
+ kIntraPredictorVertical,
+ kIntraPredictorHorizontal,
+ kIntraPredictorPaeth,
+ kIntraPredictorSmooth,
+ kIntraPredictorSmoothVertical,
+ kIntraPredictorSmoothHorizontal,
+ kNumIntraPredictors
+};
+
+// List of valid 1D transforms.
+enum Transform1d : uint8_t {
+ kTransform1dDct, // Discrete Cosine Transform.
+ kTransform1dAdst, // Asymmetric Discrete Sine Transform.
+ kTransform1dIdentity,
+ kTransform1dWht, // Walsh Hadamard Transform.
+ kNumTransform1ds
+};
+
+// List of valid 1D transform sizes. Not all transforms may be available for all
+// the sizes.
+enum Transform1dSize : uint8_t {
+ kTransform1dSize4,
+ kTransform1dSize8,
+ kTransform1dSize16,
+ kTransform1dSize32,
+ kTransform1dSize64,
+ kNumTransform1dSizes
+};
+
+// The maximum width of the loop filter; fewer pixels may be filtered
+// depending on strength thresholds.
+enum LoopFilterSize : uint8_t {
+ kLoopFilterSize4,
+ kLoopFilterSize6,
+ kLoopFilterSize8,
+ kLoopFilterSize14,
+ kNumLoopFilterSizes
+};
+
+enum : uint8_t {
+ kRow = 0,
+ kColumn = 1,
+};
+
+//------------------------------------------------------------------------------
+// ToString()
+//
+// These functions are meant to be used only in debug logging and within tests.
+// They are defined inline to avoid including the strings in the release
+// library when logging is disabled; unreferenced functions will not be added to
+// any object file in that case.
+
+inline const char* ToString(const IntraPredictor predictor) {
+ switch (predictor) {
+ case kIntraPredictorDcFill:
+ return "kIntraPredictorDcFill";
+ case kIntraPredictorDcTop:
+ return "kIntraPredictorDcTop";
+ case kIntraPredictorDcLeft:
+ return "kIntraPredictorDcLeft";
+ case kIntraPredictorDc:
+ return "kIntraPredictorDc";
+ case kIntraPredictorVertical:
+ return "kIntraPredictorVertical";
+ case kIntraPredictorHorizontal:
+ return "kIntraPredictorHorizontal";
+ case kIntraPredictorPaeth:
+ return "kIntraPredictorPaeth";
+ case kIntraPredictorSmooth:
+ return "kIntraPredictorSmooth";
+ case kIntraPredictorSmoothVertical:
+ return "kIntraPredictorSmoothVertical";
+ case kIntraPredictorSmoothHorizontal:
+ return "kIntraPredictorSmoothHorizontal";
+ case kNumIntraPredictors:
+ return "kNumIntraPredictors";
+ }
+ abort();
+}
+
+inline const char* ToString(const Transform1d transform) {
+ switch (transform) {
+ case kTransform1dDct:
+ return "kTransform1dDct";
+ case kTransform1dAdst:
+ return "kTransform1dAdst";
+ case kTransform1dIdentity:
+ return "kTransform1dIdentity";
+ case kTransform1dWht:
+ return "kTransform1dWht";
+ case kNumTransform1ds:
+ return "kNumTransform1ds";
+ }
+ abort();
+}
+
+inline const char* ToString(const Transform1dSize transform_size) {
+ switch (transform_size) {
+ case kTransform1dSize4:
+ return "kTransform1dSize4";
+ case kTransform1dSize8:
+ return "kTransform1dSize8";
+ case kTransform1dSize16:
+ return "kTransform1dSize16";
+ case kTransform1dSize32:
+ return "kTransform1dSize32";
+ case kTransform1dSize64:
+ return "kTransform1dSize64";
+ case kNumTransform1dSizes:
+ return "kNumTransform1dSizes";
+ }
+ abort();
+}
+
+inline const char* ToString(const LoopFilterSize filter_size) {
+ switch (filter_size) {
+ case kLoopFilterSize4:
+ return "kLoopFilterSize4";
+ case kLoopFilterSize6:
+ return "kLoopFilterSize6";
+ case kLoopFilterSize8:
+ return "kLoopFilterSize8";
+ case kLoopFilterSize14:
+ return "kLoopFilterSize14";
+ case kNumLoopFilterSizes:
+ return "kNumLoopFilterSizes";
+ }
+ abort();
+}
+
+inline const char* ToString(const LoopFilterType filter_type) {
+ switch (filter_type) {
+ case kLoopFilterTypeVertical:
+ return "kLoopFilterTypeVertical";
+ case kLoopFilterTypeHorizontal:
+ return "kLoopFilterTypeHorizontal";
+ case kNumLoopFilterTypes:
+ return "kNumLoopFilterTypes";
+ }
+ abort();
+}
+
+//------------------------------------------------------------------------------
+// Intra predictors. Section 7.11.2.
+// These require access to one or both of the top row and left column. Some may
+// access the top-left (top[-1]), top-right (top[width+N]), bottom-left
+// (left[height+N]) or upper-left (left[-1]).
+
+// Intra predictor function signature. Sections 7.11.2.2, 7.11.2.4 (#10,#11),
+// 7.11.2.5, 7.11.2.6.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left
+// of |dst|. top-left and bottom-left may be accessed.
+// The pointer arguments do not alias one another.
+using IntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
+ const void* top, const void* left);
+using IntraPredictorFuncs =
+ IntraPredictorFunc[kNumTransformSizes][kNumIntraPredictors];
+
+// Directional intra predictor function signature, zone 1 (0 < angle < 90).
+// Section 7.11.2.4 (#7).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |width| and |height| give the dimensions of the block.
+// |xstep| is the scaled starting index to |top| from
+// kDirectionalIntraPredictorDerivative. |upsampled_top| indicates whether
+// |top| has been upsampled as described in '7.11.2.11. Intra edge upsample
+// process'. This can occur in cases with |width| + |height| <= 16. top-right
+// is accessed.
+// The pointer arguments do not alias one another.
+using DirectionalIntraPredictorZone1Func = void (*)(void* dst, ptrdiff_t stride,
+ const void* top, int width,
+ int height, int xstep,
+ bool upsampled_top);
+
+// Directional intra predictor function signature, zone 2 (90 < angle < 180).
+// Section 7.11.2.4 (#8).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left of
+// |dst|. |width| and |height| give the dimensions of the block. |xstep| and
+// |ystep| are the scaled starting index to |top| and |left|, respectively,
+// from kDirectionalIntraPredictorDerivative. |upsampled_top| and
+// |upsampled_left| indicate whether |top| and |left| have been upsampled as
+// described in '7.11.2.11. Intra edge upsample process'. This can occur in
+// cases with |width| + |height| <= 16. top-left and upper-left are accessed,
+// up to [-2] in each if |upsampled_top/left| are set.
+// The pointer arguments do not alias one another.
+using DirectionalIntraPredictorZone2Func = void (*)(
+ void* dst, ptrdiff_t stride, const void* top, const void* left, int width,
+ int height, int xstep, int ystep, bool upsampled_top, bool upsampled_left);
+
+// Directional intra predictor function signature, zone 3 (180 < angle < 270).
+// Section 7.11.2.4 (#9).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |left| is an aligned vector of the
+// column to the left of |dst|. |width| and |height| give the dimensions of the
+// block. |ystep| is the scaled starting index to |left| from
+// kDirectionalIntraPredictorDerivative. |upsampled_left| indicates whether
+// |left| has been upsampled as described in '7.11.2.11. Intra edge upsample
+// process'. This can occur in cases with |width| + |height| <= 16. bottom-left
+// is accessed.
+// The pointer arguments do not alias one another.
+using DirectionalIntraPredictorZone3Func = void (*)(void* dst, ptrdiff_t stride,
+ const void* left, int width,
+ int height, int ystep,
+ bool upsampled_left);
+
+// Filter intra predictor function signature. Section 7.11.2.3.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left
+// of |dst|. |width| and |height| are the size of the block in pixels.
+// The pointer arguments do not alias one another.
+using FilterIntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
+ const void* top, const void* left,
+ FilterIntraPredictor pred, int width,
+ int height);
+
+//------------------------------------------------------------------------------
+// Chroma from Luma (Cfl) prediction. Section 7.11.5.
+
+// Chroma from Luma (Cfl) intra prediction function signature. |dst| is an
+// unaligned pointer to the output block. Pixel size is determined by bitdepth
+// with |stride| given in bytes. |luma| contains subsampled luma pixels with 3
+// fractional bits of precision. |alpha| is the signed Cfl alpha value for the
+// appropriate plane.
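+// With 3 fractional bits the |luma| buffer works at 8x pixel scale; a
+// difference of 8 corresponds to a difference of one pixel.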
+using CflIntraPredictorFunc = void (*)(
+ void* dst, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], int alpha);
+using CflIntraPredictorFuncs = CflIntraPredictorFunc[kNumTransformSizes];
+
+// Chroma from Luma (Cfl) subsampler function signature. |luma| is an unaligned
+// pointer to the output block. |src| is an unaligned pointer to the input
+// block. Pixel size is determined by bitdepth with |stride| given in bytes.
+using CflSubsamplerFunc =
+ void (*)(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ int max_luma_width, int max_luma_height, const void* source,
+ ptrdiff_t stride);
+using CflSubsamplerFuncs =
+ CflSubsamplerFunc[kNumTransformSizes][kNumSubsamplingTypes];
+
+//------------------------------------------------------------------------------
+// Intra Edge Filtering and Upsampling. Step 4 in section 7.11.2.4.
+
+// Intra edge filter function signature. |buffer| is a pointer to the top_row or
+// left_column that needs to be filtered. Typically the -1'th index of |top_row|
+// and |left_column| need to be filtered as well, so the caller can merely pass
+// the |buffer| as top_row[-1] or left_column[-1]. Pixel size is determined by
+// bitdepth. |size| is the number of pixels to be filtered. |strength| is the
+// filter strength. Section 7.11.2.12 in the spec.
+using IntraEdgeFilterFunc = void (*)(void* buffer, int size, int strength);
+
+// Intra edge upsampler function signature. |buffer| is a pointer to the top_row
+// or left_column that needs to be upsampled. Pixel size is determined by
+// bitdepth. |size| is the number of pixels to be upsampled; valid values are:
+// 4, 8, 12, 16. This function needs access to negative indices -1 and -2 of
+// the |buffer|. Section 7.11.2.11 in the spec.
+using IntraEdgeUpsamplerFunc = void (*)(void* buffer, int size);
+
+//------------------------------------------------------------------------------
+// Inverse transform add function signature.
+//
+// Steps 2 and 3 of section 7.12.3 (contains the implementation of section
+// 7.13.3).
+// Apply the inverse transforms and add the residual to the destination frame
+// for the transform type and block size |tx_size| starting at position
+// |start_x| and |start_y|. |dst_frame| is a pointer to an Array2D of Pixel
+// values. |adjusted_tx_height| is the number of rows to process based on the
+// non-zero coefficient count in the block. It will be 1 (non-zero coefficient
+// count == 1), 4 or a multiple of 8 up to 32 or the original transform height,
+// whichever is less. |src_buffer| is a pointer to an Array2D of Residual
+// values. On input |src_buffer| contains the dequantized values, on output it
+// contains the residual.
+// The pointer arguments do not alias one another.
+using InverseTransformAddFunc = void (*)(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame);
+// The final dimension holds row and column transforms indexed with kRow and
+// kColumn.
+using InverseTransformAddFuncs =
+ InverseTransformAddFunc[kNumTransform1ds][kNumTransform1dSizes][2];
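+// For example, [kTransform1dDct][kTransform1dSize8][kRow] holds the function
+// applying the 8-point DCT across rows.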
+
+//------------------------------------------------------------------------------
+// Post processing.
+
+// Loop filter function signature. Section 7.14.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes.
+//      <threshold param>  <spec name>  <range>
+//      |outer_thresh|     blimit       [7, 193]
+//      |inner_thresh|     limit        [1, 63]
+//      |hev_thresh|       thresh       [0, 63]
+// These are scaled by the implementation by 'bitdepth - 8' to produce
+// the spec variables blimitBd, limitBd and threshBd.
+// Note these functions are not called when the loop filter level is 0.
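+// E.g. at 10bpp the scaling is a left shift by 10 - 8 = 2, so an
+// |outer_thresh| of 64 becomes blimitBd = 256.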
+using LoopFilterFunc = void (*)(void* dst, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+using LoopFilterFuncs =
+ LoopFilterFunc[kNumLoopFilterSizes][kNumLoopFilterTypes];
+
+// Cdef direction function signature. Section 7.15.2.
+// |src| is a pointer to the source block. Pixel size is determined by bitdepth
+// with |stride| given in bytes. |direction| and |variance| are output
+// parameters and must not be nullptr.
+// The pointer arguments do not alias one another.
+using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride,
+ uint8_t* direction, int* variance);
+
+// Cdef filtering function signature. Section 7.15.3.
+// |source| is a pointer to the input block padded with kCdefLargeValue if at a
+// frame border. |source_stride| is given in units of uint16_t.
+// |block_height| is the height of the input block; the block width is
+// selected by the CdefFilteringFuncs index (see below).
+// |primary_strength|, |secondary_strength|, and |damping| are Cdef filtering
+// parameters.
+// |direction| is the filtering direction.
+// |dest| is the output buffer. |dest_stride| is given in bytes.
+// The pointer arguments do not alias one another.
+using CdefFilteringFunc = void (*)(const uint16_t* source,
+ ptrdiff_t source_stride, int block_height,
+ int primary_strength, int secondary_strength,
+ int damping, int direction, void* dest,
+ ptrdiff_t dest_stride);
+
+// The first index is block width: [0]: 4, [1]: 8. The second is based on
+// non-zero strengths: [0]: |primary_strength| and |secondary_strength|, [1]:
+// |primary_strength| only, [2]: |secondary_strength| only.
+using CdefFilteringFuncs = CdefFilteringFunc[2][3];
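+// For example, index [1][2] selects the filter for 8-wide blocks in which
+// only |secondary_strength| is non-zero.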
+
+// Upscaling coefficients function signature. Section 7.16.
+// This is an auxiliary function for SIMD optimizations and has no corresponding
+// C function. Different SIMD versions may have different outputs. So it must
+// pair with the corresponding version of SuperResFunc.
+// |upscaled_width| is the width of the output frame.
+// |step| is the number of subpixels to move the kernel for the next destination
+// pixel.
+// |initial_subpixel_x| is a base offset from which |step| increments.
+// |coefficients| is the upscale filter used by each pixel in a row.
+using SuperResCoefficientsFunc = void (*)(int upscaled_width,
+ int initial_subpixel_x, int step,
+ void* coefficients);
+
+// Upscaling process function signature. Section 7.16.
+// |coefficients| is the upscale filter used by each pixel in a row. It is not
+// used by the C function.
+// |source| is the input frame buffer. It will be line extended.
+// |source_stride| is given in pixels.
+// |dest| is the output buffer.
+// |dest_stride| is given in pixels.
+// |height| is the height of the block to be processed.
+// |downscaled_width| is the width of the input frame.
+// |upscaled_width| is the width of the output frame.
+// |step| is the number of subpixels to move the kernel for the next destination
+// pixel.
+// |initial_subpixel_x| is a base offset from which |step| increments.
+// The pointer arguments do not alias one another.
+using SuperResFunc = void (*)(const void* coefficients, void* source,
+ ptrdiff_t source_stride, int height,
+ int downscaled_width, int upscaled_width,
+ int initial_subpixel_x, int step, void* dest,
+ ptrdiff_t dest_stride);
+
+// Loop restoration function signature. Sections 7.16, 7.17.
+// |restoration_info| contains loop restoration information, such as filter
+// type, strength.
+// |source| is the input frame buffer, which is deblocked and cdef filtered.
+// |top_border| and |bottom_border| are the top and bottom borders.
+// |dest| is the output.
+// |stride| is given in pixels, and shared by |source| and |dest|.
+// |top_border_stride| and |bottom_border_stride| are given in pixels.
+// |restoration_buffer| contains buffers required for self guided filter and
+// wiener filter. They must be initialized before calling.
+// The pointer arguments do not alias one another.
+using LoopRestorationFunc = void (*)(
+ const RestorationUnitInfo& restoration_info, const void* source,
+ ptrdiff_t stride, const void* top_border, ptrdiff_t top_border_stride,
+ const void* bottom_border, ptrdiff_t bottom_border_stride, int width,
+ int height, RestorationBuffer* restoration_buffer, void* dest);
+
+// Index 0 is Wiener Filter.
+// Index 1 is Self Guided Restoration Filter.
+// This can be accessed as LoopRestorationType - 2.
+using LoopRestorationFuncs = LoopRestorationFunc[2];
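+// E.g. a LoopRestorationType value of 2 (Wiener) maps to index 0 and a value
+// of 3 (self guided) maps to index 1.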
+
+// Convolve function signature. Section 7.11.3.4.
+// This function applies a horizontal filter followed by a vertical filter.
+// |reference| is the input block (reference frame buffer). |reference_stride|
+// is the corresponding frame stride.
+// |vertical_filter_index|/|horizontal_filter_index| is the index to
+// retrieve the type of filter to be applied for vertical/horizontal direction
+// from the filter lookup table 'kSubPixelFilters'.
+// |horizontal_filter_id| and |vertical_filter_id| are the filter ids.
+// |width| and |height| are width and height of the block to be filtered.
+// |prediction| is the output block (output frame buffer).
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
+// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
+// be used.
+// The pointer arguments do not alias one another.
+using ConvolveFunc = void (*)(const void* reference, ptrdiff_t reference_stride,
+ int horizontal_filter_index,
+ int vertical_filter_index,
+ int horizontal_filter_id, int vertical_filter_id,
+ int width, int height, void* prediction,
+ ptrdiff_t pred_stride);
+
+// Convolve functions signature. Each points to one convolve function with
+// a specific setting:
+// ConvolveFunc[is_intra_block_copy][is_compound][has_vertical_filter]
+// [has_horizontal_filter].
+// If is_compound is false, the prediction is clipped to Pixel.
+// If is_compound is true, the range of prediction is:
+// 8bpp: [-5132, 9212] (int16_t)
+// 10bpp: [ 3988, 61532] (uint16_t)
+// 12bpp: [ 3974, 61559] (uint16_t)
+// See src/dsp/convolve.cc
+using ConvolveFuncs = ConvolveFunc[2][2][2][2];
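+// For example, index [0][1][1][1] is the compound predictor with both filters
+// applied, and [0][0][0][0] is the plain copy variant used when both filter
+// ids are 0.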
+
+// Convolve + scale function signature. Section 7.11.3.4.
+// This function applies a horizontal filter followed by a vertical filter.
+// |reference| is the input block (reference frame buffer). |reference_stride|
+// is the corresponding frame stride.
+// |vertical_filter_index|/|horizontal_filter_index| is the index to
+// retrieve the type of filter to be applied for vertical/horizontal direction
+// from the filter lookup table 'kSubPixelFilters'.
+// |subpixel_x| and |subpixel_y| are starting positions in units of 1/1024.
+// |step_x| and |step_y| are step sizes in units of 1/1024 of a pixel.
+// |width| and |height| are width and height of the block to be filtered.
+// |prediction| is the output block (output frame buffer).
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
+// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
+// be used.
+// The pointer arguments do not alias one another.
+using ConvolveScaleFunc = void (*)(const void* reference,
+ ptrdiff_t reference_stride,
+ int horizontal_filter_index,
+ int vertical_filter_index, int subpixel_x,
+ int subpixel_y, int step_x, int step_y,
+ int width, int height, void* prediction,
+ ptrdiff_t pred_stride);
+
+// Convolve functions signature for scaling version.
+// 0: single predictor. 1: compound predictor.
+using ConvolveScaleFuncs = ConvolveScaleFunc[2];
+
+// Weight mask function signature. Section 7.11.3.12.
+// |prediction_0| is the first input block.
+// |prediction_1| is the second input block. Both blocks are int16_t* when
+// bitdepth == 8 and uint16_t* otherwise.
+// |width| and |height| are the prediction width and height.
+// The stride for the input buffers is equal to |width|.
+// The valid range of block size is [8x8, 128x128] for the luma plane.
+// |mask| is the output buffer. |mask_stride| is the output buffer stride.
+// The pointer arguments do not alias one another.
+using WeightMaskFunc = void (*)(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride);
+
+// Weight mask functions signature. The dimensions (in order) are:
+// * Width index (4 => 0, 8 => 1, 16 => 2 and so on).
+// * Height index (4 => 0, 8 => 1, 16 => 2 and so on).
+// * mask_is_inverse.
+using WeightMaskFuncs = WeightMaskFunc[6][6][2];
+
+// Average blending function signature.
+// Two predictors are averaged to generate the output.
+// Input predictor values are int16_t when bitdepth == 8 and uint16_t
+// otherwise. The output is written as Pixel values.
+// Average blending is in the bottom of Section 7.11.3.1 (COMPOUND_AVERAGE).
+// |prediction_0| is the first input block.
+// |prediction_1| is the second input block. Both blocks are int16_t* when
+// bitdepth == 8 and uint16_t* otherwise.
+// |width| and |height| are the same for the first and second input blocks.
+// The stride for the input buffers is equal to |width|.
+// The valid range of block size is [8x8, 128x128] for the luma plane.
+// |dest| is the output buffer. |dest_stride| is the output buffer stride.
+// The pointer arguments do not alias one another.
+using AverageBlendFunc = void (*)(const void* prediction_0,
+ const void* prediction_1, int width,
+ int height, void* dest,
+ ptrdiff_t dest_stride);
+
+// Distance weighted blending function signature.
+// Weights are generated in Section 7.11.3.15.
+// Weighted blending is in the bottom of Section 7.11.3.1 (COMPOUND_DISTANCE).
+// This function takes two blocks (inter frame prediction) and produces a
+// weighted output.
+// |prediction_0| is the first input block.
+// |prediction_1| is the second input block. Both blocks are int16_t* when
+// bitdepth == 8 and uint16_t* otherwise.
+// |weight_0| is the weight for the first block. It is derived from the relative
+// distance of the first reference frame and the current frame.
+// |weight_1| is the weight for the second block. It is derived from the
+// relative distance of the second reference frame and the current frame.
+// |width| and |height| are the same for the first and second input blocks.
+// The stride for the input buffers is equal to |width|.
+// The valid range of block size is [8x8, 128x128] for the luma plane.
+// |dest| is the output buffer. |dest_stride| is the output buffer stride.
+// The pointer arguments do not alias one another.
+using DistanceWeightedBlendFunc = void (*)(const void* prediction_0,
+ const void* prediction_1,
+ uint8_t weight_0, uint8_t weight_1,
+ int width, int height, void* dest,
+ ptrdiff_t dest_stride);
+
+// Mask blending function signature. Section 7.11.3.14.
+// This function takes two blocks and produces a blended output stored into the
+// output block |dest|. The blending is a weighted average process, controlled
+// by values of the mask.
+// |prediction_0| is the first input block. When prediction mode is inter_intra
+// (or wedge_inter_intra), this refers to the inter frame prediction. It is
+// int16_t* when bitdepth == 8 and uint16_t* otherwise.
+// The stride for |prediction_0| is equal to |width|.
+// |prediction_1| is the second input block. When prediction mode is inter_intra
+// (or wedge_inter_intra), this refers to the intra frame prediction and uses
+// Pixel values. It is only used for intra frame prediction when bitdepth >= 10.
+// It is int16_t* when bitdepth == 8 and uint16_t* otherwise.
+// |prediction_stride_1| is the stride, given in units of [u]int16_t. When
+// |is_inter_intra| is false (compound prediction) then |prediction_stride_1| is
+// equal to |width|.
+// |mask| is an integer array, whose value indicates the weight of the blending.
+// |mask_stride| is the corresponding stride.
+// |width|, |height| are the same for both input blocks.
+// If it's inter_intra (or wedge_inter_intra), the valid range of block size is
+// [8x8, 32x32], no 4:1/1:4 blocks (Section 5.11.28). Otherwise (including
+// difference weighted prediction and compound average prediction), the valid
+// range is [8x8, 128x128].
+// If there's subsampling, the corresponding width and height are halved for
+// chroma planes.
+// |is_inter_intra| stands for the prediction mode. If it is true, one of the
+// prediction blocks is from intra prediction of current frame. Otherwise, two
+// prediction blocks are both inter frame predictions.
+// |is_wedge_inter_intra| indicates if the mask is for the wedge prediction.
+// |dest| is the output block.
+// |dest_stride| is the corresponding stride for dest.
+// The pointer arguments do not alias one another.
+using MaskBlendFunc = void (*)(const void* prediction_0,
+ const void* prediction_1,
+ ptrdiff_t prediction_stride_1,
+ const uint8_t* mask, ptrdiff_t mask_stride,
+ int width, int height, void* dest,
+ ptrdiff_t dest_stride);
+
+// Mask blending functions signature. Each points to one function with
+// a specific setting:
+// MaskBlendFunc[subsampling_x + subsampling_y][is_inter_intra].
+using MaskBlendFuncs = MaskBlendFunc[3][2];
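+
+// For example, a 4:2:0 chroma block (subsampling_x == 1, subsampling_y == 1)
+// blended in compound mode would use the entry (a sketch; |dsp| is assumed to
+// come from GetDspTable()):
+//   dsp->mask_blend[/*subsampling_x + subsampling_y=*/2][/*is_inter_intra=*/0]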
+
+// This function is similar to the MaskBlendFunc. It is only used when
+// |is_inter_intra| is true and |bitdepth| == 8.
+// |prediction_[01]| are Pixel values (uint8_t).
+// |prediction_1| is also the output buffer.
+// The pointer arguments do not alias one another.
+using InterIntraMaskBlendFunc8bpp = void (*)(const uint8_t* prediction_0,
+ uint8_t* prediction_1,
+ ptrdiff_t prediction_stride_1,
+ const uint8_t* mask,
+ ptrdiff_t mask_stride, int width,
+ int height);
+
+// InterIntra8bpp mask blending functions signature. When is_wedge_inter_intra
+// is false, the function at index 0 must be used. Otherwise, the function at
+// index subsampling_x + subsampling_y must be used.
+using InterIntraMaskBlendFuncs8bpp = InterIntraMaskBlendFunc8bpp[3];
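+
+// Equivalently, the selection can be written as (a sketch; |dsp| is assumed
+// to come from GetDspTable()):
+//   dsp->inter_intra_mask_blend_8bpp[
+//       is_wedge_inter_intra ? subsampling_x + subsampling_y : 0]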
+
+// Obmc (overlapped block motion compensation) blending function signature.
+// Section 7.11.3.10.
+// This function takes two blocks and produces a blended output stored into the
+// first input block. The blending is a weighted average process, controlled by
+// values of the mask.
+// Obmc is not a compound mode. It differs from other compound blending in
+// terms of precision. The current block is computed using convolution with
+// clipping to the range of pixel values. Its above and left blocks are also
+// clipped. Therefore the obmc blending process doesn't need to clip the
+// output.
+// |prediction| is the first input block, which will be overwritten.
+// |prediction_stride| is the stride, given in bytes.
+// |width|, |height| are the same for both input blocks. The range is [4x2,
+// 32x32] for kObmcDirectionVertical and [2x4, 32x32] for
+// kObmcDirectionHorizontal, see Section 7.11.3.9.
+// |obmc_prediction| is the second input block.
+// |obmc_prediction_stride| is its stride, given in bytes.
+// The pointer arguments do not alias one another.
+using ObmcBlendFunc = void (*)(void* prediction, ptrdiff_t prediction_stride,
+ int width, int height,
+ const void* obmc_prediction,
+ ptrdiff_t obmc_prediction_stride);
+using ObmcBlendFuncs = ObmcBlendFunc[kNumObmcDirections];
+
+// Warp function signature. Section 7.11.3.5.
+// This function applies warp filtering for each 8x8 block inside the current
+// coding block. The filtering process is similar to 2d convolve filtering.
+// The horizontal filter is applied followed by the vertical filter.
+// The function has to calculate corresponding pixel positions before and
+// after warping.
+// |source| is the input reference frame buffer.
+// |source_stride|, |source_width|, |source_height| are corresponding frame
+// stride, width, and height. |source_stride| is given in bytes.
+// |warp_params| is the matrix of warp motion: warp_params[i] = mN.
+//           [x'    (m2 m3 m0   [x
+//       z .  y'  =  m4 m5 m1 *  y
+//            1]     m6 m7 1)    1]
+// |subsampling_x/y| is the current frame's plane subsampling factor.
+// |block_start_x| and |block_start_y| are the starting position of the
+// current coding block.
+// |block_width| and |block_height| are width and height of the current coding
+// block. |block_width| and |block_height| are at least 8.
+// |alpha|, |beta|, |gamma|, |delta| are valid warp parameters. See the
+// comments in the definition of struct GlobalMotion for the range of their
+// values.
+// |dest| is the output buffer of type Pixel. The output values are clipped to
+// Pixel values.
+// |dest_stride| is the stride, in units of bytes.
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For vertical filtering kInterRoundBitsVertical &
+// kInterRoundBitsVertical12bpp will be used.
+//
+// NOTE: WarpFunc assumes the source frame has left, right, top, and bottom
+// borders that extend the frame boundary pixels.
+// * The left and right borders must be at least 13 pixels wide. In addition,
+// Warp_NEON() may read up to 14 bytes after a row in the |source| buffer.
+// Therefore, there must be at least one extra padding byte after the right
+// border of the last row in the source buffer.
+// * The top and bottom borders must be at least 13 pixels high.
+// The pointer arguments do not alias one another.
+using WarpFunc = void (*)(const void* source, ptrdiff_t source_stride,
+ int source_width, int source_height,
+ const int* warp_params, int subsampling_x,
+ int subsampling_y, int block_start_x,
+ int block_start_y, int block_width, int block_height,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta, void* dest, ptrdiff_t dest_stride);
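+
+// For reference, expanding the projection above gives the reference position
+// (x', y') for a pixel at (x, y):
+//   z  = warp_params[6] * x + warp_params[7] * y + 1
+//   x' = (warp_params[2] * x + warp_params[3] * y + warp_params[0]) / z
+//   y' = (warp_params[4] * x + warp_params[5] * y + warp_params[1]) / z
+// (a restatement of the matrix form; fixed-point scaling details are left to
+// the implementation).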
+
+// Warp for compound predictions. Section 7.11.3.5.
+// Similar to WarpFunc, but |dest| is a uint16_t predictor buffer,
+// |dest_stride| is given in units of uint16_t and |inter_round_bits_vertical|
+// is always 7 (kInterRoundBitsCompoundVertical).
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For vertical filtering kInterRoundBitsCompoundVertical will be used.
+using WarpCompoundFunc = WarpFunc;
+
+constexpr int kNumAutoRegressionLags = 4;
+// Applies an auto-regressive filter to the white noise in |luma_grain_buffer|.
+// Section 7.18.3.3, second code block
+// |params| are parameters read from frame header, mainly providing
+// auto_regression_coeff_y for the filter and auto_regression_shift to right
+// shift the filter sum by. Note: This method assumes
+// params.auto_regression_coeff_lag is not 0. Do not call this method if
+// params.auto_regression_coeff_lag is 0.
+using LumaAutoRegressionFunc = void (*)(const FilmGrainParams& params,
+ void* luma_grain_buffer);
+// Function index is auto_regression_coeff_lag - 1.
+using LumaAutoRegressionFuncs =
+ LumaAutoRegressionFunc[kNumAutoRegressionLags - 1];
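+
+// For example, with params.auto_regression_coeff_lag == 2 a caller would
+// invoke (a sketch; |dsp| is assumed to come from GetDspTable()):
+//   dsp->film_grain.luma_auto_regression[2 - 1](params, luma_grain_buffer);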
+
+// Applies an auto-regressive filter to the white noise in u_grain and v_grain.
+// Section 7.18.3.3, third code block
+// The |luma_grain_buffer| provides samples that are added to the autoregressive
+// sum when num_y_points > 0.
+// |u_grain_buffer| and |v_grain_buffer| point to the buffers of chroma noise
+// that were generated from the stored Gaussian sequence, and are overwritten
+// with the results of the autoregressive filter. |params| are parameters read
+// from frame header, mainly providing auto_regression_coeff_u and
+// auto_regression_coeff_v for each chroma plane's filter, and
+// auto_regression_shift to right shift the filter sums by.
+// The pointer arguments do not alias one another.
+using ChromaAutoRegressionFunc = void (*)(const FilmGrainParams& params,
+ const void* luma_grain_buffer,
+ int subsampling_x, int subsampling_y,
+ void* u_grain_buffer,
+ void* v_grain_buffer);
+using ChromaAutoRegressionFuncs =
+ ChromaAutoRegressionFunc[/*use_luma*/ 2][kNumAutoRegressionLags];
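+
+// The first index is use_luma (whether luma contributes to the chroma filter
+// sums, i.e. num_y_points > 0); the second is auto_regression_coeff_lag. The
+// [/*use_luma=*/0][/*lag=*/0] combination is an identity filter with no luma
+// contribution, so implementations leave that entry as nullptr.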
+
+// Build an image-wide "stripe" of grain noise for every 32 rows in the image.
+// Section 7.18.3.5, first code block.
+// Each 32x32 luma block is copied at a random offset specified via
+// |grain_seed| from the grain template produced by autoregression, and the same
+// is done for chroma grains, subject to subsampling.
+// |width| and |height| are the dimensions of the overall image.
+// |noise_stripes_buffer| points to an Array2DView with one row for each stripe.
+// Because this function treats all planes identically and independently, it is
+// simplified to take one grain buffer at a time. This means duplicating some
+// random number generations, but that work can be reduced in other ways.
+// The pointer arguments do not alias one another.
+using ConstructNoiseStripesFunc = void (*)(const void* grain_buffer,
+ int grain_seed, int width,
+ int height, int subsampling_x,
+ int subsampling_y,
+ void* noise_stripes_buffer);
+using ConstructNoiseStripesFuncs =
+ ConstructNoiseStripesFunc[/*overlap_flag*/ 2];
+
+// Compute the one or two overlap rows for each stripe copied to the noise
+// image.
+// Section 7.18.3.5, second code block. |width| and |height| are the
+// dimensions of the overall image. |noise_stripes_buffer| points to an
+// Array2DView with one row for each stripe. |noise_image_buffer| points to an
+// Array2D containing the allocated plane for this frame. Because this function
+// treats all planes identically and independently, it is simplified to take one
+// grain buffer at a time.
+// The pointer arguments do not alias one another.
+using ConstructNoiseImageOverlapFunc =
+ void (*)(const void* noise_stripes_buffer, int width, int height,
+ int subsampling_x, int subsampling_y, void* noise_image_buffer);
+
+// Populate a scaling lookup table with interpolated values of a piecewise
+// linear function where values in |point_value| are mapped to the values in
+// |point_scaling|.
+// |num_points| can be between 0 and 15. When 0, the lookup table is set to
+// zero.
+// |point_value| and |point_scaling| have |num_points| valid elements.
+// The pointer arguments do not alias one another.
+using InitializeScalingLutFunc = void (*)(int num_points,
+ const uint8_t point_value[],
+ const uint8_t point_scaling[],
+ int16_t* scaling_lut,
+ const int scaling_lut_length);
+
+// Blend noise with image. Section 7.18.3.5, third code block.
+// |width| is the width of each row, while |height| is how many rows to compute.
+// |start_height| is an offset for the noise image, to support multithreading.
+// |min_value|, |max_luma|, and |max_chroma| are computed by the caller of these
+// functions, according to the code in the spec.
+// |source_plane_y| and |source_plane_uv| are the plane buffers of the decoded
+// frame. They are blended with the film grain noise and written to
+// |dest_plane_y| and |dest_plane_uv| as final output for display.
+// source_plane_* and dest_plane_* may point to the same buffer, in which case
+// the film grain noise is added in place.
+// |scaling_lut_y| and |scaling_lut| represent a piecewise linear mapping from
+// the frame's raw pixel value to a scaling factor for the noise sample.
+// |scaling_shift| is applied as a right shift after scaling, so that scaling
+// down is possible. It is found in FilmGrainParams, but supplied directly to
+// BlendNoiseWithImageLumaFunc because it's the only member used.
+// The dest plane may point to the source plane, depending on the value of
+// frame_header.show_existing_frame. |noise_image_ptr| and scaling_lut.* do not
+// alias other arguments.
+using BlendNoiseWithImageLumaFunc = void (*)(
+ const void* noise_image_ptr, int min_value, int max_value,
+ int scaling_shift, int width, int height, int start_height,
+ const int16_t* scaling_lut_y, const void* source_plane_y,
+ ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y);
+
+using BlendNoiseWithImageChromaFunc = void (*)(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_value, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y, const int16_t* scaling_lut,
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv);
+
+using BlendNoiseWithImageChromaFuncs =
+ BlendNoiseWithImageChromaFunc[/*chroma_scaling_from_luma*/ 2];
+
+//------------------------------------------------------------------------------
+
+struct FilmGrainFuncs {
+ LumaAutoRegressionFuncs luma_auto_regression;
+ ChromaAutoRegressionFuncs chroma_auto_regression;
+ ConstructNoiseStripesFuncs construct_noise_stripes;
+ ConstructNoiseImageOverlapFunc construct_noise_image_overlap;
+ InitializeScalingLutFunc initialize_scaling_lut;
+ BlendNoiseWithImageLumaFunc blend_noise_luma;
+ BlendNoiseWithImageChromaFuncs blend_noise_chroma;
+};
+
+// Motion field projection function signature. Section 7.9.
+// |reference_info| provides reference information for motion field projection.
+// |reference_to_current_with_sign| is the precalculated reference frame id
+// distance from current frame.
+// |dst_sign| is -1 for LAST_FRAME and LAST2_FRAME, or 0 (1 in spec) for others.
+// |y8_start| and |y8_end| are the start and end 8x8 rows of the current tile.
+// |x8_start| and |x8_end| are the start and end 8x8 columns of the current
+// tile.
+// |motion_field| is the output which saves the projected motion field
+// information.
+// Note: Only the entry from the 8-bit Dsp table is used as this function is
+// bitdepth agnostic.
+using MotionFieldProjectionKernelFunc = void (*)(
+ const ReferenceInfo& reference_info, int reference_to_current_with_sign,
+ int dst_sign, int y8_start, int y8_end, int x8_start, int x8_end,
+ TemporalMotionField* motion_field);
+
+// Compound temporal motion vector projection function signature.
+// Section 7.9.3 and 7.10.2.10.
+// |temporal_mvs| is the aligned set of temporal reference motion vectors.
+// |temporal_reference_offsets| specifies the number of frames covered by the
+// original motion vector.
+// |reference_offsets| specifies the number of frames to be covered by the
+// projected motion vector.
+// |count| is the number of the temporal motion vectors.
+// |candidate_mvs| is the aligned set of projected motion vectors.
+// The pointer arguments do not alias one another.
+// Note: Only the entry from the 8-bit Dsp table is used as this function is
+// bitdepth agnostic.
+using MvProjectionCompoundFunc = void (*)(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], int count,
+ CompoundMotionVector* candidate_mvs);
+
+// Single temporal motion vector projection function signature.
+// Section 7.9.3 and 7.10.2.10.
+// |temporal_mvs| is the aligned set of temporal reference motion vectors.
+// |temporal_reference_offsets| specifies the number of frames covered by the
+// original motion vector.
+// |reference_offset| specifies the number of frames to be covered by the
+// projected motion vector.
+// |count| is the number of the temporal motion vectors.
+// |candidate_mvs| is the aligned set of projected motion vectors.
+// The pointer arguments do not alias one another.
+// Note: Only the entry from the 8-bit Dsp table is used as this function is
+// bitdepth agnostic.
+using MvProjectionSingleFunc = void (*)(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ int reference_offset, int count, MotionVector* candidate_mvs);
+
+struct Dsp {
+ AverageBlendFunc average_blend;
+ CdefDirectionFunc cdef_direction;
+ CdefFilteringFuncs cdef_filters;
+ CflIntraPredictorFuncs cfl_intra_predictors;
+ CflSubsamplerFuncs cfl_subsamplers;
+ ConvolveFuncs convolve;
+ ConvolveScaleFuncs convolve_scale;
+ DirectionalIntraPredictorZone1Func directional_intra_predictor_zone1;
+ DirectionalIntraPredictorZone2Func directional_intra_predictor_zone2;
+ DirectionalIntraPredictorZone3Func directional_intra_predictor_zone3;
+ DistanceWeightedBlendFunc distance_weighted_blend;
+ FilmGrainFuncs film_grain;
+ FilterIntraPredictorFunc filter_intra_predictor;
+ InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp;
+ IntraEdgeFilterFunc intra_edge_filter;
+ IntraEdgeUpsamplerFunc intra_edge_upsampler;
+ IntraPredictorFuncs intra_predictors;
+ InverseTransformAddFuncs inverse_transforms;
+ LoopFilterFuncs loop_filters;
+ LoopRestorationFuncs loop_restorations;
+ MaskBlendFuncs mask_blend;
+ MotionFieldProjectionKernelFunc motion_field_projection_kernel;
+ MvProjectionCompoundFunc mv_projection_compound[3];
+ MvProjectionSingleFunc mv_projection_single[3];
+ ObmcBlendFuncs obmc_blend;
+ SuperResCoefficientsFunc super_res_coefficients;
+ SuperResFunc super_res;
+ WarpCompoundFunc warp_compound;
+ WarpFunc warp;
+ WeightMaskFuncs weight_mask;
+};
+
+// Initializes function pointers based on build config and runtime
+// environment. Must be called once before first use. This function is
+// thread-safe.
+void DspInit();
+
+// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
+// exist.
+const Dsp* GetDspTable(int bitdepth);
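+
+// A minimal usage sketch (argument values are illustrative):
+//   DspInit();
+//   const Dsp* const dsp = GetDspTable(/*bitdepth=*/8);
+//   if (dsp == nullptr) return;  // Bitdepth not supported by this build.
+//   dsp->average_blend(prediction_0, prediction_1, width, height, dest,
+//                      dest_stride);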
+
+} // namespace dsp
+
+namespace dsp_internal {
+
+// Visual Studio builds don't have a way to detect SSE4_1. Only exclude the C
+// functions if /arch:AVX2 is used across all sources.
+#if !LIBGAV1_TARGETING_AVX2 && \
+ (defined(_MSC_VER) || (defined(_M_IX86) || defined(_M_X64)))
+#undef LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 1
+#endif
+
+// Returns true if a more highly optimized version of |func| is not defined for
+// the associated bitdepth or if it is forcibly enabled with
+// LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS. The define checked for |func| corresponds
+// to the LIBGAV1_Dsp<bitdepth>bpp_|func| define in the header file associated
+// with the module.
+// |func| is one of:
+// - FunctionName, e.g., SelfGuidedFilter.
+// - [sub-table-index1][...-indexN] e.g.,
+// TransformSize4x4_IntraPredictorDc. The indices correspond to enum values
+// used as lookups with leading 'k' removed.
+//
+// NEON support is the only extension available for ARM and it is always
+// required. Because of this restriction DSP_ENABLED_8BPP_NEON(func) is always
+// true and can be omitted.
+#define DSP_ENABLED_8BPP_AVX2(func) \
+ (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_AVX2)
+#define DSP_ENABLED_10BPP_AVX2(func) \
+ (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_AVX2)
+#define DSP_ENABLED_8BPP_SSE4_1(func) \
+ (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_SSE4_1)
+#define DSP_ENABLED_10BPP_SSE4_1(func) \
+ (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_SSE4_1)
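+
+// A typical guard in a dsp/*Init() function looks like the following sketch,
+// where AverageBlend_SSE4_1 is an illustrative function name:
+//
+//   #if DSP_ENABLED_8BPP_SSE4_1(AverageBlend)
+//     dsp->average_blend = AverageBlend_SSE4_1;
+//   #endif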
+
+// Initializes C-only function pointers. Note some entries may be set to
+// nullptr if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is not defined. This is meant
+// for use in tests only; it is not thread-safe.
+void DspInit_C();
+
+// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
+// exist. This version is meant for use by test or dsp/*Init() functions only.
+dsp::Dsp* GetWritableDspTable(int bitdepth);
+
+} // namespace dsp_internal
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_DSP_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/dsp.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+#include "absl/strings/str_cat.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#include "tests/utils.h"
+#endif
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Maps 1D transform to the maximum valid size for the corresponding transform.
+constexpr int kMaxTransform1dSize[kNumTransform1ds] = {
+ kTransform1dSize64, // Dct.
+ kTransform1dSize16, // Adst.
+ kTransform1dSize32, // Identity.
+ kTransform1dSize4, // Wht.
+};
+
+void CheckTables(bool c_only) {
+#if LIBGAV1_MAX_BITDEPTH == 12
+ static constexpr int kBitdepths[] = {kBitdepth8, kBitdepth10, kBitdepth12};
+#elif LIBGAV1_MAX_BITDEPTH >= 10
+ static constexpr int kBitdepths[] = {kBitdepth8, kBitdepth10};
+#else
+ static constexpr int kBitdepths[] = {kBitdepth8};
+#endif
+
+ for (const auto& bitdepth : kBitdepths) {
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ SCOPED_TRACE(absl::StrCat("bitdepth: ", bitdepth));
+ for (int i = 0; i < kNumTransformSizes; ++i) {
+ for (int j = 0; j < kNumIntraPredictors; ++j) {
+ EXPECT_NE(dsp->intra_predictors[i][j], nullptr)
+ << "index [" << i << "][" << j << "]";
+ }
+ }
+ EXPECT_NE(dsp->directional_intra_predictor_zone1, nullptr);
+ EXPECT_NE(dsp->directional_intra_predictor_zone2, nullptr);
+ EXPECT_NE(dsp->directional_intra_predictor_zone3, nullptr);
+ EXPECT_NE(dsp->filter_intra_predictor, nullptr);
+ for (int i = 0; i < kNumTransformSizes; ++i) {
+ if (std::max(kTransformWidth[i], kTransformHeight[i]) == 64) {
+ EXPECT_EQ(dsp->cfl_intra_predictors[i], nullptr)
+ << "index [" << i << "]";
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ EXPECT_EQ(dsp->cfl_subsamplers[i][j], nullptr)
+ << "index [" << i << "][" << j << "]";
+ }
+ } else {
+ EXPECT_NE(dsp->cfl_intra_predictors[i], nullptr)
+ << "index [" << i << "]";
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ EXPECT_NE(dsp->cfl_subsamplers[i][j], nullptr)
+ << "index [" << i << "][" << j << "]";
+ }
+ }
+ }
+ EXPECT_NE(dsp->intra_edge_filter, nullptr);
+ EXPECT_NE(dsp->intra_edge_upsampler, nullptr);
+ for (int i = 0; i < kNumTransform1ds; ++i) {
+ for (int j = 0; j < kNumTransform1dSizes; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ if (j <= kMaxTransform1dSize[i]) {
+ EXPECT_NE(dsp->inverse_transforms[i][j][k], nullptr)
+ << "index [" << i << "][" << j << "][" << k << "]";
+ } else {
+ EXPECT_EQ(dsp->inverse_transforms[i][j][k], nullptr)
+ << "index [" << i << "][" << j << "][" << k << "]";
+ }
+ }
+ }
+ }
+ for (int i = 0; i < kNumLoopFilterSizes; ++i) {
+ for (int j = 0; j < kNumLoopFilterTypes; ++j) {
+ EXPECT_NE(dsp->loop_filters[i][j], nullptr)
+ << "index [" << i << "][" << j << "]";
+ }
+ }
+ for (int i = 0; i < 2; ++i) {
+ EXPECT_NE(dsp->loop_restorations[i], nullptr) << "index [" << i << "]";
+ }
+
+ bool super_res_coefficients_is_nonnull = LIBGAV1_ENABLE_NEON;
+#if LIBGAV1_ENABLE_SSE4_1
+ const uint32_t cpu_features = GetCpuInfo();
+ super_res_coefficients_is_nonnull = (cpu_features & kSSE4_1) != 0;
+#endif
+ if (c_only || bitdepth == kBitdepth12) {
+ super_res_coefficients_is_nonnull = false;
+ }
+ if (super_res_coefficients_is_nonnull) {
+ EXPECT_NE(dsp->super_res_coefficients, nullptr);
+ } else {
+ EXPECT_EQ(dsp->super_res_coefficients, nullptr);
+ }
+
+ EXPECT_NE(dsp->super_res, nullptr);
+ EXPECT_NE(dsp->cdef_direction, nullptr);
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ EXPECT_NE(dsp->cdef_filters[i][j], nullptr)
+ << "index [" << i << "][" << j << "]";
+ }
+ }
+ for (auto convolve_func : dsp->convolve_scale) {
+ EXPECT_NE(convolve_func, nullptr);
+ }
+ for (int j = 0; j < 2; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ for (int l = 0; l < 2; ++l) {
+ for (int m = 0; m < 2; ++m) {
+ if (j == 1 && k == 1) {
+ EXPECT_EQ(dsp->convolve[j][k][l][m], nullptr);
+ } else {
+ EXPECT_NE(dsp->convolve[j][k][l][m], nullptr);
+ }
+ }
+ }
+ }
+ }
+ for (const auto& m : dsp->mask_blend) {
+ for (int i = 0; i < 2; ++i) {
+ if (i == 0 || bitdepth >= 10) {
+ EXPECT_NE(m[i], nullptr);
+ } else {
+ EXPECT_EQ(m[i], nullptr);
+ }
+ }
+ }
+ for (const auto& m : dsp->inter_intra_mask_blend_8bpp) {
+ if (bitdepth == 8) {
+ EXPECT_NE(m, nullptr);
+ } else {
+ EXPECT_EQ(m, nullptr);
+ }
+ }
+ for (int i = kBlock4x4; i < kMaxBlockSizes; ++i) {
+ const int width_index = k4x4WidthLog2[i] - 1;
+ const int height_index = k4x4HeightLog2[i] - 1;
+ // Only block sizes >= 8x8 are handled with this function.
+ if (width_index < 0 || height_index < 0) continue;
+
+ for (size_t j = 0; j < 2; ++j) {
+ EXPECT_NE(dsp->weight_mask[width_index][height_index][j], nullptr)
+ << ToString(static_cast<BlockSize>(i)) << " index [" << width_index
+ << "]"
+ << "[" << height_index << "][" << j << "]";
+ }
+ }
+
+ EXPECT_NE(dsp->average_blend, nullptr);
+ EXPECT_NE(dsp->distance_weighted_blend, nullptr);
+ for (int i = 0; i < kNumObmcDirections; ++i) {
+ EXPECT_NE(dsp->obmc_blend[i], nullptr)
+ << "index [" << ToString(static_cast<ObmcDirection>(i)) << "]";
+ }
+ EXPECT_NE(dsp->warp, nullptr);
+ EXPECT_NE(dsp->warp_compound, nullptr);
+
+ for (int i = 0; i < kNumAutoRegressionLags - 1; ++i) {
+ EXPECT_NE(dsp->film_grain.luma_auto_regression[i], nullptr)
+ << "index [" << i << "]";
+ }
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < kNumAutoRegressionLags; ++j) {
+ if (i == 0 && j == 0) {
+ EXPECT_EQ(dsp->film_grain.chroma_auto_regression[i][j], nullptr)
+ << " index [" << i << "]"
+ << "[" << j << "]";
+ } else {
+ EXPECT_NE(dsp->film_grain.chroma_auto_regression[i][j], nullptr)
+ << " index [" << i << "]"
+ << "[" << j << "]";
+ }
+ }
+ EXPECT_NE(dsp->film_grain.construct_noise_stripes[i], nullptr)
+ << "index [" << i << "]";
+ EXPECT_NE(dsp->film_grain.blend_noise_chroma[i], nullptr)
+ << "index [" << i << "]";
+ }
+ EXPECT_NE(dsp->film_grain.construct_noise_image_overlap, nullptr);
+ EXPECT_NE(dsp->film_grain.initialize_scaling_lut, nullptr);
+ EXPECT_NE(dsp->film_grain.blend_noise_luma, nullptr);
+
+ if (bitdepth == 8) {
+ EXPECT_NE(dsp->motion_field_projection_kernel, nullptr);
+ EXPECT_NE(dsp->mv_projection_compound[0], nullptr);
+ EXPECT_NE(dsp->mv_projection_compound[1], nullptr);
+ EXPECT_NE(dsp->mv_projection_compound[2], nullptr);
+ EXPECT_NE(dsp->mv_projection_single[0], nullptr);
+ EXPECT_NE(dsp->mv_projection_single[1], nullptr);
+ EXPECT_NE(dsp->mv_projection_single[2], nullptr);
+ } else {
+ EXPECT_EQ(dsp->motion_field_projection_kernel, nullptr);
+ EXPECT_EQ(dsp->mv_projection_compound[0], nullptr);
+ EXPECT_EQ(dsp->mv_projection_compound[1], nullptr);
+ EXPECT_EQ(dsp->mv_projection_compound[2], nullptr);
+ EXPECT_EQ(dsp->mv_projection_single[0], nullptr);
+ EXPECT_EQ(dsp->mv_projection_single[1], nullptr);
+ EXPECT_EQ(dsp->mv_projection_single[2], nullptr);
+ }
+ }
+}
+
+TEST(Dsp, TablesArePopulated) {
+ DspInit();
+ CheckTables(/*c_only=*/false);
+}
+
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+TEST(Dsp, TablesArePopulatedCOnly) {
+ test_utils::ResetDspTable(kBitdepth8);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ test_utils::ResetDspTable(kBitdepth10);
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ test_utils::ResetDspTable(kBitdepth12);
+#endif
+ dsp_internal::DspInit_C();
+ CheckTables(/*c_only=*/true);
+}
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+TEST(Dsp, GetDspTable) {
+ EXPECT_EQ(GetDspTable(1), nullptr);
+ EXPECT_NE(GetDspTable(kBitdepth8), nullptr);
+ EXPECT_EQ(dsp_internal::GetWritableDspTable(1), nullptr);
+ EXPECT_NE(dsp_internal::GetWritableDspTable(kBitdepth8), nullptr);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ EXPECT_NE(GetDspTable(kBitdepth10), nullptr);
+ EXPECT_NE(dsp_internal::GetWritableDspTable(kBitdepth10), nullptr);
+#else
+ EXPECT_EQ(GetDspTable(kBitdepth10), nullptr);
+ EXPECT_EQ(dsp_internal::GetWritableDspTable(kBitdepth10), nullptr);
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ EXPECT_NE(GetDspTable(kBitdepth12), nullptr);
+ EXPECT_NE(dsp_internal::GetWritableDspTable(kBitdepth12), nullptr);
+#else
+ EXPECT_EQ(GetDspTable(kBitdepth12), nullptr);
+ EXPECT_EQ(dsp_internal::GetWritableDspTable(kBitdepth12), nullptr);
+#endif
+}
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+template <int bitdepth>
+void InitializeScalingLookupTable_C(int num_points, const uint8_t point_value[],
+ const uint8_t point_scaling[],
+ int16_t* scaling_lut,
+ const int scaling_lut_length) {
+ if (num_points == 0) {
+ memset(scaling_lut, 0, sizeof(scaling_lut[0]) * scaling_lut_length);
+ return;
+ }
+ constexpr int index_shift = (bitdepth == kBitdepth10) ? 2 : 0;
+ static_assert(sizeof(scaling_lut[0]) == 2, "");
+ Memset(scaling_lut, point_scaling[0],
+ std::max(static_cast<int>(point_value[0]), 1) << index_shift);
+ for (int i = 0; i < num_points - 1; ++i) {
+ const int delta_y = point_scaling[i + 1] - point_scaling[i];
+ const int delta_x = point_value[i + 1] - point_value[i];
+ const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
+ for (int x = 0; x < delta_x; ++x) {
+ const int v = point_scaling[i] + ((x * delta + 32768) >> 16);
+ assert(v >= 0 && v <= UINT8_MAX);
+ const int lut_index = (point_value[i] + x) << index_shift;
+ scaling_lut[lut_index] = v;
+ }
+ }
+ const int16_t last_point_value = point_value[num_points - 1];
+ const int x_base = last_point_value << index_shift;
+ Memset(&scaling_lut[x_base], point_scaling[num_points - 1],
+ scaling_lut_length - x_base);
+ // Fill in the gaps.
+ if (bitdepth == kBitdepth10) {
+ for (int x = 4; x < x_base + 4; x += 4) {
+ const int start = scaling_lut[x - 4];
+ const int end = scaling_lut[x];
+ const int delta = end - start;
+ scaling_lut[x - 3] = start + RightShiftWithRounding(delta, 2);
+ scaling_lut[x - 2] = start + RightShiftWithRounding(2 * delta, 2);
+ scaling_lut[x - 1] = start + RightShiftWithRounding(3 * delta, 2);
+ }
+ }
+}
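+
+// A worked example of the interpolation above, for kBitdepth8 (index_shift ==
+// 0) with hypothetical inputs num_points == 2, point_value[] = {0, 64} and
+// point_scaling[] = {0, 128}: delta_x == 64, delta_y == 128, and
+//   delta == 128 * ((65536 + 32) / 64) == 128 * 1024 == 131072,
+// so scaling_lut[x] == (x * 131072 + 32768) >> 16 == 2 * x for x in [0, 64),
+// and entries from index 64 onward are filled with point_scaling[1] == 128.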
+
+// Section 7.18.3.5.
+template <int bitdepth>
+int ScaleLut(const int16_t* scaling_lut, int index) {
+ if (bitdepth <= kBitdepth10) {
+ assert(index < kScalingLookupTableSize << (bitdepth - kBitdepth8));
+ return scaling_lut[index];
+ }
+ // Performs a piecewise linear interpolation into the scaling table.
+ const int shift = bitdepth - kBitdepth8;
+ const int quotient = index >> shift;
+ const int remainder = index - (quotient << shift);
+ assert(quotient + 1 < kScalingLookupTableSize);
+ const int start = scaling_lut[quotient];
+ const int end = scaling_lut[quotient + 1];
+ return start + RightShiftWithRounding((end - start) * remainder, shift);
+}
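+
+// A worked example of the 12bpp path above: for index == 100, shift == 4, so
+// quotient == 6 and remainder == 4. With hypothetical table entries
+// scaling_lut[6] == 12 and scaling_lut[7] == 20, the result is
+//   12 + RightShiftWithRounding((20 - 12) * 4, 4) == 12 + 2 == 14,
+// i.e. one quarter of the way from entry 6 toward entry 7.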
+
+// Applies an auto-regressive filter to the white noise in luma_grain.
+template <int bitdepth, typename GrainType>
+void ApplyAutoRegressiveFilterToLumaGrain_C(const FilmGrainParams& params,
+ void* luma_grain_buffer) {
+ auto* luma_grain = static_cast<GrainType*>(luma_grain_buffer);
+ const int grain_min = GetGrainMin<bitdepth>();
+ const int grain_max = GetGrainMax<bitdepth>();
+ const int auto_regression_coeff_lag = params.auto_regression_coeff_lag;
+ assert(auto_regression_coeff_lag > 0 && auto_regression_coeff_lag <= 3);
+ // A pictorial representation of the auto-regressive filter for various values
+ // of auto_regression_coeff_lag. The letter 'O' represents the current sample.
+ // (The filter always operates on the current sample with filter
+ // coefficient 1.) The letters 'X' represent the neighboring samples that the
+ // filter operates on.
+ //
+ // auto_regression_coeff_lag == 3:
+ // X X X X X X X
+ // X X X X X X X
+ // X X X X X X X
+ // X X X O
+ // auto_regression_coeff_lag == 2:
+ // X X X X X
+ // X X X X X
+ // X X O
+ // auto_regression_coeff_lag == 1:
+ // X X X
+ // X O
+ // auto_regression_coeff_lag == 0:
+ // O
+ //
+ // Note that if auto_regression_coeff_lag is 0, the filter is the identity
+ // filter and therefore can be skipped. This implementation assumes it is not
+ // called in that case.
+ const int shift = params.auto_regression_shift;
+ for (int y = kAutoRegressionBorder; y < kLumaHeight; ++y) {
+ for (int x = kAutoRegressionBorder; x < kLumaWidth - kAutoRegressionBorder;
+ ++x) {
+ int sum = 0;
+ int pos = 0;
+ int delta_row = -auto_regression_coeff_lag;
+ // The last iteration (delta_row == 0) is shorter and is handled
+ // separately.
+ do {
+ int delta_column = -auto_regression_coeff_lag;
+ do {
+ const int coeff = params.auto_regression_coeff_y[pos];
+ sum += luma_grain[(y + delta_row) * kLumaWidth + (x + delta_column)] *
+ coeff;
+ ++pos;
+ } while (++delta_column <= auto_regression_coeff_lag);
+ } while (++delta_row < 0);
+ // Last iteration: delta_row == 0.
+ {
+ int delta_column = -auto_regression_coeff_lag;
+ do {
+ const int coeff = params.auto_regression_coeff_y[pos];
+ sum += luma_grain[y * kLumaWidth + (x + delta_column)] * coeff;
+ ++pos;
+ } while (++delta_column < 0);
+ }
+ luma_grain[y * kLumaWidth + x] = Clip3(
+ luma_grain[y * kLumaWidth + x] + RightShiftWithRounding(sum, shift),
+ grain_min, grain_max);
+ }
+ }
+}
+
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
+ bool use_luma>
+void ApplyAutoRegressiveFilterToChromaGrains_C(
+ const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT luma_grain_buffer, int subsampling_x,
+ int subsampling_y, void* LIBGAV1_RESTRICT u_grain_buffer,
+ void* LIBGAV1_RESTRICT v_grain_buffer) {
+ static_assert(
+ auto_regression_coeff_lag >= 0 && auto_regression_coeff_lag <= 3,
+ "Unsupported autoregression lag for chroma.");
+ const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
+ const int grain_min = GetGrainMin<bitdepth>();
+ const int grain_max = GetGrainMax<bitdepth>();
+ auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
+ auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
+ const int shift = params.auto_regression_shift;
+ const int chroma_height =
+ (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
+ const int chroma_width =
+ (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+ for (int y = kAutoRegressionBorder; y < chroma_height; ++y) {
+ const int luma_y =
+ ((y - kAutoRegressionBorder) << subsampling_y) + kAutoRegressionBorder;
+ for (int x = kAutoRegressionBorder;
+ x < chroma_width - kAutoRegressionBorder; ++x) {
+ int sum_u = 0;
+ int sum_v = 0;
+ int pos = 0;
+ int delta_row = -auto_regression_coeff_lag;
+ do {
+ int delta_column = -auto_regression_coeff_lag;
+ do {
+ if (delta_row == 0 && delta_column == 0) {
+ break;
+ }
+ const int coeff_u = params.auto_regression_coeff_u[pos];
+ const int coeff_v = params.auto_regression_coeff_v[pos];
+ sum_u +=
+ u_grain[(y + delta_row) * chroma_width + (x + delta_column)] *
+ coeff_u;
+ sum_v +=
+ v_grain[(y + delta_row) * chroma_width + (x + delta_column)] *
+ coeff_v;
+ ++pos;
+ } while (++delta_column <= auto_regression_coeff_lag);
+ } while (++delta_row <= 0);
+ if (use_luma) {
+ int luma = 0;
+ const int luma_x = ((x - kAutoRegressionBorder) << subsampling_x) +
+ kAutoRegressionBorder;
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ luma += luma_grain[(luma_y + i) * kLumaWidth + (luma_x + j)];
+ } while (++j <= subsampling_x);
+ } while (++i <= subsampling_y);
+ luma = SubsampledValue(luma, subsampling_x + subsampling_y);
+ const int coeff_u = params.auto_regression_coeff_u[pos];
+ const int coeff_v = params.auto_regression_coeff_v[pos];
+ sum_u += luma * coeff_u;
+ sum_v += luma * coeff_v;
+ }
+ u_grain[y * chroma_width + x] = Clip3(
+ u_grain[y * chroma_width + x] + RightShiftWithRounding(sum_u, shift),
+ grain_min, grain_max);
+ v_grain[y * chroma_width + x] = Clip3(
+ v_grain[y * chroma_width + x] + RightShiftWithRounding(sum_v, shift),
+ grain_min, grain_max);
+ }
+ }
+}
+
+// This implementation is for the condition overlap_flag == false.
+template <int bitdepth, typename GrainType>
+void ConstructNoiseStripes_C(const void* LIBGAV1_RESTRICT grain_buffer,
+ int grain_seed, int width, int height,
+ int subsampling_x, int subsampling_y,
+ void* LIBGAV1_RESTRICT noise_stripes_buffer) {
+ auto* noise_stripes =
+ static_cast<Array2DView<GrainType>*>(noise_stripes_buffer);
+ const auto* grain = static_cast<const GrainType*>(grain_buffer);
+ const int half_width = DivideBy2(width + 1);
+ const int half_height = DivideBy2(height + 1);
+ assert(half_width > 0);
+ assert(half_height > 0);
+ static_assert(kLumaWidth == kMaxChromaWidth,
+ "kLumaWidth width should be equal to kMaxChromaWidth");
+ const int grain_width =
+ (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ constexpr int kNoiseStripeHeight = 34;
+ int luma_num = 0;
+ int y = 0;
+ do {
+ GrainType* const noise_stripe = (*noise_stripes)[luma_num];
+ uint16_t seed = grain_seed;
+ seed ^= ((luma_num * 37 + 178) & 255) << 8;
+ seed ^= ((luma_num * 173 + 105) & 255);
+ int x = 0;
+ do {
+ const int rand = GetFilmGrainRandomNumber(8, &seed);
+ const int offset_x = rand >> 4;
+ const int offset_y = rand & 15;
+ const int plane_offset_x =
+ (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+ const int plane_offset_y =
+ (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+ int i = 0;
+ do {
+ // Section 7.18.3.5 says:
+ // noiseStripe[ lumaNum ][ 0 ] is 34 samples high and w samples
+ // wide (a few additional samples across are actually written to
+ // the array, but these are never read) ...
+ //
+ // Note: The warning in the parentheses also applies to
+ // noiseStripe[ lumaNum ][ 1 ] and noiseStripe[ lumaNum ][ 2 ].
+ //
+ // Writes beyond the width of each row could happen below. To
+ // prevent those writes, we clip the number of pixels to copy against
+ // the remaining width.
+ const int copy_size =
+ std::min(kNoiseStripeHeight >> subsampling_x,
+ plane_width - (x << (1 - subsampling_x)));
+ memcpy(&noise_stripe[i * plane_width + (x << (1 - subsampling_x))],
+ &grain[(plane_offset_y + i) * grain_width + plane_offset_x],
+ copy_size * sizeof(noise_stripe[0]));
+ } while (++i < (kNoiseStripeHeight >> subsampling_y));
+ x += 16;
+ } while (x < half_width);
+
+ ++luma_num;
+ y += 16;
+ } while (y < half_height);
+}
+
+// This implementation is for the condition overlap_flag == true.
+template <int bitdepth, typename GrainType>
+void ConstructNoiseStripesWithOverlap_C(
+ const void* LIBGAV1_RESTRICT grain_buffer, int grain_seed, int width,
+ int height, int subsampling_x, int subsampling_y,
+ void* LIBGAV1_RESTRICT noise_stripes_buffer) {
+ auto* noise_stripes =
+ static_cast<Array2DView<GrainType>*>(noise_stripes_buffer);
+ const auto* grain = static_cast<const GrainType*>(grain_buffer);
+ const int half_width = DivideBy2(width + 1);
+ const int half_height = DivideBy2(height + 1);
+ assert(half_width > 0);
+ assert(half_height > 0);
+ static_assert(kLumaWidth == kMaxChromaWidth,
+ "kLumaWidth width should be equal to kMaxChromaWidth");
+ const int grain_width =
+ (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ constexpr int kNoiseStripeHeight = 34;
+ int luma_num = 0;
+ int y = 0;
+ do {
+ GrainType* const noise_stripe = (*noise_stripes)[luma_num];
+ uint16_t seed = grain_seed;
+ seed ^= ((luma_num * 37 + 178) & 255) << 8;
+ seed ^= ((luma_num * 173 + 105) & 255);
+ // Begin special iteration for x == 0.
+ const int rand = GetFilmGrainRandomNumber(8, &seed);
+ const int offset_x = rand >> 4;
+ const int offset_y = rand & 15;
+ const int plane_offset_x =
+ (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+ const int plane_offset_y =
+ (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+ // The overlap computation only occurs when x > 0, so it is omitted here.
+ int i = 0;
+ do {
+ const int copy_size =
+ std::min(kNoiseStripeHeight >> subsampling_x, plane_width);
+ memcpy(&noise_stripe[i * plane_width],
+ &grain[(plane_offset_y + i) * grain_width + plane_offset_x],
+ copy_size * sizeof(noise_stripe[0]));
+ } while (++i < (kNoiseStripeHeight >> subsampling_y));
+ // End special iteration for x == 0.
+ for (int x = 16; x < half_width; x += 16) {
+ const int rand = GetFilmGrainRandomNumber(8, &seed);
+ const int offset_x = rand >> 4;
+ const int offset_y = rand & 15;
+ const int plane_offset_x =
+ (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+ const int plane_offset_y =
+ (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+ int i = 0;
+ do {
+ int j = 0;
+ int grain_sample =
+ grain[(plane_offset_y + i) * grain_width + plane_offset_x];
+ // The first pixel(s) of each segment of the noise_stripe are subject to
+ // the "overlap" computation.
+ if (subsampling_x == 0) {
+ // Corresponds to the line in the spec:
+ // if (j < 2 && x > 0)
+ // j = 0
+ int old = noise_stripe[i * plane_width + x * 2];
+ grain_sample = old * 27 + grain_sample * 17;
+ grain_sample =
+ Clip3(RightShiftWithRounding(grain_sample, 5),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+ noise_stripe[i * plane_width + x * 2] = grain_sample;
+
+// This check prevents the j = 1 iteration below from writing beyond the
+// row width. The continue applies to the i-loop.
+ if (x * 2 + 1 >= plane_width) continue;
+ // j = 1
+ grain_sample =
+ grain[(plane_offset_y + i) * grain_width + plane_offset_x + 1];
+ old = noise_stripe[i * plane_width + x * 2 + 1];
+ grain_sample = old * 17 + grain_sample * 27;
+ grain_sample =
+ Clip3(RightShiftWithRounding(grain_sample, 5),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+ noise_stripe[i * plane_width + x * 2 + 1] = grain_sample;
+ j = 2;
+ } else {
+ // Corresponds to the line in the spec:
+ // if (j == 0 && x > 0)
+ const int old = noise_stripe[i * plane_width + x];
+ grain_sample = old * 23 + grain_sample * 22;
+ grain_sample =
+ Clip3(RightShiftWithRounding(grain_sample, 5),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+ noise_stripe[i * plane_width + x] = grain_sample;
+ j = 1;
+ }
+ // The following covers the rest of the loop over j as described in the
+ // spec.
+ //
+ // Section 7.18.3.5 says:
+ // noiseStripe[ lumaNum ][ 0 ] is 34 samples high and w samples
+ // wide (a few additional samples across are actually written to
+ // the array, but these are never read) ...
+ //
+ // Note: The warning in the parentheses also applies to
+ // noiseStripe[ lumaNum ][ 1 ] and noiseStripe[ lumaNum ][ 2 ].
+ //
+ // Writes beyond the width of each row could happen below. To
+ // prevent those writes, we clip the number of pixels to copy against
+ // the remaining width.
+ const int copy_size =
+ std::min(kNoiseStripeHeight >> subsampling_x,
+ plane_width - (x << (1 - subsampling_x))) -
+ j;
+ memcpy(&noise_stripe[i * plane_width + (x << (1 - subsampling_x)) + j],
+ &grain[(plane_offset_y + i) * grain_width + plane_offset_x + j],
+ copy_size * sizeof(noise_stripe[0]));
+ } while (++i < (kNoiseStripeHeight >> subsampling_y));
+ }
+
+ ++luma_num;
+ y += 16;
+ } while (y < half_height);
+}
+
+template <int bitdepth, typename GrainType>
+inline void WriteOverlapLine_C(
+ const GrainType* LIBGAV1_RESTRICT noise_stripe_row,
+ const GrainType* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
+ int grain_coeff, int old_coeff,
+ GrainType* LIBGAV1_RESTRICT noise_image_row) {
+ int x = 0;
+ do {
+ int grain = noise_stripe_row[x];
+ const int old = noise_stripe_row_prev[x];
+ grain = old * old_coeff + grain * grain_coeff;
+ grain = Clip3(RightShiftWithRounding(grain, 5), GetGrainMin<bitdepth>(),
+ GetGrainMax<bitdepth>());
+ noise_image_row[x] = grain;
+ } while (++x < plane_width);
+}
+
+template <int bitdepth, typename GrainType>
+void ConstructNoiseImageOverlap_C(
+ const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
+ int subsampling_x, int subsampling_y,
+ void* LIBGAV1_RESTRICT noise_image_buffer) {
+ const auto* noise_stripes =
+ static_cast<const Array2DView<GrainType>*>(noise_stripes_buffer);
+ auto* noise_image = static_cast<Array2D<GrainType>*>(noise_image_buffer);
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ const int plane_height = (height + subsampling_y) >> subsampling_y;
+ const int stripe_height = 32 >> subsampling_y;
+ const int stripe_mask = stripe_height - 1;
+ int y = stripe_height;
+ int luma_num = 1;
+ if (subsampling_y == 0) {
+ // Begin complete stripes section. This is when we are guaranteed to have
+ // two overlap rows in each stripe.
+ for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ // First overlap row.
+ WriteOverlapLine_C<bitdepth>(noise_stripe,
+ &noise_stripe_prev[32 * plane_width],
+ plane_width, 17, 27, (*noise_image)[y]);
+ // Second overlap row.
+ WriteOverlapLine_C<bitdepth>(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, 27, 17, (*noise_image)[y + 1]);
+ }
+ // End complete stripes section.
+
+ const int remaining_height = plane_height - y;
+ // Either one partial stripe remains (remaining_height > 0),
+ // OR image is less than one stripe high (remaining_height < 0),
+ // OR all stripes are completed (remaining_height == 0).
+ if (remaining_height <= 0) {
+ return;
+ }
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine_C<bitdepth>(noise_stripe,
+ &noise_stripe_prev[32 * plane_width],
+ plane_width, 17, 27, (*noise_image)[y]);
+
+ // Check if second overlap row is in the image.
+ if (remaining_height > 1) {
+ WriteOverlapLine_C<bitdepth>(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, 27, 17, (*noise_image)[y + 1]);
+ }
+ } else { // |subsampling_y| == 1
+ // No special checks needed for partial stripes, because if one exists, the
+ // first and only overlap row is guaranteed to exist.
+ for (; y < plane_height; ++luma_num, y += stripe_height) {
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine_C<bitdepth>(noise_stripe,
+ &noise_stripe_prev[16 * plane_width],
+ plane_width, 22, 23, (*noise_image)[y]);
+ }
+ }
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_C(const void* LIBGAV1_RESTRICT noise_image_ptr,
+ int min_value, int max_luma, int scaling_shift,
+ int width, int height, int start_height,
+ const int16_t* scaling_lut_y,
+ const void* source_plane_y,
+ ptrdiff_t source_stride_y, void* dest_plane_y,
+ ptrdiff_t dest_stride_y) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ auto* out_y = static_cast<Pixel*>(dest_plane_y);
+ dest_stride_y /= sizeof(Pixel);
+
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int orig = in_y[y * source_stride_y + x];
+ int noise = noise_image[kPlaneY][y + start_height][x];
+ noise = RightShiftWithRounding(
+ ScaleLut<bitdepth>(scaling_lut_y, orig) * noise, scaling_shift);
+ out_y[y * dest_stride_y + x] = Clip3(orig + noise, min_value, max_luma);
+ } while (++x < width);
+ } while (++y < height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChroma_C(
+ Plane plane, const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, const int16_t* scaling_lut_uv,
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+ source_stride_uv /= sizeof(Pixel);
+ auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+ dest_stride_uv /= sizeof(Pixel);
+
+ const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+ const int luma_multiplier =
+ (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+ const int multiplier =
+ (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+
+ const int scaling_shift = params.chroma_scaling;
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int luma_x = x << subsampling_x;
+ const int luma_y = y << subsampling_y;
+ const int luma_next_x = std::min(luma_x + 1, width - 1);
+ int average_luma;
+ if (subsampling_x != 0) {
+ average_luma = RightShiftWithRounding(
+ in_y[luma_y * source_stride_y + luma_x] +
+ in_y[luma_y * source_stride_y + luma_next_x],
+ 1);
+ } else {
+ average_luma = in_y[luma_y * source_stride_y + luma_x];
+ }
+ const int orig = in_uv[y * source_stride_uv + x];
+ const int combined = average_luma * luma_multiplier + orig * multiplier;
+ const int merged =
+ Clip3((combined >> 6) + LeftShift(offset, bitdepth - kBitdepth8), 0,
+ (1 << bitdepth) - 1);
+ int noise = noise_image[plane][y + start_height][x];
+ noise = RightShiftWithRounding(
+ ScaleLut<bitdepth>(scaling_lut_uv, merged) * noise, scaling_shift);
+ out_uv[y * dest_stride_uv + x] =
+ Clip3(orig + noise, min_value, max_chroma);
+ } while (++x < chroma_width);
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_C(
+ Plane plane, const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, const int16_t* scaling_lut, const void* source_plane_y,
+ ptrdiff_t source_stride_y, const void* source_plane_uv,
+ ptrdiff_t source_stride_uv, void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+ source_stride_uv /= sizeof(Pixel);
+ auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+ dest_stride_uv /= sizeof(Pixel);
+
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int scaling_shift = params.chroma_scaling;
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int luma_x = x << subsampling_x;
+ const int luma_y = y << subsampling_y;
+ const int luma_next_x = std::min(luma_x + 1, width - 1);
+ int average_luma;
+ if (subsampling_x != 0) {
+ average_luma = RightShiftWithRounding(
+ in_y[luma_y * source_stride_y + luma_x] +
+ in_y[luma_y * source_stride_y + luma_next_x],
+ 1);
+ } else {
+ average_luma = in_y[luma_y * source_stride_y + luma_x];
+ }
+ const int orig_uv = in_uv[y * source_stride_uv + x];
+ int noise_uv = noise_image[plane][y + start_height][x];
+ noise_uv = RightShiftWithRounding(
+ ScaleLut<bitdepth>(scaling_lut, average_luma) * noise_uv,
+ scaling_shift);
+ out_uv[y * dest_stride_uv + x] =
+ Clip3(orig_uv + noise_uv, min_value, max_chroma);
+ } while (++x < chroma_width);
+ } while (++y < chroma_height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // LumaAutoRegressionFunc
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+
+ // ChromaAutoRegressionFunc
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 3, true>;
+
+ // ConstructNoiseStripesFunc
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<kBitdepth8, int8_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<kBitdepth8, int8_t>;
+
+ // ConstructNoiseImageOverlapFunc
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<kBitdepth8, int8_t>;
+
+ // InitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut =
+ InitializeScalingLookupTable_C<kBitdepth8>;
+
+ // BlendNoiseWithImageLumaFunc
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<kBitdepth8, int8_t, uint8_t>;
+
+ // BlendNoiseWithImageChromaFunc
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<kBitdepth8, int8_t, uint8_t>;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<kBitdepth8, int8_t, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainAutoregressionLuma
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainAutoregressionChroma
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseStripes
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<kBitdepth8, int8_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<kBitdepth8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseImageOverlap
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<kBitdepth8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut =
+ InitializeScalingLookupTable_C<kBitdepth8>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<kBitdepth8, int8_t, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<kBitdepth8, int8_t, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<kBitdepth8, int8_t, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+ // LumaAutoRegressionFunc
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+
+ // ChromaAutoRegressionFunc
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 3, true>;
+
+ // ConstructNoiseStripesFunc
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<kBitdepth10, int16_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<kBitdepth10, int16_t>;
+
+ // ConstructNoiseImageOverlapFunc
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<kBitdepth10, int16_t>;
+
+ // InitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut =
+ InitializeScalingLookupTable_C<kBitdepth10>;
+
+ // BlendNoiseWithImageLumaFunc
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<kBitdepth10, int16_t, uint16_t>;
+
+ // BlendNoiseWithImageChromaFunc
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<kBitdepth10, int16_t, uint16_t>;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<kBitdepth10, int16_t, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainAutoregressionLuma
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainAutoregressionChroma
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseStripes
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<kBitdepth10, int16_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<kBitdepth10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseImageOverlap
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<kBitdepth10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut =
+ InitializeScalingLookupTable_C<kBitdepth10>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<kBitdepth10, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<kBitdepth10, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<kBitdepth10, int16_t, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+ // LumaAutoRegressionFunc
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+
+ // ChromaAutoRegressionFunc
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, true>;
+
+ // ConstructNoiseStripesFunc
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<kBitdepth12, int16_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<kBitdepth12, int16_t>;
+
+ // ConstructNoiseImageOverlapFunc
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<kBitdepth12, int16_t>;
+
+ // InitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut =
+ InitializeScalingLookupTable_C<kBitdepth12>;
+
+ // BlendNoiseWithImageLumaFunc
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<kBitdepth12, int16_t, uint16_t>;
+
+ // BlendNoiseWithImageChromaFunc
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<kBitdepth12, int16_t, uint16_t>;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<kBitdepth12, int16_t, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainAutoregressionLuma
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainAutoregressionChroma
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainConstructNoiseStripes
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<kBitdepth12, int16_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<kBitdepth12, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainConstructNoiseImageOverlap
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<kBitdepth12, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainInitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut =
+ InitializeScalingLookupTable_C<kBitdepth12>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseLuma
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<kBitdepth12, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseChroma
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<kBitdepth12, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseChromaWithCfl
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<kBitdepth12, int16_t, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace film_grain
+
+void FilmGrainInit_C() {
+ film_grain::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ film_grain::Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ film_grain::Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_H_
+#define LIBGAV1_SRC_DSP_FILM_GRAIN_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/film_grain_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order is important because each header tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/film_grain_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_FILM_GRAIN_H_
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
+#define LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
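+// The grain samples are signed |bitdepth|-bit values; these two helpers
+// return the bounds of the inclusive range
+// [-(1 << (bitdepth - 1)), (1 << (bitdepth - 1)) - 1].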
+template <int bitdepth>
+int GetGrainMax() {
+ return (1 << (bitdepth - 1)) - 1;
+}
+
+template <int bitdepth>
+int GetGrainMin() {
+ return -(1 << (bitdepth - 1));
+}
+
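+// Advances the 16-bit LFSR state in |seed| (feedback from bits 0, 1, 3 and
+// 12 shifted into bit 15) and returns the top |bits| bits of the new state.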
+inline int GetFilmGrainRandomNumber(int bits, uint16_t* seed) {
+ uint16_t s = *seed;
+ uint16_t bit = (s ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)) & 1;
+ s = (s >> 1) | (bit << 15);
+ *seed = s;
+ return s >> (16 - bits);
+}
+
+enum {
+ kAutoRegressionBorder = 3,
+ // The width of the luma noise array.
+ kLumaWidth = 82,
+ // The height of the luma noise array.
+ kLumaHeight = 73,
+ // The two possible widths of the chroma noise array.
+ kMinChromaWidth = 44,
+ kMaxChromaWidth = 82,
+ // The two possible heights of the chroma noise array.
+ kMinChromaHeight = 38,
+ kMaxChromaHeight = 73,
+  // The standard scaling lookup table maps bytes to bytes, so it only uses
+  // 256 elements, plus one for overflow in 12bpp lookups. The size is scaled
+  // up for 10bpp.
+ kScalingLookupTableSize = 257,
+ // Padding is added to the scaling lookup table to permit overwrites by
+ // InitializeScalingLookupTable_NEON.
+ kScalingLookupTablePadding = 6,
+ // Padding is added to each row of the noise image to permit overreads by
+ // BlendNoiseWithImageLuma_NEON and overwrites by WriteOverlapLine8bpp_NEON.
+ kNoiseImagePadding = 15,
+ // Padding is added to the end of the |noise_stripes_| buffer to permit
+ // overreads by WriteOverlapLine8bpp_NEON.
+ kNoiseStripePadding = 7,
+}; // anonymous enum
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kKernelTaps = 5;
+constexpr int kKernels[3][kKernelTaps] = {
+ {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
+constexpr int kMaxUpsampleSize = 16;
+
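+// Smooths the edge buffer in place using one of three 5-tap kernels selected
+// by |strength| (1, 2 or 3). Each kernel's taps sum to 16, so the rounded
+// right shift by 4 normalizes the filtered sum; buffer[0] is left unchanged
+// and out-of-range taps are clamped to the ends of the edge.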
+template <typename Pixel>
+void IntraEdgeFilter_C(void* buffer, int size, int strength) {
+ assert(strength > 0);
+ Pixel edge[129];
+ memcpy(edge, buffer, sizeof(edge[0]) * size);
+ auto* const dst_buffer = static_cast<Pixel*>(buffer);
+ const int kernel_index = strength - 1;
+ for (int i = 1; i < size; ++i) {
+ int sum = 0;
+ for (int j = 0; j < kKernelTaps; ++j) {
+ const int k = Clip3(i + j - 2, 0, size - 1);
+ sum += kKernels[kernel_index][j] * edge[k];
+ }
+ dst_buffer[i] = RightShiftWithRounding(sum, 4);
+ }
+}
+
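+// Upsamples the edge by 2x in place: even output positions copy the source
+// pixels and odd positions are interpolated with the 4-tap filter
+// (-1, 9, 9, -1) / 16, clipped to the pixel range. The source is padded by
+// replicating its end pixels, and the result starts at buffer[-2].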
+template <int bitdepth, typename Pixel>
+void IntraEdgeUpsampler_C(void* buffer, int size) {
+ assert(size % 4 == 0 && size <= kMaxUpsampleSize);
+ auto* const pixel_buffer = static_cast<Pixel*>(buffer);
+ Pixel temp[kMaxUpsampleSize + 3];
+ temp[0] = temp[1] = pixel_buffer[-1];
+ memcpy(temp + 2, pixel_buffer, sizeof(temp[0]) * size);
+ temp[size + 2] = pixel_buffer[size - 1];
+
+ pixel_buffer[-2] = temp[0];
+ for (int i = 0; i < size; ++i) {
+ const int sum =
+ -temp[i] + (9 * temp[i + 1]) + (9 * temp[i + 2]) - temp[i + 3];
+ pixel_buffer[2 * i - 1] =
+ Clip3(RightShiftWithRounding(sum, 4), 0, (1 << bitdepth) - 1);
+ pixel_buffer[2 * i] = temp[i + 2];
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint8_t>;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeUpsampler
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_IntraEdgeFilter
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_IntraEdgeUpsampler
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_IntraEdgeFilter
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_IntraEdgeUpsampler
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<12, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+void IntraEdgeInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRA_EDGE_H_
+#define LIBGAV1_SRC_DSP_INTRA_EDGE_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intra_edge_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order is important because each header tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intra_edge_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRA_EDGE_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+const char kIntraEdge[] = "IntraEdge";
+const char kIntraEdgeFilterName[] = "Intra Edge Filter";
+const char kIntraEdgeUpsamplerName[] = "Intra Edge Upsampler";
+
+constexpr int kIntraEdgeBufferSize = 144; // see Tile::IntraPrediction.
+constexpr int kIntraEdgeFilterTestMaxSize = 129;
+constexpr int kIntraEdgeFilterTestFixedInput[kIntraEdgeFilterTestMaxSize] = {
+ 159, 208, 54, 136, 205, 124, 125, 165, 164, 63, 171, 143, 210, 236, 253,
+ 233, 139, 113, 66, 211, 133, 61, 91, 123, 187, 76, 110, 172, 61, 103,
+ 239, 147, 247, 120, 18, 106, 180, 159, 208, 54, 136, 205, 124, 125, 165,
+ 164, 63, 171, 143, 210, 236, 253, 233, 139, 113, 66, 211, 133, 61, 91,
+ 123, 187, 76, 110, 172, 61, 103, 239, 147, 247, 120, 18, 106, 180, 159,
+ 208, 54, 136, 205, 124, 125, 165, 164, 63, 171, 143, 210, 236, 253, 233,
+ 139, 113, 66, 211, 133, 61, 91, 123, 187, 76, 110, 172, 61, 103, 239,
+ 147, 247, 120, 18, 106, 180, 159, 208, 54, 136, 205, 124, 125, 165, 164,
+ 63, 171, 143, 210, 236, 253, 233, 139, 113,
+};
+constexpr int kIntraEdgeUpsamplerTestFixedInput[] = {
+ 208, 54, 136, 205, 124, 125, 165, 164, 63,
+ 171, 143, 210, 236, 208, 54, 136, 205};
+
+struct EdgeFilterParams {
+ int size;
+ int strength;
+};
+
+std::ostream& operator<<(std::ostream& os, const EdgeFilterParams& param) {
+ return os << "size: " << param.size << ", strength: " << param.strength;
+}
+
+// Each size is paired with strength 1, 2, and 3.
+// In general, the size is expressible as 2^n+1, but all sizes up to 129 are
+// permissible.
+constexpr EdgeFilterParams kIntraEdgeFilterParamList[] = {
+ {1, 1}, {1, 2}, {1, 3}, {2, 1}, {2, 2}, {2, 3}, {5, 1}, {5, 2},
+ {5, 3}, {9, 1}, {9, 2}, {9, 3}, {17, 1}, {17, 2}, {17, 3}, {33, 1},
+ {33, 2}, {33, 3}, {50, 1}, {50, 2}, {50, 3}, {55, 1}, {55, 2}, {55, 3},
+ {65, 1}, {65, 2}, {65, 3}, {129, 1}, {129, 2}, {129, 3}};
+
+template <int bitdepth, typename Pixel>
+class IntraEdgeFilterTest : public testing::TestWithParam<EdgeFilterParams> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ IntraEdgeFilterTest() = default;
+ IntraEdgeFilterTest(const IntraEdgeFilterTest&) = delete;
+ IntraEdgeFilterTest& operator=(const IntraEdgeFilterTest&) = delete;
+ ~IntraEdgeFilterTest() override = default;
+
+ protected:
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ IntraEdgeInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_intra_edge_filter_ = dsp->intra_edge_filter;
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_intra_edge_filter_ = nullptr;
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ IntraEdgeInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ IntraEdgeInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+#if LIBGAV1_MSAN
+ // Match the behavior of Tile::IntraPrediction to prevent warnings due to
+ // assembly code (safely) overreading to fill a register.
+ memset(buffer_, 0, sizeof(buffer_));
+#endif // LIBGAV1_MSAN
+ cur_intra_edge_filter_ = dsp->intra_edge_filter;
+ }
+
+ void TestFixedValues(const char* digest);
+ void TestRandomValues(int num_runs);
+
+ Pixel buffer_[kIntraEdgeBufferSize];
+ Pixel base_buffer_[kIntraEdgeBufferSize];
+ int strength_ = GetParam().strength;
+ int size_ = GetParam().size;
+
+ IntraEdgeFilterFunc base_intra_edge_filter_;
+ IntraEdgeFilterFunc cur_intra_edge_filter_;
+};
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeFilterTest<bitdepth, Pixel>::TestFixedValues(
+ const char* const digest) {
+ if (cur_intra_edge_filter_ == nullptr) return;
+ for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) {
+ buffer_[i] = kIntraEdgeFilterTestFixedInput[i];
+ }
+ const absl::Time start = absl::Now();
+ cur_intra_edge_filter_(buffer_, size_, strength_);
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(kIntraEdge, kIntraEdgeFilterName, digest, buffer_,
+ kIntraEdgeFilterTestMaxSize * sizeof(buffer_[0]),
+ elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeFilterTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+ if (base_intra_edge_filter_ == nullptr) return;
+ if (cur_intra_edge_filter_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ absl::Duration elapsed_time;
+ absl::Duration base_elapsed_time;
+ memset(base_buffer_, 0, sizeof(base_buffer_));
+ memset(buffer_, 0, sizeof(buffer_));
+ for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+ for (int i = 0; i < size_; ++i) {
+ const Pixel val = rnd(1 << bitdepth);
+ buffer_[i] = val;
+ base_buffer_[i] = val;
+ }
+ const absl::Time base_start = absl::Now();
+ base_intra_edge_filter_(base_buffer_, size_, strength_);
+ base_elapsed_time += absl::Now() - base_start;
+ const absl::Time start = absl::Now();
+ cur_intra_edge_filter_(buffer_, size_, strength_);
+ elapsed_time += absl::Now() - start;
+ }
+ if (num_runs > 1) {
+ printf("Mode %s[%31s] Size %3d Strength %d C: %5d us SIMD: %5d us %2.2fx\n",
+ kIntraEdge, kIntraEdgeFilterName, size_, strength_,
+ static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time)),
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+ absl::ToDoubleMicroseconds(base_elapsed_time) /
+ absl::ToDoubleMicroseconds(elapsed_time));
+ } else {
+ printf("Mode %s[%31s] Size %3d Strength %d\n", kIntraEdge,
+ kIntraEdgeFilterName, size_, strength_);
+ }
+ for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) {
+ EXPECT_EQ(buffer_[i], base_buffer_[i]) << "Mismatch in index: " << i;
+ }
+}
+
+using IntraEdgeFilterTest8bpp = IntraEdgeFilterTest<8, uint8_t>;
+
+const char* GetIntraEdgeFilterDigest8bpp(int strength, int size) {
+ static const char* const kDigestsSize1[3] = {
+ "f7f681cf7047602fafc7fb416ecf46e1", "f7f681cf7047602fafc7fb416ecf46e1",
+ "f7f681cf7047602fafc7fb416ecf46e1"};
+ static const char* const kDigestsSize2[3] = {
+ "cb24cc54900fb75d767f3de797451e43", "380c80c89e1e8cda81ee0d3d4b29b8b7",
+ "a7eb3dba95ff35c2df45a274afbc9772"};
+ static const char* const kDigestsSize5[3] = {
+ "23380cb37688d4c3a8f70a276be65eed", "ec1e23d5b996a527ed3d45c0d552bf22",
+ "d313523d3b7646fdbb873c61ffe7a51a"};
+ static const char* const kDigestsSize9[3] = {
+ "e79597e9d62893754fc77d80ca86329a", "f7644e9748984914100e7031c6432272",
+ "bdf4f16734c86338716fb436c196ecc6"};
+ static const char* const kDigestsSize17[3] = {
+ "13ad15c833e850348eecb9fea4f3cadb", "e5988a72391250c702a8192893df40dd",
+ "8f68603598638fa33203fe1233d273b1"};
+ static const char* const kDigestsSize33[3] = {
+ "51156da8f4d527e0c011040769987dbd", "eff17eaf73a7bb7fd4c921510ade9f67",
+ "aca87680e0649d0728091c92c6de8871"};
+ static const char* const kDigestsSize50[3] = {
+ "87c1d43751125f1ea4987517a90d378d", "942a9d056231683bdfc52346b6b032c2",
+ "16a9148daf0e5f69808b9f0caa1ef110"};
+ static const char* const kDigestsSize55[3] = {
+ "833480d74957fb0356dec5b09412eefa", "a307ef31f10affc3b7fb262d05f1b80a",
+ "0318b2fde088c472215fe155f3b48d36"};
+ static const char* const kDigestsSize65[3] = {
+ "5000dada34ed2e6692bb44a4398ddf53", "8da6c776d897064ecd4a1e84aae92dd3",
+ "d7c71db339c28d33119974987b2f9d85"};
+ static const char* const kDigestsSize129[3] = {
+ "bf174d8b45b8131404fd4a4686f8c117", "e81518d6d85eed2f1b18c59424561d6b",
+ "7306715602b0f5536771724a2f0a39bc"};
+
+ switch (size) {
+ case 1:
+ return kDigestsSize1[strength - 1];
+ case 2:
+ return kDigestsSize2[strength - 1];
+ case 5:
+ return kDigestsSize5[strength - 1];
+ case 9:
+ return kDigestsSize9[strength - 1];
+ case 17:
+ return kDigestsSize17[strength - 1];
+ case 33:
+ return kDigestsSize33[strength - 1];
+ case 50:
+ return kDigestsSize50[strength - 1];
+ case 55:
+ return kDigestsSize55[strength - 1];
+ case 65:
+ return kDigestsSize65[strength - 1];
+ case 129:
+ return kDigestsSize129[strength - 1];
+ default:
+ ADD_FAILURE() << "Unknown edge size: " << size;
+ return nullptr;
+ }
+}
+
+TEST_P(IntraEdgeFilterTest8bpp, Correctness) {
+ TestFixedValues(GetIntraEdgeFilterDigest8bpp(strength_, size_));
+ TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeFilterTest8bpp, DISABLED_Speed) { TestRandomValues(1e7); }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using IntraEdgeFilterTest10bpp = IntraEdgeFilterTest<10, uint16_t>;
+
+const char* GetIntraEdgeFilterDigest10bpp(int strength, int size) {
+ static const char* const kDigestsSize1[3] = {
+ "2d2088560e3ccb5b809c97f5299bb1c0", "2d2088560e3ccb5b809c97f5299bb1c0",
+ "2d2088560e3ccb5b809c97f5299bb1c0"};
+ static const char* const kDigestsSize2[3] = {
+ "db3e785852e98fba18a1fb531f68466c", "8caea330489bc6ed0f99fbf769f53181",
+ "bcdd1b21f3baf5f6f29caea9ef93fb0c"};
+ static const char* const kDigestsSize5[3] = {
+ "326f4193a62f5a959b86d95f5204608e", "4673e453203f75eae97ef44f43f098f2",
+ "48d516b06313683aca30e975ce6a3cad"};
+ static const char* const kDigestsSize9[3] = {
+ "79217575a32e36a51d9dd40621af9c2d", "ccec1c16bc09b28ad6513c5e4c48b6d2",
+ "bb61aa9c5fa720c667a053769e7b7d08"};
+ static const char* const kDigestsSize17[3] = {
+ "46d90e99ba46e89326a5fa547bcd9361", "824aee8950aecb356d5f4a91dbc90a7d",
+ "37d44d10a2545385af1da55f8c08564f"};
+ static const char* const kDigestsSize33[3] = {
+ "c95108e06eb2aef61ecb6839af306edd", "832c695460b4dd2b85c5f8726e4470d1",
+ "994902f549eefd83fbcbf7ecb7dc5cca"};
+ static const char* const kDigestsSize50[3] = {
+ "48119ef1436c3a4fe69d275bbaafedf8", "72c221c91c3df0a324ccbc9acea35f89",
+ "84e40aadcc416ef3f51cea3cc23b30c7"};
+ static const char* const kDigestsSize55[3] = {
+ "6b68e4e0b00c4eb38a6d0d83c0f34658", "43a919f928a80379df5c9e07c9d8000d",
+ "7c320d55b11f93185b811bdaa379f2db"};
+ static const char* const kDigestsSize65[3] = {
+ "c28de89cf9f3bc5a904647ab2c64caf7", "7ce63b1b28dce0624fc7586e8fb3ab8f",
+ "d06e6b88585f7f1a1f6af5bb59ee2180"};
+ static const char* const kDigestsSize129[3] = {
+ "79160902c5c85004382d5ffa549b43cc", "3b0df95c3ca7b0b559b79234cf434738",
+ "500786d8561effec283d4f3d13886f8c"};
+
+ switch (size) {
+ case 1:
+ return kDigestsSize1[strength - 1];
+ case 2:
+ return kDigestsSize2[strength - 1];
+ case 5:
+ return kDigestsSize5[strength - 1];
+ case 9:
+ return kDigestsSize9[strength - 1];
+ case 17:
+ return kDigestsSize17[strength - 1];
+ case 33:
+ return kDigestsSize33[strength - 1];
+ case 50:
+ return kDigestsSize50[strength - 1];
+ case 55:
+ return kDigestsSize55[strength - 1];
+ case 65:
+ return kDigestsSize65[strength - 1];
+ case 129:
+ return kDigestsSize129[strength - 1];
+ default:
+ ADD_FAILURE() << "Unknown edge size: " << size;
+ return nullptr;
+ }
+}
+
+TEST_P(IntraEdgeFilterTest10bpp, FixedInput) {
+ TestFixedValues(GetIntraEdgeFilterDigest10bpp(strength_, size_));
+ TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeFilterTest10bpp, DISABLED_Speed) { TestRandomValues(1e7); }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using IntraEdgeFilterTest12bpp = IntraEdgeFilterTest<12, uint16_t>;
+
+const char* GetIntraEdgeFilterDigest12bpp(int strength, int size) {
+ return GetIntraEdgeFilterDigest10bpp(strength, size);
+}
+
+TEST_P(IntraEdgeFilterTest12bpp, FixedInput) {
+ TestFixedValues(GetIntraEdgeFilterDigest12bpp(strength_, size_));
+ TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeFilterTest12bpp, DISABLED_Speed) { TestRandomValues(1e7); }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+template <int bitdepth, typename Pixel>
+class IntraEdgeUpsamplerTest : public testing::TestWithParam<int> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ IntraEdgeUpsamplerTest() = default;
+ IntraEdgeUpsamplerTest(const IntraEdgeUpsamplerTest&) = delete;
+ IntraEdgeUpsamplerTest& operator=(const IntraEdgeUpsamplerTest&) = delete;
+ ~IntraEdgeUpsamplerTest() override = default;
+
+ protected:
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ IntraEdgeInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_intra_edge_upsampler_ = dsp->intra_edge_upsampler;
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_intra_edge_upsampler_ = nullptr;
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ IntraEdgeInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ IntraEdgeInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ cur_intra_edge_upsampler_ = dsp->intra_edge_upsampler;
+#if LIBGAV1_MSAN
+ // Match the behavior of Tile::IntraPrediction to prevent warnings due to
+ // assembly code (safely) overreading to fill a register.
+ memset(buffer_, 0, sizeof(buffer_));
+#endif
+ }
+
+ void TestFixedValues(const char* digest);
+ void TestRandomValues(int num_runs);
+
+ Pixel buffer_[128];
+ Pixel base_buffer_[128];
+ int size_ = GetParam();
+
+ IntraEdgeUpsamplerFunc base_intra_edge_upsampler_;
+ IntraEdgeUpsamplerFunc cur_intra_edge_upsampler_;
+};
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeUpsamplerTest<bitdepth, Pixel>::TestFixedValues(
+ const char* const digest) {
+ if (cur_intra_edge_upsampler_ == nullptr) return;
+ buffer_[0] = 0;
+ for (int i = 0; i < size_ + 1; ++i) {
+ buffer_[i + 1] = kIntraEdgeUpsamplerTestFixedInput[i];
+ }
+ const absl::Time start = absl::Now();
+ cur_intra_edge_upsampler_(buffer_ + 2, size_);
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(kIntraEdge, kIntraEdgeUpsamplerName, digest,
+ buffer_, (size_ * 2 + 1) * sizeof(buffer_[0]),
+ elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeUpsamplerTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+ if (base_intra_edge_upsampler_ == nullptr) return;
+ if (cur_intra_edge_upsampler_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ absl::Duration base_elapsed_time;
+ absl::Duration elapsed_time;
+ for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+    // Populate what will be buffer[-2..size-1] when passed to the upsample
+ // function.
+ buffer_[0] = 0;
+ base_buffer_[0] = 0;
+ for (int i = 1; i < size_ + 2; ++i) {
+ const Pixel val = rnd(1 << bitdepth);
+ buffer_[i] = val;
+ base_buffer_[i] = val;
+ }
+ const absl::Time base_start = absl::Now();
+ base_intra_edge_upsampler_(base_buffer_ + 2, size_);
+ base_elapsed_time += absl::Now() - base_start;
+ const absl::Time start = absl::Now();
+ cur_intra_edge_upsampler_(buffer_ + 2, size_);
+ elapsed_time += absl::Now() - start;
+ }
+ if (num_runs > 1) {
+ printf("Mode %s[%31s] size %d C: %5d us SIMD: %5d us %2.2fx\n", kIntraEdge,
+ kIntraEdgeUpsamplerName, size_,
+ static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time)),
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+ absl::ToDoubleMicroseconds(base_elapsed_time) /
+ absl::ToDoubleMicroseconds(elapsed_time));
+ } else {
+ printf("Mode %s[%31s]: size %d \n", kIntraEdge, kIntraEdgeUpsamplerName,
+ size_);
+ }
+
+ for (int i = 0; i < size_ * 2 + 1; ++i) {
+ EXPECT_EQ(buffer_[i], base_buffer_[i]) << "Mismatch in index: " << i;
+ }
+}
+
+using IntraEdgeUpsamplerTest8bpp = IntraEdgeUpsamplerTest<8, uint8_t>;
+
+constexpr int kIntraEdgeUpsampleSizes[] = {4, 8, 12, 16};
+
+const char* GetIntraEdgeUpsampleDigest8bpp(int size) {
+ switch (size) {
+ case 4:
+ return "aa9002e03f8d15eb26bbee76f40bb923";
+ case 8:
+ return "cacfca86d65eff0d951eb21fc15f242a";
+ case 12:
+ return "0529e00a1fa80bc866fa7662ad2d7b9f";
+ case 16:
+ return "03e3b3e0ea438ea48ef05651c0a54986";
+ default:
+ ADD_FAILURE() << "Unknown upsample size: " << size;
+ return "";
+ }
+}
+
+TEST_P(IntraEdgeUpsamplerTest8bpp, Correctness) {
+ TestFixedValues(GetIntraEdgeUpsampleDigest8bpp(size_));
+ TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeUpsamplerTest8bpp, DISABLED_Speed) { TestRandomValues(5e7); }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using IntraEdgeUpsamplerTest10bpp = IntraEdgeUpsamplerTest<10, uint16_t>;
+
+const char* GetIntraEdgeUpsampleDigest10bpp(int size) {
+ switch (size) {
+ case 4:
+ return "341c6bb705a02bba65b34f92d8ca83cf";
+ case 8:
+ return "fdbe4b3b341921dcb0edf00dfc4d7667";
+ case 12:
+ return "ad69a491287495ec9973d4006d5ac461";
+ case 16:
+ return "04acf32e517d80ce4c4958e711b9b890";
+ default:
+ ADD_FAILURE() << "Unknown upsample size: " << size;
+ return "";
+ }
+}
+
+TEST_P(IntraEdgeUpsamplerTest10bpp, FixedInput) {
+ TestFixedValues(GetIntraEdgeUpsampleDigest10bpp(size_));
+ TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeUpsamplerTest10bpp, DISABLED_Speed) { TestRandomValues(5e7); }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using IntraEdgeUpsamplerTest12bpp = IntraEdgeUpsamplerTest<12, uint16_t>;
+
+const char* GetIntraEdgeUpsampleDigest12bpp(int size) {
+ return GetIntraEdgeUpsampleDigest10bpp(size);
+}
+
+TEST_P(IntraEdgeUpsamplerTest12bpp, FixedInput) {
+ TestFixedValues(GetIntraEdgeUpsampleDigest12bpp(size_));
+ TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeUpsamplerTest12bpp, DISABLED_Speed) { TestRandomValues(5e7); }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest8bpp,
+ testing::ValuesIn(kIntraEdgeFilterParamList));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraEdgeFilterTest8bpp,
+ testing::ValuesIn(kIntraEdgeFilterParamList));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeFilterTest8bpp,
+ testing::ValuesIn(kIntraEdgeFilterParamList));
+#endif
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest8bpp,
+ testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraEdgeUpsamplerTest8bpp,
+ testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeUpsamplerTest8bpp,
+ testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest10bpp,
+ testing::ValuesIn(kIntraEdgeFilterParamList));
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest10bpp,
+ testing::ValuesIn(kIntraEdgeUpsampleSizes));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeFilterTest10bpp,
+ testing::ValuesIn(kIntraEdgeFilterParamList));
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeUpsamplerTest10bpp,
+ testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest12bpp,
+ testing::ValuesIn(kIntraEdgeFilterParamList));
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest12bpp,
+ testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int block_width, int block_height, typename Pixel>
+struct IntraPredFuncs_C {
+ IntraPredFuncs_C() = delete;
+
+ static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Paeth(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+// Intra-predictors that require bitdepth.
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+struct IntraPredBppFuncs_C {
+ IntraPredBppFuncs_C() = delete;
+
+ static void DcFill(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+//------------------------------------------------------------------------------
+// IntraPredFuncs_C::DcPred
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::DcTop(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row, const void* /*left_column*/) {
+ int sum = block_width >> 1; // rounder
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ for (int x = 0; x < block_width; ++x) sum += top[x];
+ const int dc = sum >> FloorLog2(block_width);
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, dc, block_width);
+ dst += stride;
+ }
+}
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::DcLeft(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
+ int sum = block_height >> 1; // rounder
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ for (int y = 0; y < block_height; ++y) sum += left[y];
+ const int dc = sum >> FloorLog2(block_height);
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, dc, block_width);
+ dst += stride;
+ }
+}
+
+// Note: for square blocks the divide in the Dc() function reduces to a shift.
+// For rectangular block sizes the following multipliers can be used with the
+// corresponding shifts.
+// 8-bit
+// 1:2 (e.g., 4x8): scale = 0x5556
+// 1:4 (e.g., 4x16): scale = 0x3334
+// final_descale = 16
+// 10/12-bit
+// 1:2: scale = 0xaaab
+// 1:4: scale = 0x6667
+// final_descale = 17
+// Note these may be halved to the values used in 8-bit in all cases except
+// when bitdepth == 12 and block_width + block_height is divisible by 5 (as
+// opposed to 3).
+//
+// The calculation becomes:
+// ((dc_sum >> intermediate_descale) * scale) >> final_descale
+// where intermediate_descale is:
+// sum = block_width + block_height
+// intermediate_descale =
+// (sum <= 20) ? 2 : (sum <= 40) ? 3 : (sum <= 80) ? 4 : 5
+//
+// The constants (multiplier and shifts) for a given block size are obtained
+// as follows:
+// - Let sum = block width + block height
+// - Shift 'sum' right until we reach an odd number
+// - Let the number of shifts for that block size be called
+//   'intermediate_descale'
+// and let the odd number be 'd' (d has only 2 possible values: d = 3 for a
+// 1:2 rectangular block and d = 5 for a 1:4 rectangular block).
+// - Find multipliers by dividing by 'd' using "Algorithm 1" in:
+// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
+// by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
+// shift will be 16, regardless of the block size.
+// TODO(jzern): the base implementation could be updated to use this method.
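+//
+// Worked example (illustrative only; nothing below uses it): for an 8-bit
+// 4x8 block, divisor = 12 = 3 << 2, so intermediate_descale = 2, d = 3,
+// scale = 0x5556 (~(1 << 16) / 3) and final_descale = 16. A hypothetical
+// helper built on the method would be
+//   int DcDivide4x8(int dc_sum) {
+//     return ((dc_sum >> 2) * 0x5556) >> 16;  // == dc_sum / 12
+//   }
+// which agrees with 'sum / divisor' for every sum reachable from 8-bit
+// pixels (at most 12 * 255 + 6).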
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Dc(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const int divisor = block_width + block_height;
+ int sum = divisor >> 1; // rounder
+
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ for (int x = 0; x < block_width; ++x) sum += top[x];
+ for (int y = 0; y < block_height; ++y) sum += left[y];
+
+ const int dc = sum / divisor;
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, dc, block_width);
+ dst += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// IntraPredFuncs_C directional predictors
+
+// IntraPredFuncs_C::Vertical -- apply top row vertically
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Vertical(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row, const void* /*left_column*/) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < block_height; ++y) {
+ memcpy(dst, top_row, block_width * sizeof(Pixel));
+ dst += stride;
+ }
+}
+
+// IntraPredFuncs_C::Horizontal -- apply left column horizontally
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Horizontal(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, left[y], block_width);
+ dst += stride;
+ }
+}
+
+// IntraPredFuncs_C::Paeth
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel top_left = top[-1];
+ const int top_left_x2 = top_left + top_left;
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ const int left_pixel = left[y];
+ for (int x = 0; x < block_width; ++x) {
+ // The Paeth filter selects the value closest to:
+ // top[x] + left[y] - top_left
+ // To calculate the absolute distance for the left value this would be:
+ // abs((top[x] + left[y] - top_left) - left[y])
+ // or, because left[y] cancels out:
+ // abs(top[x] - top_left)
+ const int left_dist = std::abs(top[x] - top_left);
+ const int top_dist = std::abs(left_pixel - top_left);
+ const int top_left_dist = std::abs(top[x] + left_pixel - top_left_x2);
+
+ // Select the closest value to the initial estimate of 'T + L - TL'.
+ if (left_dist <= top_dist && left_dist <= top_left_dist) {
+ dst[x] = left_pixel;
+ } else if (top_dist <= top_left_dist) {
+ dst[x] = top[x];
+ } else {
+ dst[x] = top_left;
+ }
+ }
+ dst += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// IntraPredBppFuncs_C
+template <int fill, typename Pixel>
+inline void DcFill_C(void* const dest, ptrdiff_t stride, const int block_width,
+ const int block_height) {
+ static_assert(sizeof(Pixel) == 1 || sizeof(Pixel) == 2,
+ "Only 1 & 2 byte pixels are supported");
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int y = 0; y < block_height; ++y) {
+ Memset(dst, fill, block_width);
+ dst += stride;
+ }
+}
+
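+// DcFill writes the mid-range value for the bitdepth, 0x80 << (bitdepth - 8)
+// (128 at 8-bit, 512 at 10-bit, 2048 at 12-bit), to every pixel.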
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void IntraPredBppFuncs_C<block_width, block_height, bitdepth, Pixel>::DcFill(
+ void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+ const void* /*left_column*/) {
+ DcFill_C<0x80 << (bitdepth - 8), Pixel>(dest, stride, block_width,
+ block_height);
+}
+
+// -----------------------------------------------------------------------------
+
+template <typename Pixel>
+struct IntraPredDefs {
+ IntraPredDefs() = delete;
+
+ using _4x4 = IntraPredFuncs_C<4, 4, Pixel>;
+ using _4x8 = IntraPredFuncs_C<4, 8, Pixel>;
+ using _4x16 = IntraPredFuncs_C<4, 16, Pixel>;
+ using _8x4 = IntraPredFuncs_C<8, 4, Pixel>;
+ using _8x8 = IntraPredFuncs_C<8, 8, Pixel>;
+ using _8x16 = IntraPredFuncs_C<8, 16, Pixel>;
+ using _8x32 = IntraPredFuncs_C<8, 32, Pixel>;
+ using _16x4 = IntraPredFuncs_C<16, 4, Pixel>;
+ using _16x8 = IntraPredFuncs_C<16, 8, Pixel>;
+ using _16x16 = IntraPredFuncs_C<16, 16, Pixel>;
+ using _16x32 = IntraPredFuncs_C<16, 32, Pixel>;
+ using _16x64 = IntraPredFuncs_C<16, 64, Pixel>;
+ using _32x8 = IntraPredFuncs_C<32, 8, Pixel>;
+ using _32x16 = IntraPredFuncs_C<32, 16, Pixel>;
+ using _32x32 = IntraPredFuncs_C<32, 32, Pixel>;
+ using _32x64 = IntraPredFuncs_C<32, 64, Pixel>;
+ using _64x16 = IntraPredFuncs_C<64, 16, Pixel>;
+ using _64x32 = IntraPredFuncs_C<64, 32, Pixel>;
+ using _64x64 = IntraPredFuncs_C<64, 64, Pixel>;
+};
+
+template <int bitdepth, typename Pixel>
+struct IntraPredBppDefs {
+ IntraPredBppDefs() = delete;
+
+ using _4x4 = IntraPredBppFuncs_C<4, 4, bitdepth, Pixel>;
+ using _4x8 = IntraPredBppFuncs_C<4, 8, bitdepth, Pixel>;
+ using _4x16 = IntraPredBppFuncs_C<4, 16, bitdepth, Pixel>;
+ using _8x4 = IntraPredBppFuncs_C<8, 4, bitdepth, Pixel>;
+ using _8x8 = IntraPredBppFuncs_C<8, 8, bitdepth, Pixel>;
+ using _8x16 = IntraPredBppFuncs_C<8, 16, bitdepth, Pixel>;
+ using _8x32 = IntraPredBppFuncs_C<8, 32, bitdepth, Pixel>;
+ using _16x4 = IntraPredBppFuncs_C<16, 4, bitdepth, Pixel>;
+ using _16x8 = IntraPredBppFuncs_C<16, 8, bitdepth, Pixel>;
+ using _16x16 = IntraPredBppFuncs_C<16, 16, bitdepth, Pixel>;
+ using _16x32 = IntraPredBppFuncs_C<16, 32, bitdepth, Pixel>;
+ using _16x64 = IntraPredBppFuncs_C<16, 64, bitdepth, Pixel>;
+ using _32x8 = IntraPredBppFuncs_C<32, 8, bitdepth, Pixel>;
+ using _32x16 = IntraPredBppFuncs_C<32, 16, bitdepth, Pixel>;
+ using _32x32 = IntraPredBppFuncs_C<32, 32, bitdepth, Pixel>;
+ using _32x64 = IntraPredBppFuncs_C<32, 64, bitdepth, Pixel>;
+ using _64x16 = IntraPredBppFuncs_C<64, 16, bitdepth, Pixel>;
+ using _64x32 = IntraPredBppFuncs_C<64, 32, bitdepth, Pixel>;
+ using _64x64 = IntraPredBppFuncs_C<64, 64, bitdepth, Pixel>;
+};
+
+using Defs = IntraPredDefs<uint8_t>;
+using Defs8bpp = IntraPredBppDefs<8, uint8_t>;
+
+// Initializes dsp entries for kTransformSize|W|x|H| from |DEFS|/|DEFSBPP| of
+// the same size.
+#define INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, W, H) \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcFill] = \
+ DEFSBPP::_##W##x##H::DcFill; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcTop] = \
+ DEFS::_##W##x##H::DcTop; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcLeft] = \
+ DEFS::_##W##x##H::DcLeft; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDc] = \
+ DEFS::_##W##x##H::Dc; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorVertical] = \
+ DEFS::_##W##x##H::Vertical; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorHorizontal] = \
+ DEFS::_##W##x##H::Horizontal; \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorPaeth] = \
+ DEFS::_##W##x##H::Paeth
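+
+// As an illustration, INIT_INTRAPREDICTORS_WxH(Defs, Defs8bpp, 4, 4) pastes
+// its arguments into assignments of the form:
+//   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+//       Defs8bpp::_4x4::DcFill;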
+
+#define INIT_INTRAPREDICTORS(DEFS, DEFSBPP) \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 4); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 8); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 4); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 8); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 32); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 4); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 8); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 32); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 64); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 8); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 32); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 64); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 16); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 32); \
+ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 64)
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_INTRAPREDICTORS(Defs, Defs8bpp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
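+// Each entry below is guarded so the plain C version is installed only when
+// the corresponding LIBGAV1_Dsp8bpp_* macro is not defined (e.g., by an
+// optimized platform-specific implementation that already claims the slot).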
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+ Defs8bpp::_4x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ Defs::_4x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ Defs::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = Defs::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+ Defs::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ Defs::_4x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ Defs::_4x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
+ Defs8bpp::_4x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ Defs::_4x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ Defs::_4x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = Defs::_4x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+ Defs::_4x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ Defs::_4x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ Defs::_4x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
+ Defs8bpp::_4x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ Defs::_4x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ Defs::_4x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ Defs::_4x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+ Defs::_4x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ Defs::_4x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ Defs::_4x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
+ Defs8bpp::_8x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ Defs::_8x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ Defs::_8x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = Defs::_8x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+ Defs::_8x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ Defs::_8x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ Defs::_8x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
+ Defs8bpp::_8x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ Defs::_8x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ Defs::_8x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = Defs::_8x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+ Defs::_8x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ Defs::_8x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ Defs::_8x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
+ Defs8bpp::_8x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ Defs::_8x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ Defs::_8x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ Defs::_8x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+ Defs::_8x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ Defs::_8x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ Defs::_8x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
+ Defs8bpp::_8x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ Defs::_8x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ Defs::_8x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ Defs::_8x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+ Defs::_8x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ Defs::_8x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ Defs::_8x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
+ Defs8bpp::_16x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ Defs::_16x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ Defs::_16x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ Defs::_16x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+ Defs::_16x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ Defs::_16x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ Defs::_16x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
+ Defs8bpp::_16x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ Defs::_16x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ Defs::_16x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ Defs::_16x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+ Defs::_16x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ Defs::_16x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ Defs::_16x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
+ Defs8bpp::_16x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ Defs::_16x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ Defs::_16x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ Defs::_16x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+ Defs::_16x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ Defs::_16x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ Defs::_16x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
+ Defs8bpp::_16x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ Defs::_16x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ Defs::_16x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ Defs::_16x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+ Defs::_16x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ Defs::_16x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ Defs::_16x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
+ Defs8bpp::_16x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ Defs::_16x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ Defs::_16x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ Defs::_16x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+ Defs::_16x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ Defs::_16x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ Defs::_16x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
+ Defs8bpp::_32x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ Defs::_32x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ Defs::_32x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ Defs::_32x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+ Defs::_32x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ Defs::_32x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ Defs::_32x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
+ Defs8bpp::_32x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ Defs::_32x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ Defs::_32x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ Defs::_32x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+ Defs::_32x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ Defs::_32x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ Defs::_32x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
+ Defs8bpp::_32x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ Defs::_32x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ Defs::_32x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ Defs::_32x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+ Defs::_32x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ Defs::_32x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ Defs::_32x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
+ Defs8bpp::_32x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ Defs::_32x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ Defs::_32x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ Defs::_32x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+ Defs::_32x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ Defs::_32x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ Defs::_32x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
+ Defs8bpp::_64x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ Defs::_64x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ Defs::_64x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ Defs::_64x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+ Defs::_64x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ Defs::_64x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ Defs::_64x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
+ Defs8bpp::_64x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ Defs::_64x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ Defs::_64x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ Defs::_64x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+ Defs::_64x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ Defs::_64x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ Defs::_64x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
+ Defs8bpp::_64x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ Defs::_64x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ Defs::_64x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ Defs::_64x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+ Defs::_64x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ Defs::_64x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ Defs::_64x64::Paeth;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+} // NOLINT(readability/fn_size)
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using DefsHbd = IntraPredDefs<uint16_t>;
+using Defs10bpp = IntraPredBppDefs<10, uint16_t>;
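+
+// Only DcFill depends on the bitdepth (via its fill value); the remaining
+// high-bitdepth predictors are shared across bitdepths through DefsHbd.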
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_INTRAPREDICTORS(DefsHbd, Defs10bpp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+ Defs10bpp::_4x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DefsHbd::_4x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DefsHbd::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DefsHbd::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+ DefsHbd::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ DefsHbd::_4x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ DefsHbd::_4x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
+ Defs10bpp::_4x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DefsHbd::_4x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DefsHbd::_4x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DefsHbd::_4x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+ DefsHbd::_4x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ DefsHbd::_4x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ DefsHbd::_4x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
+ Defs10bpp::_4x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DefsHbd::_4x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DefsHbd::_4x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DefsHbd::_4x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+ DefsHbd::_4x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ DefsHbd::_4x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ DefsHbd::_4x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
+ Defs10bpp::_8x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DefsHbd::_8x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DefsHbd::_8x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DefsHbd::_8x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+ DefsHbd::_8x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ DefsHbd::_8x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ DefsHbd::_8x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
+ Defs10bpp::_8x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DefsHbd::_8x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DefsHbd::_8x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DefsHbd::_8x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+ DefsHbd::_8x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ DefsHbd::_8x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ DefsHbd::_8x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
+ Defs10bpp::_8x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DefsHbd::_8x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DefsHbd::_8x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DefsHbd::_8x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+ DefsHbd::_8x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ DefsHbd::_8x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ DefsHbd::_8x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
+ Defs10bpp::_8x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DefsHbd::_8x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DefsHbd::_8x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DefsHbd::_8x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+ DefsHbd::_8x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ DefsHbd::_8x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ DefsHbd::_8x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
+ Defs10bpp::_16x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DefsHbd::_16x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DefsHbd::_16x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DefsHbd::_16x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+ DefsHbd::_16x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ DefsHbd::_16x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ DefsHbd::_16x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
+ Defs10bpp::_16x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DefsHbd::_16x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DefsHbd::_16x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DefsHbd::_16x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+ DefsHbd::_16x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ DefsHbd::_16x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ DefsHbd::_16x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
+ Defs10bpp::_16x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DefsHbd::_16x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DefsHbd::_16x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DefsHbd::_16x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+ DefsHbd::_16x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ DefsHbd::_16x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ DefsHbd::_16x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
+ Defs10bpp::_16x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DefsHbd::_16x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DefsHbd::_16x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DefsHbd::_16x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+ DefsHbd::_16x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ DefsHbd::_16x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ DefsHbd::_16x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
+ Defs10bpp::_16x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DefsHbd::_16x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DefsHbd::_16x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DefsHbd::_16x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+ DefsHbd::_16x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ DefsHbd::_16x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ DefsHbd::_16x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
+ Defs10bpp::_32x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DefsHbd::_32x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DefsHbd::_32x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DefsHbd::_32x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+ DefsHbd::_32x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ DefsHbd::_32x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ DefsHbd::_32x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
+ Defs10bpp::_32x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DefsHbd::_32x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DefsHbd::_32x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DefsHbd::_32x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+ DefsHbd::_32x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ DefsHbd::_32x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ DefsHbd::_32x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
+ Defs10bpp::_32x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DefsHbd::_32x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DefsHbd::_32x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DefsHbd::_32x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+ DefsHbd::_32x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ DefsHbd::_32x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ DefsHbd::_32x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
+ Defs10bpp::_32x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DefsHbd::_32x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DefsHbd::_32x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DefsHbd::_32x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+ DefsHbd::_32x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ DefsHbd::_32x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ DefsHbd::_32x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
+ Defs10bpp::_64x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DefsHbd::_64x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DefsHbd::_64x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DefsHbd::_64x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+ DefsHbd::_64x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ DefsHbd::_64x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ DefsHbd::_64x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
+ Defs10bpp::_64x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DefsHbd::_64x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DefsHbd::_64x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DefsHbd::_64x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+ DefsHbd::_64x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ DefsHbd::_64x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ DefsHbd::_64x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
+ Defs10bpp::_64x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DefsHbd::_64x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DefsHbd::_64x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DefsHbd::_64x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+ DefsHbd::_64x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ DefsHbd::_64x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ DefsHbd::_64x64::Paeth;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using Defs12bpp = IntraPredBppDefs<12, uint16_t>;
+
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_INTRAPREDICTORS(DefsHbd, Defs12bpp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+ Defs12bpp::_4x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DefsHbd::_4x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DefsHbd::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DefsHbd::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+ DefsHbd::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ DefsHbd::_4x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ DefsHbd::_4x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
+ Defs12bpp::_4x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DefsHbd::_4x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DefsHbd::_4x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DefsHbd::_4x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+ DefsHbd::_4x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ DefsHbd::_4x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ DefsHbd::_4x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
+ Defs12bpp::_4x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DefsHbd::_4x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DefsHbd::_4x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DefsHbd::_4x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+ DefsHbd::_4x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ DefsHbd::_4x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ DefsHbd::_4x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
+ Defs12bpp::_8x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DefsHbd::_8x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DefsHbd::_8x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DefsHbd::_8x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+ DefsHbd::_8x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ DefsHbd::_8x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ DefsHbd::_8x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
+ Defs12bpp::_8x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DefsHbd::_8x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DefsHbd::_8x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DefsHbd::_8x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+ DefsHbd::_8x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ DefsHbd::_8x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ DefsHbd::_8x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
+ Defs12bpp::_8x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DefsHbd::_8x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DefsHbd::_8x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DefsHbd::_8x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+ DefsHbd::_8x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ DefsHbd::_8x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ DefsHbd::_8x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
+ Defs12bpp::_8x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DefsHbd::_8x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DefsHbd::_8x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DefsHbd::_8x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+ DefsHbd::_8x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ DefsHbd::_8x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ DefsHbd::_8x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
+ Defs12bpp::_16x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DefsHbd::_16x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DefsHbd::_16x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DefsHbd::_16x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+ DefsHbd::_16x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ DefsHbd::_16x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ DefsHbd::_16x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
+ Defs12bpp::_16x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DefsHbd::_16x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DefsHbd::_16x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DefsHbd::_16x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+ DefsHbd::_16x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ DefsHbd::_16x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ DefsHbd::_16x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
+ Defs12bpp::_16x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DefsHbd::_16x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DefsHbd::_16x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DefsHbd::_16x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+ DefsHbd::_16x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ DefsHbd::_16x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ DefsHbd::_16x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
+ Defs12bpp::_16x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DefsHbd::_16x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DefsHbd::_16x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DefsHbd::_16x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+ DefsHbd::_16x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ DefsHbd::_16x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ DefsHbd::_16x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
+ Defs12bpp::_16x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DefsHbd::_16x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DefsHbd::_16x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DefsHbd::_16x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+ DefsHbd::_16x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ DefsHbd::_16x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ DefsHbd::_16x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
+ Defs12bpp::_32x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DefsHbd::_32x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DefsHbd::_32x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DefsHbd::_32x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+ DefsHbd::_32x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ DefsHbd::_32x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ DefsHbd::_32x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
+ Defs12bpp::_32x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DefsHbd::_32x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DefsHbd::_32x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DefsHbd::_32x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+ DefsHbd::_32x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ DefsHbd::_32x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ DefsHbd::_32x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
+ Defs12bpp::_32x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DefsHbd::_32x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DefsHbd::_32x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DefsHbd::_32x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+ DefsHbd::_32x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ DefsHbd::_32x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ DefsHbd::_32x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
+ Defs12bpp::_32x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DefsHbd::_32x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DefsHbd::_32x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DefsHbd::_32x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+ DefsHbd::_32x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ DefsHbd::_32x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ DefsHbd::_32x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
+ Defs12bpp::_64x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DefsHbd::_64x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DefsHbd::_64x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DefsHbd::_64x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+ DefsHbd::_64x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ DefsHbd::_64x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ DefsHbd::_64x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
+ Defs12bpp::_64x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DefsHbd::_64x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DefsHbd::_64x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DefsHbd::_64x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+ DefsHbd::_64x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ DefsHbd::_64x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ DefsHbd::_64x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
+ Defs12bpp::_64x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DefsHbd::_64x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DefsHbd::_64x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DefsHbd::_64x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+ DefsHbd::_64x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ DefsHbd::_64x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ DefsHbd::_64x64::Paeth;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+#undef INIT_INTRAPREDICTORS_WxH
+#undef INIT_INTRAPREDICTORS
+} // namespace
+
+void IntraPredInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
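+// For example, an x86 header included below may define
+//   LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+// (typically to a LIBGAV1_CPU_* value), in which case the matching #ifndef
+// block in the C initializer is skipped and the C fallback is not installed
+// for that entry.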
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors. This function is not thread-safe.
+void IntraPredInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_cfl.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr TransformSize kTransformSizesLargerThan32x32[] = {
+ kTransformSize16x64, kTransformSize32x64, kTransformSize64x16,
+ kTransformSize64x32, kTransformSize64x64};
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_C
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
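+// The caller is expected to have populated |dest| with the DC prediction, so
+// dst[0] holds the DC value on entry. Each output is
+//   dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6),
+//                  0, (1 << bitdepth) - 1).
+// For example, with bitdepth 8, alpha = 8 and luma[y][x] = 96, the offset is
+// RightShiftWithRoundingSigned(768, 6) = 12, i.e. dst[x] = Clip3(dc + 12, 0,
+// 255).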
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void CflIntraPredictor_C(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<Pixel*>(dest);
+ const int dc = dst[0];
+ stride /= sizeof(Pixel);
+ const int max_value = (1 << bitdepth) - 1;
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3));
+ assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3);
+ dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6),
+ 0, max_value);
+ }
+ dst += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// CflSubsampler_C
+
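+// Sums the luma samples over the chroma subsampling footprint and shifts so
+// that every subsampling mode yields values scaled by 8: 4:4:4 takes one
+// sample and shifts left by 3, 4:2:2 sums two samples and shifts by 2, and
+// 4:2:0 sums four samples and shifts by 1. The block average is then
+// subtracted, giving the zero-mean, 8x-scaled |luma| range documented above.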
+template <int block_width, int block_height, int bitdepth, typename Pixel,
+ int subsampling_x, int subsampling_y>
+void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const auto* src = static_cast<const Pixel*>(source);
+ stride /= sizeof(Pixel);
+ int sum = 0;
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ const ptrdiff_t luma_x =
+ std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x));
+ const ptrdiff_t luma_x_next = luma_x + stride;
+ luma[y][x] =
+ (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) +
+ ((subsampling_y != 0) ? (src[luma_x_next] + src[luma_x_next + 1])
+ : 0))
+ << (3 - subsampling_x - subsampling_y);
+ sum += luma[y][x];
+ }
+ if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) {
+ src += stride << subsampling_y;
+ }
+ }
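+ // block_width * block_height is a power of two, so this computes
+ // sum / (block_width * block_height) with rounding.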
+ const int average = RightShiftWithRounding(
+ sum, FloorLog2(block_width) + FloorLog2(block_height));
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ luma[y][x] -= average;
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+// Initializes dsp entries for kTransformSize|W|x|H|.
+#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL) \
+ dsp->cfl_intra_predictors[kTransformSize##W##x##H] = \
+ CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1>
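+
+// For example, INIT_CFL_INTRAPREDICTOR_WxH(4, 4, 8, uint8_t) installs
+// CflIntraPredictor_C<4, 4, 8, uint8_t> for kTransformSize4x4, along with
+// the three CflSubsampler_C<4, 4, 8, uint8_t, x, y> variants for the 4:4:4,
+// 4:2:2 and 4:2:0 subsampling types.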
+
+#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL) \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL)
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_CFL_INTRAPREDICTORS(8, uint8_t);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_C<4, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_C<4, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_C<4, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_C<8, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_C<8, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_C<8, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_C<8, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_C<16, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_C<16, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_C<16, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_C<16, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_C<32, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_C<32, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_C<32, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // Cfl predictors are available only for transform sizes with max(width,
+ // height) <= 32. Set all others to nullptr.
+ for (const auto i : kTransformSizesLargerThan32x32) {
+ dsp->cfl_intra_predictors[i] = nullptr;
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ dsp->cfl_subsamplers[i][j] = nullptr;
+ }
+ }
+} // NOLINT(readability/fn_size)
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_CFL_INTRAPREDICTORS(10, uint16_t);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_C<4, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_C<4, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_C<4, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_C<8, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_C<8, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_C<8, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_C<8, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_C<16, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_C<16, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_C<16, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_C<16, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_C<32, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_C<32, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_C<32, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // Cfl predictors are available only for transform sizes with max(width,
+ // height) <= 32. Set all others to nullptr.
+ for (const auto i : kTransformSizesLargerThan32x32) {
+ dsp->cfl_intra_predictors[i] = nullptr;
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ dsp->cfl_subsamplers[i][j] = nullptr;
+ }
+ }
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_CFL_INTRAPREDICTORS(12, uint16_t);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_C<4, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler_C<4, 4, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+ CflSubsampler_C<4, 4, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler_C<4, 4, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_C<4, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler_C<4, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+ CflSubsampler_C<4, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler_C<4, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_C<4, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler_C<4, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+ CflSubsampler_C<4, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler_C<4, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_C<8, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler_C<8, 4, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+ CflSubsampler_C<8, 4, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler_C<8, 4, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_C<8, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler_C<8, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+ CflSubsampler_C<8, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler_C<8, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_C<8, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler_C<8, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+ CflSubsampler_C<8, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler_C<8, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_C<8, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler_C<8, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+ CflSubsampler_C<8, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler_C<8, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_C<16, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler_C<16, 4, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+ CflSubsampler_C<16, 4, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler_C<16, 4, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_C<16, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler_C<16, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+ CflSubsampler_C<16, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler_C<16, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_C<16, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler_C<16, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+ CflSubsampler_C<16, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler_C<16, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_C<16, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler_C<16, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+ CflSubsampler_C<16, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler_C<16, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_C<32, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler_C<32, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+ CflSubsampler_C<32, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler_C<32, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_C<32, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler_C<32, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+ CflSubsampler_C<32, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler_C<32, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_C<32, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler_C<32, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+ CflSubsampler_C<32, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler_C<32, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // Cfl predictors are available only for transform sizes with max(width,
+ // height) <= 32. Set all others to nullptr.
+ for (const auto i : kTransformSizesLargerThan32x32) {
+ dsp->cfl_intra_predictors[i] = nullptr;
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ dsp->cfl_subsamplers[i][j] = nullptr;
+ }
+ }
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+#undef INIT_CFL_INTRAPREDICTOR_WxH
+#undef INIT_CFL_INTRAPREDICTORS
+
+} // namespace
+
+void IntraPredCflInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_cfl_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_cfl_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers.
+// This function is not thread-safe.
+void IntraPredCflInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_cfl.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+
+const char* const kCflIntraPredName = "kCflIntraPredictor";
+
+template <int bitdepth, typename Pixel>
+class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ IntraPredTestBase() {
+ switch (tx_size_) {
+ case kNumTransformSizes:
+ EXPECT_NE(tx_size_, kNumTransformSizes);
+ break;
+ default:
+ block_width_ = kTransformWidth[tx_size_];
+ block_height_ = kTransformHeight[tx_size_];
+ break;
+ }
+ }
+
+ IntraPredTestBase(const IntraPredTestBase&) = delete;
+ IntraPredTestBase& operator=(const IntraPredTestBase&) = delete;
+ ~IntraPredTestBase() override = default;
+
+ protected:
+ struct IntraPredMem {
+ void Reset(libvpx_test::ACMRandom* rnd) {
+ ASSERT_NE(rnd, nullptr);
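+ // |left| and |top| point 16 pixels into their backing arrays so that the
+ // negative indices used below (down to [-2]) remain within the allocation.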
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ const int mask = (1 << bitdepth) - 1;
+ for (auto& r : ref_src) r = rnd->Rand16() & mask;
+ for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask;
+ for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask;
+
+ // Some directional predictors require top-right, bottom-left.
+ for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = rnd->Rand16() & mask;
+ top[i] = rnd->Rand16() & mask;
+ }
+ // TODO(jzern): reorder this and regenerate the digests after switching
+ // random number generators.
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ left[-1] = rnd->Rand16() & mask;
+ left[-2] = rnd->Rand16() & mask;
+ top[-2] = rnd->Rand16() & mask;
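+      // Zero the alignment padding that precedes |left|/|top|, except for the
+      // two extension pixels at [-1] and [-2]: 16 - 2 == 14 entries.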
+ memset(left_mem, 0, sizeof(left_mem[0]) * 14);
+ memset(top_mem, 0, sizeof(top_mem[0]) * 14);
+ memset(top_mem + kMaxBlockSize * 2 + 16, 0,
+ sizeof(top_mem[0]) * kTopMemPadding);
+ }
+
+ // Set ref_src, top-left, top and left to |pixel|.
+ void Set(const Pixel pixel) {
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ for (auto& r : ref_src) r = pixel;
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = top[i] = pixel;
+ }
+ }
+
+ // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+ static constexpr int kTopMemPadding = 7;
+ alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+ alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
+ alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+ alignas(
+ kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
+ };
+
+ void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+ const TransformSize tx_size_ = GetParam();
+ int block_width_;
+ int block_height_;
+ IntraPredMem intra_pred_mem_;
+};
+
+//------------------------------------------------------------------------------
+// CflIntraPredTest
+
+template <int bitdepth, typename Pixel>
+class CflIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ CflIntraPredTest() = default;
+ CflIntraPredTest(const CflIntraPredTest&) = delete;
+ CflIntraPredTest& operator=(const CflIntraPredTest&) = delete;
+ ~CflIntraPredTest() override = default;
+
+ protected:
+ using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+ using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+ void SetUp() override {
+ IntraPredTestBase<bitdepth, Pixel>::SetUp();
+ IntraPredCflInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_cfl_intra_pred_ = dsp->cfl_intra_predictors[tx_size_];
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_cfl_intra_pred_ = nullptr;
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ IntraPredCflInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ IntraPredCflInit_SSE4_1();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ cur_cfl_intra_pred_ = dsp->cfl_intra_predictors[tx_size_];
+
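+    // Skip functions that haven't been specialized for this particular
+    // architecture.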
+ if (cur_cfl_intra_pred_ == base_cfl_intra_pred_) {
+ cur_cfl_intra_pred_ = nullptr;
+ }
+ }
+
+ // This test modifies intra_pred_mem_.
+ void TestSpeed(const char* digest, int num_runs);
+ void TestSaturatedValues();
+ void TestRandomValues();
+
+ CflIntraPredictorFunc base_cfl_intra_pred_;
+ CflIntraPredictorFunc cur_cfl_intra_pred_;
+};
+
+template <int bitdepth, typename Pixel>
+void CflIntraPredTest<bitdepth, Pixel>::TestSpeed(const char* const digest,
+ const int num_runs) {
+ if (cur_cfl_intra_pred_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+ const int alpha = rnd(33) - 16;
+ const int dc = rnd(1 << bitdepth);
+ const int max_luma = ((1 << bitdepth) - 1) << 3;
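+  // |luma| models the subsampler output: luma samples scaled by 8 (hence
+  // |max_luma|) with the block average removed, so the values are signed.
+  // |alpha| is the signed CfL scaling factor, here drawn from [-16, 16].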
+ for (int i = 0; i < block_height_; ++i) {
+ for (int j = 0; j < block_width_; ++j) {
+ if (i < kCflLumaBufferStride && j < kCflLumaBufferStride) {
+ luma[i][j] = max_luma - rnd(max_luma << 1);
+ }
+ }
+ }
+ for (auto& r : intra_pred_mem_.ref_src) r = dc;
+
+ absl::Duration elapsed_time;
+ for (int run = 0; run < num_runs; ++run) {
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const absl::Time start = absl::Now();
+ cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma, alpha);
+ elapsed_time += absl::Now() - start;
+ }
+ test_utils::CheckMd5Digest(ToString(tx_size_), kCflIntraPredName, digest,
+ intra_pred_mem_.dst, sizeof(intra_pred_mem_.dst),
+ elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void CflIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+ // Skip the 'C' test case as this is used as the reference.
+ if (base_cfl_intra_pred_ == nullptr) return;
+
+ int16_t luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
+ for (auto& line : luma_buffer) {
+ for (auto& luma : line) luma = ((1 << bitdepth) - 1) << 3;
+ }
+
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ static constexpr int kSaturatedAlpha[] = {-16, 16};
+ for (const int alpha : kSaturatedAlpha) {
+ for (auto& r : intra_pred_mem_.ref_src) r = (1 << bitdepth) - 1;
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ base_cfl_intra_pred_(intra_pred_mem_.ref_src, stride, luma_buffer, alpha);
+ cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma_buffer, alpha);
+ if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ block_width_, block_height_, kMaxBlockSize,
+ kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Result from optimized version of CFL with alpha "
+ << alpha << " differs from reference.";
+ break;
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void CflIntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+ // Skip the 'C' test case as this is used as the reference.
+ if (base_cfl_intra_pred_ == nullptr) return;
+ int16_t luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
+
+ const int max_luma = ((1 << bitdepth) - 1) << 3;
+ // Use an alternate seed to differentiate this test from TestSpeed().
+ libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+ for (auto& line : luma_buffer) {
+ for (auto& luma : line) luma = max_luma - rnd(max_luma << 1);
+ }
+ const int dc = rnd(1 << bitdepth);
+ for (auto& r : intra_pred_mem_.ref_src) r = dc;
+ static constexpr int kSaturatedAlpha[] = {-16, 16};
+ for (const int alpha : kSaturatedAlpha) {
+ intra_pred_mem_.Reset(&rnd);
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ base_cfl_intra_pred_(intra_pred_mem_.ref_src, stride, luma_buffer, alpha);
+ cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma_buffer, alpha);
+ if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ block_width_, block_height_, kMaxBlockSize,
+ kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Result from optimized version of CFL with alpha "
+ << alpha << " differs from reference.";
+ break;
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+class CflSubsamplerTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ CflSubsamplerTest() = default;
+ CflSubsamplerTest(const CflSubsamplerTest&) = delete;
+ CflSubsamplerTest& operator=(const CflSubsamplerTest&) = delete;
+ ~CflSubsamplerTest() override = default;
+
+ protected:
+ using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+ using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+ void SetUp() override {
+ IntraPredTestBase<bitdepth, Pixel>::SetUp();
+ IntraPredCflInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_cfl_subsampler_ = dsp->cfl_subsamplers[tx_size_][subsampling_type];
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_cfl_subsampler_ = nullptr;
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ IntraPredCflInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ IntraPredCflInit_SSE4_1();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ cur_cfl_subsampler_ = dsp->cfl_subsamplers[tx_size_][subsampling_type];
+ }
+
+ // This test modifies intra_pred_mem_.
+ void TestSpeed(const char* digest, int num_runs);
+ void TestSaturatedValues();
+ void TestRandomValues();
+
+ enum SubsamplingType SubsamplingType() const { return subsampling_type; }
+
+ CflSubsamplerFunc base_cfl_subsampler_;
+ CflSubsamplerFunc cur_cfl_subsampler_;
+};
+
+// When a dimension is subsampled, the luma source and the chroma output never
+// both use the minimum size in that dimension, so a 4-pixel chroma dimension
+// maps to an 8-pixel luma dimension.
+int GetLumaWidth(int block_width, SubsamplingType subsampling_type) {
+ if (block_width == 4) {
+ const int width_shift =
+ static_cast<int>(subsampling_type != kSubsamplingType444);
+ return block_width << width_shift;
+ }
+ return block_width;
+}
+
+int GetLumaHeight(int block_height, SubsamplingType subsampling_type) {
+ if (block_height == 4) {
+ const int height_shift =
+ static_cast<int>(subsampling_type == kSubsamplingType420);
+ return block_height << height_shift;
+ }
+ return block_height;
+}
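+
+// For example (illustrative): a 4x4 chroma block reads an 8x8 luma area under
+// 4:2:0 (both dimensions doubled), an 8x4 area under 4:2:2 (width only), and
+// a 4x4 area under 4:4:4.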
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+void CflSubsamplerTest<bitdepth, Pixel, subsampling_type>::TestSpeed(
+ const char* const digest, const int num_runs) {
+  // The C init skips entries for which assembly implementations exist, so in
+  // those configurations there is no function to test here.
+ if (cur_cfl_subsampler_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+
+ const int width = GetLumaWidth(block_width_, subsampling_type);
+ const int height = GetLumaHeight(block_height_, subsampling_type);
+ Pixel* src = intra_pred_mem_.ref_src;
+#if LIBGAV1_MSAN
+ // Quiet 10bpp CflSubsampler420_NEON() msan warning.
+ memset(src, 0, sizeof(intra_pred_mem_.ref_src));
+#endif
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ src[j] = rnd.RandRange(1 << bitdepth);
+ }
+ src += kMaxBlockSize;
+ }
+ const absl::Time start = absl::Now();
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ for (int run = 0; run < num_runs; ++run) {
+ cur_cfl_subsampler_(luma, width, height, intra_pred_mem_.ref_src, stride);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(ToString(tx_size_), kCflIntraPredName, digest,
+ luma, sizeof(luma), elapsed_time);
+}
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+void CflSubsamplerTest<bitdepth, Pixel,
+ subsampling_type>::TestSaturatedValues() {
+ if (base_cfl_subsampler_ == nullptr) return;
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ for (int width = GetLumaWidth(block_width_, subsampling_type); width > 0;
+ width -= 8) {
+ for (int height = GetLumaHeight(block_height_, subsampling_type);
+ height > 0; height -= 8) {
+ Pixel* src = intra_pred_mem_.ref_src;
+ for (int y = 0; y < height; ++y) {
+ Memset(src, (1 << bitdepth) - 1, width);
+ Memset(src + width, 0, kMaxBlockSize - width);
+ src += kMaxBlockSize;
+ }
+ Memset(intra_pred_mem_.ref_src + kMaxBlockSize * height, 0,
+ kMaxBlockSize * (kMaxBlockSize - height));
+
+ int16_t luma_base[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+ int16_t luma_cur[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+ base_cfl_subsampler_(luma_base, width, height, intra_pred_mem_.ref_src,
+ stride);
+ cur_cfl_subsampler_(luma_cur, width, height, intra_pred_mem_.ref_src,
+ stride);
+ if (!test_utils::CompareBlocks(reinterpret_cast<uint16_t*>(luma_cur[0]),
+ reinterpret_cast<uint16_t*>(luma_base[0]),
+ block_width_, block_height_,
+ kCflLumaBufferStride, kCflLumaBufferStride,
+ true)) {
+ FAIL() << "Result from optimized version of CFL subsampler"
+ << " differs from reference. max_luma_width: " << width
+ << " max_luma_height: " << height;
+ }
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+void CflSubsamplerTest<bitdepth, Pixel, subsampling_type>::TestRandomValues() {
+ if (base_cfl_subsampler_ == nullptr) return;
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ // Use an alternate seed to differentiate this test from TestSpeed().
+ libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+ for (int width = GetLumaWidth(block_width_, subsampling_type); width > 0;
+ width -= 8) {
+ for (int height = GetLumaHeight(block_height_, subsampling_type);
+ height > 0; height -= 8) {
+ Pixel* src = intra_pred_mem_.ref_src;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ src[j] = rnd.RandRange(1 << bitdepth);
+ }
+ Memset(src + width, 0, kMaxBlockSize - width);
+ src += kMaxBlockSize;
+ }
+ Memset(intra_pred_mem_.ref_src + kMaxBlockSize * height, 0,
+ kMaxBlockSize * (kMaxBlockSize - height));
+
+ int16_t luma_base[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+ int16_t luma_cur[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+ base_cfl_subsampler_(luma_base, width, height, intra_pred_mem_.ref_src,
+ stride);
+ cur_cfl_subsampler_(luma_cur, width, height, intra_pred_mem_.ref_src,
+ stride);
+ if (!test_utils::CompareBlocks(reinterpret_cast<uint16_t*>(luma_cur[0]),
+ reinterpret_cast<uint16_t*>(luma_base[0]),
+ block_width_, block_height_,
+ kCflLumaBufferStride, kCflLumaBufferStride,
+ true)) {
+ FAIL() << "Result from optimized version of CFL subsampler"
+ << " differs from reference. max_luma_width: " << width
+ << " max_luma_height: " << height;
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+using CflIntraPredTest8bpp = CflIntraPredTest<8, uint8_t>;
+
+const char* GetCflIntraPredDigest8bpp(TransformSize tx_size) {
+ static const char* const kDigest4x4 = "9ea7088e082867fd5ae394ca549fe1ed";
+ static const char* const kDigest4x8 = "323b0b4784b6658da781398e61f2da3d";
+ static const char* const kDigest4x16 = "99eb9c65f227ca7f71dcac24645a4fec";
+ static const char* const kDigest8x4 = "e8e782e31c94f3974b87b93d455262d8";
+ static const char* const kDigest8x8 = "23ab9fb65e7bbbdb985709e115115eb5";
+ static const char* const kDigest8x16 = "52f5add2fc4bbb2ff893148645e95b9c";
+ static const char* const kDigest8x32 = "283fdee9af8afdb76f72dd7339c92c3c";
+ static const char* const kDigest16x4 = "eead35f515b1aa8b5175b283192b86e6";
+ static const char* const kDigest16x8 = "5778e934254eaab04230bc370f64f778";
+ static const char* const kDigest16x16 = "4e8ed38ccba0d62f1213171da2212ed3";
+ static const char* const kDigest16x32 = "61a29bd7699e18ca6ea5641d1d023bfd";
+ static const char* const kDigest32x8 = "7f31607bd4f9ec879aa47f4daf9c7bb0";
+ static const char* const kDigest32x16 = "eb84dfab900fa6a90e132b186b4c6c36";
+ static const char* const kDigest32x32 = "e0ff35d407cb214578d61ef419c94237";
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigest4x4;
+ case kTransformSize4x8:
+ return kDigest4x8;
+ case kTransformSize4x16:
+ return kDigest4x16;
+ case kTransformSize8x4:
+ return kDigest8x4;
+ case kTransformSize8x8:
+ return kDigest8x8;
+ case kTransformSize8x16:
+ return kDigest8x16;
+ case kTransformSize8x32:
+ return kDigest8x32;
+ case kTransformSize16x4:
+ return kDigest16x4;
+ case kTransformSize16x8:
+ return kDigest16x8;
+ case kTransformSize16x16:
+ return kDigest16x16;
+ case kTransformSize16x32:
+ return kDigest16x32;
+ case kTransformSize32x8:
+ return kDigest32x8;
+ case kTransformSize32x16:
+ return kDigest32x16;
+ case kTransformSize32x32:
+ return kDigest32x32;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(CflIntraPredTest8bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflIntraPredDigest8bpp(tx_size_), num_runs);
+}
+
+TEST_P(CflIntraPredTest8bpp, FixedInput) {
+ TestSpeed(GetCflIntraPredDigest8bpp(tx_size_), 1);
+}
+
+TEST_P(CflIntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflIntraPredTest8bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+using CflSubsamplerTest8bpp444 =
+ CflSubsamplerTest<8, uint8_t, kSubsamplingType444>;
+using CflSubsamplerTest8bpp422 =
+ CflSubsamplerTest<8, uint8_t, kSubsamplingType422>;
+using CflSubsamplerTest8bpp420 =
+ CflSubsamplerTest<8, uint8_t, kSubsamplingType420>;
+
+const char* GetCflSubsamplerDigest8bpp(TransformSize tx_size,
+ SubsamplingType subsampling_type) {
+ static const char* const kDigests4x4[3] = {
+ "a8fa98d76cc3ccffcffc0d02dfae052c", "929cf2c23d926b500616797f8b1baf5b",
+ "1d03f091956838e7f2b113aabd8b9da9"};
+ static const char* const kDigests4x8[3] = {
+ "717b84f867f413c87c90a7c5d0125c8c", "6ccd9f48842b1a802e128b46b8f4885d",
+ "68a334f5d2abecbc78562b3280b5fb0c"};
+ static const char* const kDigests4x16[3] = {
+ "ecd1340b7e065dd8807fd9861abb7d99", "042c3fee17df7ef8fb8cef616f212a91",
+ "b0600f0bc3fbfc374bb3628360dcae5c"};
+ static const char* const kDigests8x4[3] = {
+ "4ea5617f4ed8e9edc2fff88d0ab8e53f", "b02288905f218c9f54ce4a472ec7b22e",
+ "3522d3a4dd3839d1a86fb39b31a86d52"};
+ static const char* const kDigests8x8[3] = {
+ "a0488493e6bcdb868713a95f9b4a0091", "ff6c1ac1d94fce63c282ba49186529bf",
+ "082e34ba04d04d7cd6fe408823987602"};
+ static const char* const kDigests8x16[3] = {
+ "e01dd4bb21daaa6e991cd5b1e6f30300", "2a1b13f932e39cc5f561afea9956f47a",
+ "d8d266282cb7123f780bd7266e8f5913"};
+ static const char* const kDigests8x32[3] = {
+ "0fc95e4ab798b95ccd2966ff75028b03", "6bc6e45ef2f664134449342fe76006ff",
+ "d294fb6399edaa267aa167407c0ebccb"};
+ static const char* const kDigests16x4[3] = {
+ "4798c2cf649b786bd153ad88353d52aa", "43a4bfa3b8caf4b72f58c6a1d1054f64",
+ "a928ebbec2db1508c8831a440d82eb98"};
+ static const char* const kDigests16x8[3] = {
+ "736b7f5b603cb34abcbe1b7e69b6ce93", "90422000ab20ecb519e4d277a9b3ea2b",
+ "c8e71c2fddbb850c5a50592ee5975368"};
+ static const char* const kDigests16x16[3] = {
+ "4f15a694966ee50a9e987e9a0aa2423b", "9e31e2f5a7ce7bef738b135755e25dcd",
+ "2ffeed4d592a0455f6d888913969827f"};
+ static const char* const kDigests16x32[3] = {
+ "3a10438bfe17ea39efad20608a0520eb", "79e8e8732a6ffc29dfbb0b3fc29c2883",
+ "185ca976ccbef7fb5f3f8c6aa22d5a79"};
+ static const char* const kDigests32x8[3] = {
+ "683704f08839a15e42603e4977a3e815", "13d311635372aee8998fca1758e75e20",
+ "9847d88eaaa57c086a2e6aed583048d3"};
+ static const char* const kDigests32x16[3] = {
+ "14b6761bf9f1156cf2496f532512aa99", "ee57bb7f0aa2302d29cdc1bfce72d5fc",
+ "a4189655fe714b82eb88cb5092c0ad76"};
+ static const char* const kDigests32x32[3] = {
+ "dcfbe71b70a37418ccb90dbf27f04226", "c578556a584019c1bdc2d0c3b9fd0c88",
+ "db200bc8ccbeacd6a42d6b8e5ad1d931"};
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4[subsampling_type];
+ case kTransformSize4x8:
+ return kDigests4x8[subsampling_type];
+ case kTransformSize4x16:
+ return kDigests4x16[subsampling_type];
+ case kTransformSize8x4:
+ return kDigests8x4[subsampling_type];
+ case kTransformSize8x8:
+ return kDigests8x8[subsampling_type];
+ case kTransformSize8x16:
+ return kDigests8x16[subsampling_type];
+ case kTransformSize8x32:
+ return kDigests8x32[subsampling_type];
+ case kTransformSize16x4:
+ return kDigests16x4[subsampling_type];
+ case kTransformSize16x8:
+ return kDigests16x8[subsampling_type];
+ case kTransformSize16x16:
+ return kDigests16x16[subsampling_type];
+ case kTransformSize16x32:
+ return kDigests16x32[subsampling_type];
+ case kTransformSize32x8:
+ return kDigests32x8[subsampling_type];
+ case kTransformSize32x16:
+ return kDigests32x16[subsampling_type];
+ case kTransformSize32x32:
+ return kDigests32x32[subsampling_type];
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(CflSubsamplerTest8bpp444, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest8bpp444, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest8bpp444, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest8bpp444, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest8bpp422, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest8bpp422, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest8bpp422, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest8bpp422, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest8bpp420, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest8bpp420, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest8bpp420, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest8bpp420, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using CflIntraPredTest10bpp = CflIntraPredTest<10, uint16_t>;
+
+const char* GetCflIntraPredDigest10bpp(TransformSize tx_size) {
+ static const char* const kDigest4x4 = "b4ca5f6fbb643a94eb05d59976d44c5d";
+ static const char* const kDigest4x8 = "040139b76ee22af05c56baf887d3d43b";
+ static const char* const kDigest4x16 = "4a1d59ace84ff07e68a0d30e9b1cebdd";
+ static const char* const kDigest8x4 = "c2c149cea5fdcd18bfe5c19ec2a8aa90";
+ static const char* const kDigest8x8 = "68ad90bd6f409548fa5551496b7cb0d0";
+ static const char* const kDigest8x16 = "bdc54eff4de8c5d597b03afaa705d3fe";
+ static const char* const kDigest8x32 = "362aebc6d68ff0d312d55dcd6a8a927d";
+ static const char* const kDigest16x4 = "349e813aedd211581c5e64ba1938eaa7";
+ static const char* const kDigest16x8 = "35c64f6da17f836618b5804185cf3eef";
+ static const char* const kDigest16x16 = "95be0c78dbd8dda793c62c6635b4bfb7";
+ static const char* const kDigest16x32 = "4752b9eda069854d3f5c56d3f2057e79";
+ static const char* const kDigest32x8 = "dafc5e973e4b6a55861f4586a11b7dd1";
+ static const char* const kDigest32x16 = "1e177ed3914a165183916aca1d01bb74";
+ static const char* const kDigest32x32 = "4c9ab3cf9baa27bb34e29729dabc1ea6";
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigest4x4;
+ case kTransformSize4x8:
+ return kDigest4x8;
+ case kTransformSize4x16:
+ return kDigest4x16;
+ case kTransformSize8x4:
+ return kDigest8x4;
+ case kTransformSize8x8:
+ return kDigest8x8;
+ case kTransformSize8x16:
+ return kDigest8x16;
+ case kTransformSize8x32:
+ return kDigest8x32;
+ case kTransformSize16x4:
+ return kDigest16x4;
+ case kTransformSize16x8:
+ return kDigest16x8;
+ case kTransformSize16x16:
+ return kDigest16x16;
+ case kTransformSize16x32:
+ return kDigest16x32;
+ case kTransformSize32x8:
+ return kDigest32x8;
+ case kTransformSize32x16:
+ return kDigest32x16;
+ case kTransformSize32x32:
+ return kDigest32x32;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(CflIntraPredTest10bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflIntraPredDigest10bpp(tx_size_), num_runs);
+}
+
+TEST_P(CflIntraPredTest10bpp, FixedInput) {
+ TestSpeed(GetCflIntraPredDigest10bpp(tx_size_), 1);
+}
+
+TEST_P(CflIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflIntraPredTest10bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+using CflSubsamplerTest10bpp444 =
+ CflSubsamplerTest<10, uint16_t, kSubsamplingType444>;
+using CflSubsamplerTest10bpp422 =
+ CflSubsamplerTest<10, uint16_t, kSubsamplingType422>;
+using CflSubsamplerTest10bpp420 =
+ CflSubsamplerTest<10, uint16_t, kSubsamplingType420>;
+
+const char* GetCflSubsamplerDigest10bpp(TransformSize tx_size,
+ SubsamplingType subsampling_type) {
+ static const char* const kDigests4x4[3] = {
+ "a8abcad9a6c9b046a100689135a108cb", "01081c2a0d0c15dabdbc725be5660451",
+ "93d1d9df2861240d88f5618e42178654"};
+ static const char* const kDigests4x8[3] = {
+ "d1fd8cd0709ca6634ad85f3e331672e1", "0d603fcc910aca3db41fc7f64e826c27",
+ "cf88b6d1b7b025cfa0082361775aeb75"};
+ static const char* const kDigests4x16[3] = {
+ "ce2e036a950388a564d8637b1416a6c6", "6c36c46cd72057a6b36bc12188b6d22c",
+ "0884a0e53384cd5173035ad8966d8f2f"};
+ static const char* const kDigests8x4[3] = {
+ "174e961983ed71fb105ed71aa3f9daf5", "330946cc369a534618a1014b4e3f6f18",
+ "8070668aa389c1d09f8aaf43c1223e8c"};
+ static const char* const kDigests8x8[3] = {
+ "86884feb35217010f73ccdbadecb635e", "b8cbc646e1bf1352e5b4b599eaef1193",
+ "4a1110382e56b42d3b7a4132bccc01ee"};
+ static const char* const kDigests8x16[3] = {
+ "a694c4e1f89648ffb49efd6a1d35b300", "864b9da67d23a2f8284b28b2a1e5aa30",
+ "bd012ca1cea256dd02c231339a4cf200"};
+ static const char* const kDigests8x32[3] = {
+ "60c42201bc24e518c1a3b3b6306d8125", "4d530e47c2b7555d5f311ee910d61842",
+ "71888b17b832ef55c0cd9449c0e6b077"};
+ static const char* const kDigests16x4[3] = {
+ "6b6d5ae4cc294c070ce65ab31c5a7d4f", "0fbecee20d294939e7a0183c2b4a0b96",
+ "917cd884923139d5c05a11000722e3b6"};
+ static const char* const kDigests16x8[3] = {
+ "688c41726d9ac35fb5b18c57bca76b9c", "d439a2e0a60d672b644cd1189e2858b9",
+ "edded6d166a77a6c3ff46fddc13f372f"};
+ static const char* const kDigests16x16[3] = {
+ "feb2bad9f6bb3f60eaeaf6c1bfd89ca5", "d65cabce5fcd9a29d1dfc530e4764f3a",
+ "2f1a91898812d2c9320c7506b3a72eb4"};
+ static const char* const kDigests16x32[3] = {
+ "6f23b1851444d29633e62ce77bf09559", "4a449fd078bd0c9657cdc24b709c0796",
+ "e44e18cb8bda2d34b52c96d5b6b510be"};
+ static const char* const kDigests32x8[3] = {
+ "77bf9ba56f7e1d2f04068a8a00b139da", "a85a1dea82963dedab9a2f7ad4169b5f",
+ "d12746071bee96ddc075c6368bc9fbaf"};
+ static const char* const kDigests32x16[3] = {
+ "cce3422f7f8cf57145f979359ac92f98", "1c18738d40bfa91296e5fdb7230bf9a7",
+ "02513142d109aee10f081cacfb33d1c5"};
+ static const char* const kDigests32x32[3] = {
+ "789008e49d0276de186af968196dd4a7", "b8848b00968a7ba4787765b7214da05f",
+ "12d13828db57605b00ce99469489651d"};
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4[subsampling_type];
+ case kTransformSize4x8:
+ return kDigests4x8[subsampling_type];
+ case kTransformSize4x16:
+ return kDigests4x16[subsampling_type];
+ case kTransformSize8x4:
+ return kDigests8x4[subsampling_type];
+ case kTransformSize8x8:
+ return kDigests8x8[subsampling_type];
+ case kTransformSize8x16:
+ return kDigests8x16[subsampling_type];
+ case kTransformSize8x32:
+ return kDigests8x32[subsampling_type];
+ case kTransformSize16x4:
+ return kDigests16x4[subsampling_type];
+ case kTransformSize16x8:
+ return kDigests16x8[subsampling_type];
+ case kTransformSize16x16:
+ return kDigests16x16[subsampling_type];
+ case kTransformSize16x32:
+ return kDigests16x32[subsampling_type];
+ case kTransformSize32x8:
+ return kDigests32x8[subsampling_type];
+ case kTransformSize32x16:
+ return kDigests32x16[subsampling_type];
+ case kTransformSize32x32:
+ return kDigests32x32[subsampling_type];
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(CflSubsamplerTest10bpp444, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest10bpp444, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest10bpp444, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest10bpp444, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest10bpp422, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest10bpp422, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest10bpp422, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest10bpp422, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest10bpp420, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest10bpp420, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest10bpp420, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest10bpp420, Random) { TestRandomValues(); }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using CflIntraPredTest12bpp = CflIntraPredTest<12, uint16_t>;
+
+const char* GetCflIntraPredDigest12bpp(TransformSize tx_size) {
+ static const char* const kDigest4x4 = "1d92a681a58f99396f22acd8b3154e2b";
+ static const char* const kDigest4x8 = "cf6833ebc64c9ae45f192ee384ef4aa3";
+ static const char* const kDigest4x16 = "06a4fbb8590aca98a045c902ed15c777";
+ static const char* const kDigest8x4 = "ad5944c7455f731ae8dd28b2b25a1b9f";
+ static const char* const kDigest8x8 = "c19621e42ca2bc184d5065131d27be2c";
+ static const char* const kDigest8x16 = "8faa7c95e8c3c18621168ed6759c1ac1";
+ static const char* const kDigest8x32 = "502699ef7a8c7aebc8c3bc653e733703";
+ static const char* const kDigest16x4 = "7f30bb038217967336fb8548a6f7df45";
+ static const char* const kDigest16x8 = "b70943098d0fb256c2943e2ebdbe6d34";
+ static const char* const kDigest16x16 = "4c34f5669880ab78d648b16b68ea0c24";
+ static const char* const kDigest16x32 = "5d85daf690020ed235617870a1a179b1";
+ static const char* const kDigest32x8 = "f8eec12e58c469ffb698fc60b13b927c";
+ static const char* const kDigest32x16 = "f272bb7e5d2df333aa63d806c95e6748";
+ static const char* const kDigest32x32 = "c737987c0a5414b03e6014f145dd999c";
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigest4x4;
+ case kTransformSize4x8:
+ return kDigest4x8;
+ case kTransformSize4x16:
+ return kDigest4x16;
+ case kTransformSize8x4:
+ return kDigest8x4;
+ case kTransformSize8x8:
+ return kDigest8x8;
+ case kTransformSize8x16:
+ return kDigest8x16;
+ case kTransformSize8x32:
+ return kDigest8x32;
+ case kTransformSize16x4:
+ return kDigest16x4;
+ case kTransformSize16x8:
+ return kDigest16x8;
+ case kTransformSize16x16:
+ return kDigest16x16;
+ case kTransformSize16x32:
+ return kDigest16x32;
+ case kTransformSize32x8:
+ return kDigest32x8;
+ case kTransformSize32x16:
+ return kDigest32x16;
+ case kTransformSize32x32:
+ return kDigest32x32;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(CflIntraPredTest12bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflIntraPredDigest12bpp(tx_size_), num_runs);
+}
+
+TEST_P(CflIntraPredTest12bpp, FixedInput) {
+ TestSpeed(GetCflIntraPredDigest12bpp(tx_size_), 1);
+}
+
+TEST_P(CflIntraPredTest12bpp, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflIntraPredTest12bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+using CflSubsamplerTest12bpp444 =
+ CflSubsamplerTest<12, uint16_t, kSubsamplingType444>;
+using CflSubsamplerTest12bpp422 =
+ CflSubsamplerTest<12, uint16_t, kSubsamplingType422>;
+using CflSubsamplerTest12bpp420 =
+ CflSubsamplerTest<12, uint16_t, kSubsamplingType420>;
+
+const char* GetCflSubsamplerDigest12bpp(TransformSize tx_size,
+ SubsamplingType subsampling_type) {
+ static const char* const kDigests4x4[3] = {
+ "44af37c60e9ccaacea004b57d5dea4cf",
+ "e29dd1d93f23b23778ed8cd85910d987",
+ "81e5dac2fd4c90f872ab814ed0f76ae5",
+ };
+ static const char* const kDigests4x8[3] = {
+ "bfc04aed9fe41ec07b0462a219652d16",
+ "693dd064636a0aa3be7aa098e867c512",
+ "0636c25d88aacd85d63e56011e7c5d15",
+ };
+ static const char* const kDigests4x16[3] = {
+ "6479ab30377288e75a78068d47c7e194",
+ "7d6f9b8b3eb85e73626118fc9210e622",
+ "1f3d474cd7c86899da90e515b8b7a906",
+ };
+ static const char* const kDigests8x4[3] = {
+ "7da5a2029bcdab159225c475fdff02da",
+ "096bfef24caa0670d2cd7b0bb63a7ba6",
+ "f749310dfc8a6129ed438dbc845470c0",
+ };
+ static const char* const kDigests8x8[3] = {
+ "08494051a7ff50718313a79ec7c51f92",
+ "637efad0630e253f7cce11af1a0af456",
+ "b220faf7dfedef860d59079dcf201757",
+ };
+ static const char* const kDigests8x16[3] = {
+ "19f027af516e88d3b9e613e578deb126",
+ "4f3bb155d70f9ea76d05b2f41b297a0c",
+ "b7504347eeda1e59ba8e36385c219e40",
+ };
+ static const char* const kDigests8x32[3] = {
+ "b8f1ef01c5672c87ee1004bb3cd7b8bc",
+ "b3e3318b050eb1c165d1e320ef622fa7",
+ "67754f7c5ae84dc23bb76ffaa2fa848e",
+ };
+ static const char* const kDigests16x4[3] = {
+ "f687fb4e22d8a1446eeb4915036874f4",
+ "7b5ef3d393a98dfe0ba49a0db2083465",
+ "840bbb6edaa50e9f7d391033a3dda2d9",
+ };
+ static const char* const kDigests16x8[3] = {
+ "dd9aed11d115a028035f0cee5b90d433",
+ "340d5d0784356ea199d3d751f4d6ed5e",
+ "e55f6fb5f34d829727e9dc2068098933",
+ };
+ static const char* const kDigests16x16[3] = {
+ "1df36a20d76a405c6273b88b38693cf9",
+ "2a7590d01df60b4bc6f10bfdb07b7a65",
+ "510ee31a5bd609e8f4542bb817539668",
+ };
+ static const char* const kDigests16x32[3] = {
+ "bdbc13b9fb7c3c50d25fda57f86f5ad9",
+ "7c138c568794b3d0c8aabff2edc07efd",
+ "581bef267c2a66e4c2fb079968440dbe",
+ };
+ static const char* const kDigests32x8[3] = {
+ "26f62743793811475e2afe1414c5fee1",
+ "6e6bf1678a04f2f727f0679564fb3630",
+ "a4c15562c26dbcfa43fe03a2b6e728b5",
+ };
+ static const char* const kDigests32x16[3] = {
+ "791f0713bbf032081da8ec08e58b9cd3",
+ "5dc7a673e92767186ae86996f4a30691",
+ "651f09d1244c817d92d1baa094c86f56",
+ };
+ static const char* const kDigests32x32[3] = {
+ "543a9d76e7238d88ba86218ec47c1f49",
+ "b0f2b29aae4858c1f09c27fc4344fd15",
+ "1d45083875fed14c4e5f149384a3cd2d",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4[subsampling_type];
+ case kTransformSize4x8:
+ return kDigests4x8[subsampling_type];
+ case kTransformSize4x16:
+ return kDigests4x16[subsampling_type];
+ case kTransformSize8x4:
+ return kDigests8x4[subsampling_type];
+ case kTransformSize8x8:
+ return kDigests8x8[subsampling_type];
+ case kTransformSize8x16:
+ return kDigests8x16[subsampling_type];
+ case kTransformSize8x32:
+ return kDigests8x32[subsampling_type];
+ case kTransformSize16x4:
+ return kDigests16x4[subsampling_type];
+ case kTransformSize16x8:
+ return kDigests16x8[subsampling_type];
+ case kTransformSize16x16:
+ return kDigests16x16[subsampling_type];
+ case kTransformSize16x32:
+ return kDigests16x32[subsampling_type];
+ case kTransformSize32x8:
+ return kDigests32x8[subsampling_type];
+ case kTransformSize32x16:
+ return kDigests32x16[subsampling_type];
+ case kTransformSize32x32:
+ return kDigests32x32[subsampling_type];
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(CflSubsamplerTest12bpp444, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest12bpp444, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest12bpp444, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest12bpp444, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest12bpp422, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest12bpp422, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest12bpp422, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest12bpp422, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest12bpp420, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest12bpp420, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest12bpp420, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest12bpp420, Random) { TestRandomValues(); }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+// Cfl predictors are available only for transform sizes with
+// max(width, height) <= 32.
+constexpr TransformSize kTransformSizesSmallerThan32x32[] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize32x8,
+ kTransformSize32x16, kTransformSize32x32};
+
+INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp444,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp422,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp420,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CflIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest8bpp444,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest8bpp420,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CflIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest8bpp444,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest8bpp420,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp444,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp422,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp420,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CflIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest10bpp444,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest10bpp420,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CflIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest10bpp444,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest10bpp420,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest12bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest12bpp444,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest12bpp422,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest12bpp420,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) {
+ return os << ToString(tx_size);
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone1_C(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row, const int width,
+ const int height, const int xstep, const bool upsampled_top) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ assert(xstep > 0);
+
+  // If xstep == 64 then |shift| always evaluates to 0, which sets |val| to
+  // |top[top_base_x]|. This corresponds to a 45 degree prediction.
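+  // For a worked example (illustrative): row y has top_x = 64 * (y + 1), so
+  // top_base_x = top_x >> 6 = y + 1 and shift = (top_x & 0x3F) >> 1 = 0;
+  // each row is a plain copy of top[y + 1 .. y + width].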
+ if (xstep == 64) {
+ // 7.11.2.10. Intra edge upsample selection process
+ // if ( d <= 0 || d >= 40 ) useUpsample = 0
+ // For |upsampled_top| the delta is |predictor_angle - 90|. Since the
+ // |predictor_angle| is 45 the delta is also 45.
+ assert(!upsampled_top);
+ const Pixel* top_ptr = top + 1;
+ for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) {
+ memcpy(dst, top_ptr, sizeof(*top_ptr) * width);
+ }
+ return;
+ }
+
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+ const int scale_bits = 6 - upsample_shift;
+ const int base_step = 1 << upsample_shift;
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ Memset(dst, top[max_base_x], width);
+ dst += stride;
+ }
+ return;
+ }
+
+ const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
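+    // |shift| is the fractional position in 1/32 units. e.g. (illustrative):
+    // top_x == 96 with no upsampling gives top_base_x == 1 and
+    // shift == (96 & 0x3F) >> 1 == 16, an equal blend of top[1] and top[2].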
+ int x = 0;
+ do {
+ if (top_base_x >= max_base_x) {
+ Memset(dst + x, top[max_base_x], width - x);
+ break;
+ }
+
+ const int val =
+ top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+ dst[x] = RightShiftWithRounding(val, 5 /*log2(32)*/);
+ top_base_x += base_step;
+ } while (++x < width);
+
+ dst += stride;
+ top_x += xstep;
+ } while (++y < height);
+}
+
+// clang 14.0.0 produces incorrect code with LIBGAV1_RESTRICT.
+// https://github.com/llvm/llvm-project/issues/54427
+#if defined(__clang__) && __clang_major__ == 14
+#define LOCAL_RESTRICT
+#else
+#define LOCAL_RESTRICT LIBGAV1_RESTRICT
+#endif
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone2_C(
+ void* LOCAL_RESTRICT const dest, ptrdiff_t stride,
+ const void* LOCAL_RESTRICT const top_row,
+ const void* LOCAL_RESTRICT const left_column, const int width,
+ const int height, const int xstep, const int ystep,
+ const bool upsampled_top, const bool upsampled_left) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ assert(xstep > 0);
+ assert(ystep > 0);
+
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int scale_bits_x = 6 - upsample_top_shift;
+ const int scale_bits_y = 6 - upsample_left_shift;
+ const int min_base_x = -(1 << upsample_top_shift);
+ const int base_step_x = 1 << upsample_top_shift;
+ int y = 0;
+ int top_x = -xstep;
+ do {
+ int top_base_x = top_x >> scale_bits_x;
+ int left_y = (y << 6) - ystep;
+ int x = 0;
+ do {
+ int val;
+ if (top_base_x >= min_base_x) {
+ const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1;
+ val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+ } else {
+ // Note this assumes an arithmetic shift to handle negative values.
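+        // e.g. (illustrative): left_y == -32 with scale_bits_y == 6 must
+        // yield left_base_y == -1; a logical shift would instead produce a
+        // huge positive index.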
+ const int left_base_y = left_y >> scale_bits_y;
+ const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1;
+ assert(left_base_y >= -(1 << upsample_left_shift));
+ val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+ }
+ dst[x] = RightShiftWithRounding(val, 5);
+ top_base_x += base_step_x;
+ left_y -= ystep;
+ } while (++x < width);
+
+ top_x -= xstep;
+ dst += stride;
+ } while (++y < height);
+}
+
+#undef LOCAL_RESTRICT
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone3_C(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int ystep, const bool upsampled_left) {
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ stride /= sizeof(Pixel);
+
+ assert(ystep > 0);
+
+ const int upsample_shift = static_cast<int>(upsampled_left);
+ const int scale_bits = 6 - upsample_shift;
+ const int base_step = 1 << upsample_shift;
+ // Zone3 never runs out of left_column values.
+ assert((width + height - 1) << upsample_shift > // max_base_y
+ ((ystep * width) >> scale_bits) +
+ base_step * (height - 1)); // left_base_y
+
+ int left_y = ystep;
+ int x = 0;
+ do {
+ auto* dst = static_cast<Pixel*>(dest);
+
+ int left_base_y = left_y >> scale_bits;
+ int y = 0;
+ do {
+ const int shift = ((left_y << upsample_shift) & 0x3F) >> 1;
+ const int val =
+ left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+ dst[x] = RightShiftWithRounding(val, 5);
+ dst += stride;
+ left_base_y += base_step;
+ } while (++y < height);
+
+ left_y += ystep;
+ } while (++x < width);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint8_t>;
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint8_t>;
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint16_t>;
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint16_t>;
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint16_t>;
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint16_t>;
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone1
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone2
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone3
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+void IntraPredDirectionalInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_directional_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc. The
+// order matters because each header checks whether a superior version has
+// already been chosen before setting the base implementation.
+// clang-format off
+#include "src/dsp/x86/intrapred_directional_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*. This function is not
+// thread-safe.
+void IntraPredDirectionalInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+constexpr int kNumDirectionalIntraPredictors = 3;
+
+constexpr int kBaseAngles[] = {45, 67, 90, 113, 135, 157, 180, 203};
+
+const char* const kDirectionalPredNames[kNumDirectionalIntraPredictors] = {
+ "kDirectionalIntraPredictorZone1", "kDirectionalIntraPredictorZone2",
+ "kDirectionalIntraPredictorZone3"};
+
+int16_t GetDirectionalIntraPredictorDerivative(const int angle) {
+ EXPECT_GE(angle, 3);
+ EXPECT_LE(angle, 87);
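+  // e.g. (illustrative): angle == 45 reads
+  // kDirectionalIntraPredictorDerivative[DivideBy2(45) - 1], i.e. index 21.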
+ return kDirectionalIntraPredictorDerivative[DivideBy2(angle) - 1];
+}
+
+template <int bitdepth, typename Pixel>
+class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ IntraPredTestBase() {
+ switch (tx_size_) {
+ case kNumTransformSizes:
+ EXPECT_NE(tx_size_, kNumTransformSizes);
+ break;
+ default:
+ block_width_ = kTransformWidth[tx_size_];
+ block_height_ = kTransformHeight[tx_size_];
+ break;
+ }
+ }
+
+ IntraPredTestBase(const IntraPredTestBase&) = delete;
+ IntraPredTestBase& operator=(const IntraPredTestBase&) = delete;
+ ~IntraPredTestBase() override = default;
+
+ protected:
+ struct IntraPredMem {
+ void Reset(libvpx_test::ACMRandom* rnd) {
+ ASSERT_NE(rnd, nullptr);
+#if LIBGAV1_MSAN
+ // Match the behavior of Tile::IntraPrediction to prevent warnings due to
+ // assembly code (safely) overreading to fill a register.
+ memset(left_mem, 0, sizeof(left_mem));
+ memset(top_mem, 0, sizeof(top_mem));
+#endif // LIBGAV1_MSAN
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ const int mask = (1 << bitdepth) - 1;
+ for (auto& r : ref_src) r = rnd->Rand16() & mask;
+ for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask;
+ for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask;
+
+ // Some directional predictors require top-right, bottom-left.
+ for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = rnd->Rand16() & mask;
+ top[i] = rnd->Rand16() & mask;
+ }
+ // TODO(jzern): reorder this and regenerate the digests after switching
+ // random number generators.
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ left[-1] = rnd->Rand16() & mask;
+ left[-2] = rnd->Rand16() & mask;
+ top[-2] = rnd->Rand16() & mask;
+ memset(left_mem, 0, sizeof(left_mem[0]) * 14);
+ memset(top_mem, 0, sizeof(top_mem[0]) * 14);
+ memset(top_mem + kMaxBlockSize * 2 + 16, 0,
+ sizeof(top_mem[0]) * kTopMemPadding);
+ }
+
+ // Set ref_src, top-left, top and left to |pixel|.
+ void Set(const Pixel pixel) {
+#if LIBGAV1_MSAN
+ // Match the behavior of Tile::IntraPrediction to prevent warnings due to
+ // assembly code (safely) overreading to fill a register.
+ memset(left_mem, 0, sizeof(left_mem));
+ memset(top_mem, 0, sizeof(top_mem));
+#endif // LIBGAV1_MSAN
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ for (auto& r : ref_src) r = pixel;
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = top[i] = pixel;
+ }
+ }
+
+ // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+ static constexpr int kTopMemPadding = 7;
+ alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+ alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
+ alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+ alignas(
+ kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
+ };
+
+ void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+ const TransformSize tx_size_ = GetParam();
+ int block_width_;
+ int block_height_;
+ IntraPredMem intra_pred_mem_;
+};
+
+//------------------------------------------------------------------------------
+// DirectionalIntraPredTest
+
+template <int bitdepth, typename Pixel>
+class DirectionalIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ DirectionalIntraPredTest() = default;
+ DirectionalIntraPredTest(const DirectionalIntraPredTest&) = delete;
+ DirectionalIntraPredTest& operator=(const DirectionalIntraPredTest&) = delete;
+ ~DirectionalIntraPredTest() override = default;
+
+ protected:
+ using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+ using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+ enum Zone { kZone1, kZone2, kZone3, kNumZones };
+
+ enum { kAngleDeltaStart = -9, kAngleDeltaStop = 9, kAngleDeltaStep = 3 };
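+  // That is, deltas -9, -6, -3, 0, 3, 6, 9: seven samples per base angle.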
+
+ void SetUp() override {
+ IntraPredTestBase<bitdepth, Pixel>::SetUp();
+ IntraPredDirectionalInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_directional_intra_pred_zone1_ = dsp->directional_intra_predictor_zone1;
+ base_directional_intra_pred_zone2_ = dsp->directional_intra_predictor_zone2;
+ base_directional_intra_pred_zone3_ = dsp->directional_intra_predictor_zone3;
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_directional_intra_pred_zone1_ = nullptr;
+ base_directional_intra_pred_zone2_ = nullptr;
+ base_directional_intra_pred_zone3_ = nullptr;
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ IntraPredDirectionalInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ IntraPredDirectionalInit_SSE4_1();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
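+    // Put the current architecture-specific implementations up for testing
+    // and comparison against the C version.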
+ cur_directional_intra_pred_zone1_ = dsp->directional_intra_predictor_zone1;
+ cur_directional_intra_pred_zone2_ = dsp->directional_intra_predictor_zone2;
+ cur_directional_intra_pred_zone3_ = dsp->directional_intra_predictor_zone3;
+
+ // Skip functions that haven't been specialized for this particular
+ // architecture.
+ if (cur_directional_intra_pred_zone1_ ==
+ base_directional_intra_pred_zone1_) {
+ cur_directional_intra_pred_zone1_ = nullptr;
+ }
+ if (cur_directional_intra_pred_zone2_ ==
+ base_directional_intra_pred_zone2_) {
+ cur_directional_intra_pred_zone2_ = nullptr;
+ }
+ if (cur_directional_intra_pred_zone3_ ==
+ base_directional_intra_pred_zone3_) {
+ cur_directional_intra_pred_zone3_ = nullptr;
+ }
+ }
+
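+  // Returns true if the predictor upsamples the edge for this angle delta
+  // and block size. For example, a 4x4 block (block_wh == 8) with
+  // |delta| == 3 is upsampled for either filter type, while a 16x16 block
+  // (block_wh == 32) never is.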
+ bool IsEdgeUpsampled(int delta, const int filter_type) const {
+ delta = std::abs(delta);
+ if (delta == 0 || delta >= 40) return false;
+ const int block_wh = block_width_ + block_height_;
+ return (filter_type == 1) ? block_wh <= 8 : block_wh <= 16;
+ }
+
+  // Returns, via |min_angle| and |max_angle|, the (exclusive) bounds of the
+  // range of angles the predictor should be applied to.
+ void GetZoneAngleRange(const Zone zone, int* const min_angle,
+ int* const max_angle) const {
+ ASSERT_NE(min_angle, nullptr);
+ ASSERT_NE(max_angle, nullptr);
+ switch (zone) {
+ // The overall minimum angle comes from mode D45_PRED, yielding:
+ // min_angle = 45-(MAX_ANGLE_DELTA*ANGLE_STEP) = 36
+ // The overall maximum angle comes from mode D203_PRED, yielding:
+ // max_angle = 203+(MAX_ANGLE_DELTA*ANGLE_STEP) = 212
+      // The angles 90 and 180 are not permitted because they correspond to
+      // V_PRED and H_PRED respectively, which are handled in distinct
+      // functions.
+ case kZone1:
+ *min_angle = 36;
+ *max_angle = 87;
+ break;
+ case kZone2:
+ *min_angle = 93;
+ *max_angle = 177;
+ break;
+ case kZone3:
+ *min_angle = 183;
+ *max_angle = 212;
+ break;
+ case kNumZones:
+ FAIL() << "Invalid zone value: " << zone;
+ break;
+ }
+ }
+
+ // These tests modify intra_pred_mem_.
+ void TestSpeed(const char* const digests[kNumDirectionalIntraPredictors],
+ Zone zone, int num_runs);
+ void TestSaturatedValues();
+ void TestRandomValues();
+
+ DirectionalIntraPredictorZone1Func base_directional_intra_pred_zone1_;
+ DirectionalIntraPredictorZone2Func base_directional_intra_pred_zone2_;
+ DirectionalIntraPredictorZone3Func base_directional_intra_pred_zone3_;
+ DirectionalIntraPredictorZone1Func cur_directional_intra_pred_zone1_;
+ DirectionalIntraPredictorZone2Func cur_directional_intra_pred_zone2_;
+ DirectionalIntraPredictorZone3Func cur_directional_intra_pred_zone3_;
+};
+
+template <int bitdepth, typename Pixel>
+void DirectionalIntraPredTest<bitdepth, Pixel>::TestSpeed(
+ const char* const digests[kNumDirectionalIntraPredictors], const Zone zone,
+ const int num_runs) {
+ switch (zone) {
+ case kZone1:
+ if (cur_directional_intra_pred_zone1_ == nullptr) return;
+ break;
+ case kZone2:
+ if (cur_directional_intra_pred_zone2_ == nullptr) return;
+ break;
+ case kZone3:
+ if (cur_directional_intra_pred_zone3_ == nullptr) return;
+ break;
+ case kNumZones:
+ FAIL() << "Invalid zone value: " << zone;
+ break;
+ }
+ ASSERT_NE(digests, nullptr);
+ const Pixel* const left = intra_pred_mem_.left_mem + 16;
+ const Pixel* const top = intra_pred_mem_.top_mem + 16;
+
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ intra_pred_mem_.Reset(&rnd);
+
+ // Allocate separate blocks for each angle + filter + upsampled combination.
+ // Add a 1 pixel right border to test for overwrites.
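+  // With kBaseAngles and deltas in [-9, 9] (step 3), zone 2 covers angles
+  // 93..177 and sees the most combinations: 3 (from base 90) + 7 + 7 + 7
+  // (bases 113, 135 and 157) + 3 (from base 180) = 27.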
+ static constexpr int kMaxZoneAngles = 27; // zone 2
+ static constexpr int kMaxFilterTypes = 2;
+ static constexpr int kBlockBorder = 1;
+ static constexpr int kBorderSize =
+ kBlockBorder * kMaxZoneAngles * kMaxFilterTypes;
+ const int ref_stride =
+ kMaxZoneAngles * kMaxFilterTypes * block_width_ + kBorderSize;
+ const size_t ref_alloc_size = sizeof(Pixel) * ref_stride * block_height_;
+
+ using AlignedPtr = std::unique_ptr<Pixel[], decltype(&AlignedFree)>;
+ AlignedPtr ref_src(static_cast<Pixel*>(AlignedAlloc(16, ref_alloc_size)),
+ &AlignedFree);
+ AlignedPtr dest(static_cast<Pixel*>(AlignedAlloc(16, ref_alloc_size)),
+ &AlignedFree);
+ ASSERT_NE(ref_src, nullptr);
+ ASSERT_NE(dest, nullptr);
+
+ const int mask = (1 << bitdepth) - 1;
+ for (size_t i = 0; i < ref_alloc_size / sizeof(ref_src[0]); ++i) {
+ ref_src[i] = rnd.Rand16() & mask;
+ }
+
+ int min_angle = 0, max_angle = 0;
+ ASSERT_NO_FATAL_FAILURE(GetZoneAngleRange(zone, &min_angle, &max_angle));
+
+ absl::Duration elapsed_time;
+ for (int run = 0; run < num_runs; ++run) {
+ Pixel* dst = dest.get();
+ memcpy(dst, ref_src.get(), ref_alloc_size);
+ for (const auto& base_angle : kBaseAngles) {
+ for (int filter_type = 0; filter_type <= 1; ++filter_type) {
+ for (int angle_delta = kAngleDeltaStart; angle_delta <= kAngleDeltaStop;
+ angle_delta += kAngleDeltaStep) {
+ const int predictor_angle = base_angle + angle_delta;
+ if (predictor_angle < min_angle || predictor_angle > max_angle) {
+ continue;
+ }
+
+ ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle
+ << " angle_delta: " << angle_delta;
+ const bool upsampled_left =
+ IsEdgeUpsampled(predictor_angle - 180, filter_type);
+ const bool upsampled_top =
+ IsEdgeUpsampled(predictor_angle - 90, filter_type);
+ const ptrdiff_t stride = ref_stride * sizeof(ref_src[0]);
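+          // Zone 1 (angle < 90) predicts from the top row only, zone 2
+          // (90 < angle < 180) from both edges and zone 3 (angle > 180)
+          // from the left column only.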
+ if (predictor_angle < 90) {
+ ASSERT_EQ(zone, kZone1);
+ const int xstep =
+ GetDirectionalIntraPredictorDerivative(predictor_angle);
+ const absl::Time start = absl::Now();
+ cur_directional_intra_pred_zone1_(dst, stride, top, block_width_,
+ block_height_, xstep,
+ upsampled_top);
+ elapsed_time += absl::Now() - start;
+ } else if (predictor_angle < 180) {
+ ASSERT_EQ(zone, kZone2);
+ const int xstep =
+ GetDirectionalIntraPredictorDerivative(180 - predictor_angle);
+ const int ystep =
+ GetDirectionalIntraPredictorDerivative(predictor_angle - 90);
+ const absl::Time start = absl::Now();
+ cur_directional_intra_pred_zone2_(
+ dst, stride, top, left, block_width_, block_height_, xstep,
+ ystep, upsampled_top, upsampled_left);
+ elapsed_time += absl::Now() - start;
+ } else {
+ ASSERT_EQ(zone, kZone3);
+ ASSERT_LT(predictor_angle, 270);
+ const int ystep =
+ GetDirectionalIntraPredictorDerivative(270 - predictor_angle);
+ const absl::Time start = absl::Now();
+ cur_directional_intra_pred_zone3_(dst, stride, left, block_width_,
+ block_height_, ystep,
+ upsampled_left);
+ elapsed_time += absl::Now() - start;
+ }
+ dst += block_width_ + kBlockBorder;
+ }
+ }
+ }
+ }
+
+ test_utils::CheckMd5Digest(ToString(tx_size_), kDirectionalPredNames[zone],
+ digests[zone], dest.get(), ref_alloc_size,
+ elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void DirectionalIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+ const Pixel* const left = intra_pred_mem_.left_mem + 16;
+ const Pixel* const top = intra_pred_mem_.top_mem + 16;
+ const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1);
+ intra_pred_mem_.Set(kMaxPixel);
+
+ for (int i = kZone1; i < kNumZones; ++i) {
+ switch (i) {
+ case kZone1:
+ if (cur_directional_intra_pred_zone1_ == nullptr) continue;
+ break;
+ case kZone2:
+ if (cur_directional_intra_pred_zone2_ == nullptr) continue;
+ break;
+ case kZone3:
+ if (cur_directional_intra_pred_zone3_ == nullptr) continue;
+ break;
+ case kNumZones:
+ FAIL() << "Invalid zone value: " << i;
+ break;
+ }
+ int min_angle = 0, max_angle = 0;
+ ASSERT_NO_FATAL_FAILURE(
+ GetZoneAngleRange(static_cast<Zone>(i), &min_angle, &max_angle));
+
+ for (const auto& base_angle : kBaseAngles) {
+ for (int filter_type = 0; filter_type <= 1; ++filter_type) {
+ for (int angle_delta = kAngleDeltaStart; angle_delta <= kAngleDeltaStop;
+ angle_delta += kAngleDeltaStep) {
+ const int predictor_angle = base_angle + angle_delta;
+ if (predictor_angle <= min_angle || predictor_angle >= max_angle) {
+ continue;
+ }
+ ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle
+ << " angle_delta: " << angle_delta;
+
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+
+ const bool upsampled_left =
+ IsEdgeUpsampled(predictor_angle - 180, filter_type);
+ const bool upsampled_top =
+ IsEdgeUpsampled(predictor_angle - 90, filter_type);
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ if (predictor_angle < 90) {
+ const int xstep =
+ GetDirectionalIntraPredictorDerivative(predictor_angle);
+ cur_directional_intra_pred_zone1_(intra_pred_mem_.dst, stride, top,
+ block_width_, block_height_,
+ xstep, upsampled_top);
+ } else if (predictor_angle < 180) {
+ const int xstep =
+ GetDirectionalIntraPredictorDerivative(180 - predictor_angle);
+ const int ystep =
+ GetDirectionalIntraPredictorDerivative(predictor_angle - 90);
+ cur_directional_intra_pred_zone2_(
+ intra_pred_mem_.dst, stride, top, left, block_width_,
+ block_height_, xstep, ystep, upsampled_top, upsampled_left);
+ } else {
+ ASSERT_LT(predictor_angle, 270);
+ const int ystep =
+ GetDirectionalIntraPredictorDerivative(270 - predictor_angle);
+ cur_directional_intra_pred_zone3_(intra_pred_mem_.dst, stride, left,
+ block_width_, block_height_,
+ ystep, upsampled_left);
+ }
+
+ if (!test_utils::CompareBlocks(
+ intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_,
+ block_height_, kMaxBlockSize, kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Expected " << kDirectionalPredNames[i]
+ << " (angle: " << predictor_angle
+ << " filter type: " << filter_type
+ << ") to produce a block containing '"
+ << static_cast<int>(kMaxPixel) << "'";
+ return;
+ }
+ }
+ }
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void DirectionalIntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+ const Pixel* const left = intra_pred_mem_.left_mem + 16;
+ const Pixel* const top = intra_pred_mem_.top_mem + 16;
+ // Use an alternate seed to differentiate this test from TestSpeed().
+ libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+
+ for (int i = kZone1; i < kNumZones; ++i) {
+ // Only run when there is a reference version (base) and a different
+ // optimized version (cur).
+ switch (i) {
+ case kZone1:
+ if (base_directional_intra_pred_zone1_ == nullptr ||
+ cur_directional_intra_pred_zone1_ == nullptr) {
+ continue;
+ }
+ break;
+ case kZone2:
+ if (base_directional_intra_pred_zone2_ == nullptr ||
+ cur_directional_intra_pred_zone2_ == nullptr) {
+ continue;
+ }
+ break;
+ case kZone3:
+ if (base_directional_intra_pred_zone3_ == nullptr ||
+ cur_directional_intra_pred_zone3_ == nullptr) {
+ continue;
+ }
+ break;
+ case kNumZones:
+ FAIL() << "Invalid zone value: " << i;
+ break;
+ }
+ int min_angle = 0, max_angle = 0;
+ ASSERT_NO_FATAL_FAILURE(
+ GetZoneAngleRange(static_cast<Zone>(i), &min_angle, &max_angle));
+
+ for (const auto& base_angle : kBaseAngles) {
+ for (int n = 0; n < 1000; ++n) {
+ for (int filter_type = 0; filter_type <= 1; ++filter_type) {
+ for (int angle_delta = kAngleDeltaStart;
+ angle_delta <= kAngleDeltaStop; angle_delta += kAngleDeltaStep) {
+ const int predictor_angle = base_angle + angle_delta;
+ if (predictor_angle <= min_angle || predictor_angle >= max_angle) {
+ continue;
+ }
+ ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle
+ << " angle_delta: " << angle_delta;
+
+ intra_pred_mem_.Reset(&rnd);
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+
+ const bool upsampled_left =
+ IsEdgeUpsampled(predictor_angle - 180, filter_type);
+ const bool upsampled_top =
+ IsEdgeUpsampled(predictor_angle - 90, filter_type);
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ if (predictor_angle < 90) {
+ const int xstep =
+ GetDirectionalIntraPredictorDerivative(predictor_angle);
+ base_directional_intra_pred_zone1_(
+ intra_pred_mem_.ref_src, stride, top, block_width_,
+ block_height_, xstep, upsampled_top);
+ cur_directional_intra_pred_zone1_(
+ intra_pred_mem_.dst, stride, top, block_width_, block_height_,
+ xstep, upsampled_top);
+ } else if (predictor_angle < 180) {
+ const int xstep =
+ GetDirectionalIntraPredictorDerivative(180 - predictor_angle);
+ const int ystep =
+ GetDirectionalIntraPredictorDerivative(predictor_angle - 90);
+ base_directional_intra_pred_zone2_(
+ intra_pred_mem_.ref_src, stride, top, left, block_width_,
+ block_height_, xstep, ystep, upsampled_top, upsampled_left);
+ cur_directional_intra_pred_zone2_(
+ intra_pred_mem_.dst, stride, top, left, block_width_,
+ block_height_, xstep, ystep, upsampled_top, upsampled_left);
+ } else {
+ ASSERT_LT(predictor_angle, 270);
+ const int ystep =
+ GetDirectionalIntraPredictorDerivative(270 - predictor_angle);
+ base_directional_intra_pred_zone3_(
+ intra_pred_mem_.ref_src, stride, left, block_width_,
+ block_height_, ystep, upsampled_left);
+ cur_directional_intra_pred_zone3_(
+ intra_pred_mem_.dst, stride, left, block_width_,
+ block_height_, ystep, upsampled_left);
+ }
+
+ if (!test_utils::CompareBlocks(
+ intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_,
+ block_height_, kMaxBlockSize, kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Result from optimized version of "
+ << kDirectionalPredNames[i]
+ << " differs from reference at angle "
+ << predictor_angle << " with filter type "
+ << filter_type << " in iteration #" << n;
+ return;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+using DirectionalIntraPredTest8bpp = DirectionalIntraPredTest<8, uint8_t>;
+
+const char* const* GetDirectionalIntraPredDigests8bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = {
+ "9cfc1da729ad08682e165826c29b280b",
+ "bb73539c7afbda7bddd2184723b932d6",
+ "9d2882800ffe948196e984a26a2da72c",
+ };
+ static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = {
+ "090efe6f83cc6fa301f65d3bbd5c38d2",
+ "d0fba4cdfb90f8bd293a94cae9db1a15",
+ "f7ad0eeab4389d0baa485d30fec87617",
+ };
+ static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = {
+ "1d32b33c75fe85248c48cdc8caa78d84",
+ "7000e18159443d366129a6cc6ef8fcee",
+ "06c02fac5f8575f687abb3f634eb0b4c",
+ };
+ static const char* const kDigests8x4[kNumDirectionalIntraPredictors] = {
+ "1b591799685bc135982114b731293f78",
+ "5cd9099acb9f7b2618dafa6712666580",
+ "d023883efede88f99c19d006044d9fa1",
+ };
+ static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = {
+ "f1e46ecf62a2516852f30c5025adb7ea",
+ "864442a209c16998065af28d8cdd839a",
+ "411a6e554868982af577de69e53f12e8",
+ };
+ static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = {
+ "89278302be913a85cfb06feaea339459",
+ "6c42f1a9493490cd4529fd40729cec3c",
+ "2516b5e1c681e5dcb1acedd5f3d41106",
+ };
+ static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = {
+ "aea7078f3eeaa8afbfe6c959c9e676f1",
+ "cad30babf12729dda5010362223ba65c",
+ "ff384ebdc832007775af418a2aae1463",
+ };
+ static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = {
+ "964a821c313c831e12f4d32e616c0b55",
+ "adf6dad3a84ab4d16c16eea218bec57a",
+ "a54fa008d43895e523474686c48a81c2",
+ };
+ static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = {
+ "fe2851b4e4f9fcf924cf17d50415a4c0",
+ "50a0e279c481437ff315d08eb904c733",
+ "0682065c8fb6cbf9be4949316c87c9e5",
+ };
+ static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = {
+ "ef15503b1943642e7a0bace1616c0e11",
+ "bf1a4d3f855f1072a902a88ec6ce0350",
+ "7e87a03e29cd7fd843fd71b729a18f3f",
+ };
+ static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = {
+ "f7b636615d2e5bf289b5db452a6f188d",
+ "e95858c532c10d00b0ce7a02a02121dd",
+ "34a18ccf58ef490f32268e85ce8c7de4",
+ };
+ static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = {
+ "b250099986c2fab9670748598058846b",
+ "f25d80af4da862a9b6b72979f1e17cb4",
+ "5347dc7bc346733b4887f6c8ad5e0898",
+ };
+ static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = {
+ "72e4c9f8af043b1cb1263490351818ab",
+ "1fc010d2df011b9e4e3d0957107c78df",
+ "f4cbfa3ca941ef08b972a68d7e7bafc4",
+ };
+ static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = {
+ "37e5a1aaf7549d2bce08eece9d20f0f6",
+ "6a2794025d0aca414ab17baa3cf8251a",
+ "63dd37a6efdc91eeefef166c99ce2db1",
+ };
+ static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = {
+ "198aabc958992eb49cceab97d1acb43e",
+ "aee88b6c8bacfcf38799fe338e6c66e7",
+ "01e8f8f96696636f6d79d33951907a16",
+ };
+ static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = {
+ "0611390202c4f90f7add7aec763ded58",
+ "960240c7ceda2ccfac7c90b71460578a",
+ "7e7d97594aab8ad56e8c01c340335607",
+ };
+ static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = {
+ "7e1f567e7fc510757f2d89d638bc826f",
+ "c929d687352ce40a58670be2ce3c8c90",
+ "f6881e6a9ba3c3d3d730b425732656b1",
+ };
+ static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = {
+ "27b4c2a7081d4139f22003ba8b6dfdf2",
+ "301e82740866b9274108a04c872fa848",
+ "98d3aa4fef838f4abf00dac33806659f",
+ };
+ static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = {
+ "b31816db8fade3accfd975b21aa264c7",
+ "2adce01a03b9452633d5830e1a9b4e23",
+ "7b988fadba8b07c36e88d7be6b270494",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize16x64:
+ return kDigests16x64;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ case kTransformSize32x64:
+ return kDigests32x64;
+ case kTransformSize64x16:
+ return kDigests64x16;
+ case kTransformSize64x32:
+ return kDigests64x32;
+ case kTransformSize64x64:
+ return kDigests64x64;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(DirectionalIntraPredTest8bpp, DISABLED_Speed) {
+#if LIBGAV1_ENABLE_NEON
+  const int num_runs = static_cast<int>(2e5 / (block_width_ * block_height_));
+#else
+ const int num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
+#endif
+ for (int i = kZone1; i < kNumZones; ++i) {
+ TestSpeed(GetDirectionalIntraPredDigests8bpp(tx_size_),
+ static_cast<Zone>(i), num_runs);
+ }
+}
+
+TEST_P(DirectionalIntraPredTest8bpp, FixedInput) {
+ for (int i = kZone1; i < kNumZones; ++i) {
+ TestSpeed(GetDirectionalIntraPredDigests8bpp(tx_size_),
+ static_cast<Zone>(i), 1);
+ }
+}
+
+TEST_P(DirectionalIntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(DirectionalIntraPredTest8bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using DirectionalIntraPredTest10bpp = DirectionalIntraPredTest<10, uint16_t>;
+
+const char* const* GetDirectionalIntraPredDigests10bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = {
+ "a683f4d7ccd978737615f61ecb4d638d",
+ "90c94374eaf7e9501f197863937b8639",
+ "0d3969cd081523ac6a906eecc7980c43",
+ };
+ static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = {
+ "c3ffa2979b325644e4a56c882fe27347",
+ "1f61f5ee413a9a3b8d1d93869ec2aee0",
+ "4795ea944779ec4a783408769394d874",
+ };
+ static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = {
+ "45c3282c9aa51024c1d64a40f230aa45",
+ "5cd47dd69f8bd0b15365a0c5cfc0a49a",
+ "06336c507b05f98c1d6a21abc43e6182",
+ };
+ static const char* const kDigests8x4[kNumDirectionalIntraPredictors] = {
+ "7370476ff0abbdc5e92f811b8879c861",
+ "a239a50adb28a4791b52a0dfff3bee06",
+ "4779a17f958a9ca04e8ec08c5aba1d36",
+ };
+ static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = {
+ "305463f346c376594f82aad8304e0362",
+ "0cd481e5bda286c87a645417569fd948",
+ "48c7899dc9b7163b0b1f61b3a2b4b73e",
+ };
+ static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = {
+ "5c18fd5339be90628c82b1fb6af50d5e",
+ "35eaa566ebd3bb7c903cfead5dc9ac78",
+ "9fdb0e790e5965810d02c02713c84071",
+ };
+ static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = {
+ "2168d6cc858c704748b7b343ced2ac3a",
+ "1d3ce273107447faafd2e55877e48ffb",
+ "d344164049d1fe9b65a3ae8764bbbd37",
+ };
+ static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = {
+ "dcef2cf51abe3fe150f388a14c762d30",
+ "6a810b289b1c14f8eab8ca1274e91ecd",
+ "c94da7c11f3fb11963d85c8804fce2d9",
+ };
+ static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = {
+ "50a0d08b0d99b7a574bad2cfb36efc39",
+ "2dcb55874db39da70c8ca1318559f9fe",
+ "6390bcd30ff3bc389ecc0a0952bea531",
+ };
+ static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = {
+ "7146c83c2620935606d49f3cb5876f41",
+ "2318ddf30c070a53c9b9cf199cd1b2c5",
+ "e9042e2124925aa7c1b6110617cb10e8",
+ };
+ static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = {
+ "c970f401de7b7c5bb4e3ad447fcbef8f",
+ "a18cc70730eecdaa31dbcf4306ff490f",
+ "32c1528ad4a576a2210399d6b4ccd46e",
+ };
+ static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = {
+ "00b3f0007da2e5d01380594a3d7162d5",
+ "1971af519e4a18967b7311f93efdd1b8",
+ "e6139769ce5a9c4982cfab9363004516",
+ };
+ static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = {
+ "08107ad971179cc9f465ae5966bd4901",
+ "b215212a3c0dfe9182c4f2e903d731f7",
+ "791274416a0da87c674e1ae318b3ce09",
+ };
+ static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = {
+ "94ea6cccae35b5d08799aa003ac08ccf",
+ "ae105e20e63fb55d4fd9d9e59dc62dde",
+ "973d0b2358ea585e4f486e7e645c5310",
+ };
+ static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = {
+ "d14c695c4853ddf5e5d8256bc1d1ed60",
+ "6bd0ebeb53adecc11442b1218b870cb7",
+ "e03bc402a9999aba8272275dce93e89f",
+ };
+ static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = {
+ "b21a8a8723758392ee659eeeae518a1e",
+ "e50285454896210ce44d6f04dfde05a7",
+ "f0f8ea0c6c2acc8d7d390927c3a90370",
+ };
+ static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = {
+ "ce51db16fd4fa56e601631397b098c89",
+ "aa87a8635e02c1e91d13158c61e443f6",
+ "4c1ee3afd46ef34bd711a34d0bf86f13",
+ };
+ static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = {
+ "25aaf5971e24e543e3e69a47254af777",
+ "eb6f444b3df127d69460778ab5bf8fc1",
+ "2f846cc0d506f90c0a58438600819817",
+ };
+ static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = {
+ "b26ce5b5f4b5d4a438b52e5987877fb8",
+ "35721a00a70938111939cf69988d928e",
+ "0af7ec35939483fac82c246a13845806",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize16x64:
+ return kDigests16x64;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ case kTransformSize32x64:
+ return kDigests32x64;
+ case kTransformSize64x16:
+ return kDigests64x16;
+ case kTransformSize64x32:
+ return kDigests64x32;
+ case kTransformSize64x64:
+ return kDigests64x64;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(DirectionalIntraPredTest10bpp, DISABLED_Speed) {
+#if LIBGAV1_ENABLE_NEON
+ const int num_runs = static_cast<int>(2e5 / (block_width_ * block_height_));
+#else
+ const int num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
+#endif
+ for (int i = kZone1; i < kNumZones; ++i) {
+ TestSpeed(GetDirectionalIntraPredDigests10bpp(tx_size_),
+ static_cast<Zone>(i), num_runs);
+ }
+}
+
+TEST_P(DirectionalIntraPredTest10bpp, FixedInput) {
+ for (int i = kZone1; i < kNumZones; ++i) {
+ TestSpeed(GetDirectionalIntraPredDigests10bpp(tx_size_),
+ static_cast<Zone>(i), 1);
+ }
+}
+
+TEST_P(DirectionalIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(DirectionalIntraPredTest10bpp, Random) { TestRandomValues(); }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using DirectionalIntraPredTest12bpp = DirectionalIntraPredTest<12, uint16_t>;
+
+const char* const* GetDirectionalIntraPredDigests12bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = {
+ "78f3297743f75e928e755b6ffa2d3050",
+ "7315da39861c6e3ef2e47c913e3be349",
+ "5609cb40b575f24d05880df202a60bd3",
+ };
+ static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = {
+ "efb2363d3c25427abe198806c8ba4d6b",
+ "b5aaa41665a10e7e7944fb7fc90fd59a",
+ "5a85610342339ca3109d775fa18dc25c",
+ };
+ static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = {
+ "9045679914980ea1f579d84509397b6e",
+ "f9f50bdc9f81a93095fd9d6998174aa7",
+ "46c1f82e85b8ba5b03bab41a2f561483",
+ };
+ static const char* const kDigests8x4[kNumDirectionalIntraPredictors] = {
+ "a0ae0956b2b667c528b7803d733d49da",
+ "5d9f60ef8904c4faedb6cfc19e54418a",
+ "4ffdcbbbcb23bca8286f1c286b9cb3e8",
+ };
+ static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = {
+ "086116c6b116613b8b47a086726566ea",
+ "141dca7fcae0e4d4b88887a618271ea1",
+ "3575a34278aa0fb1eed934290982f4a7",
+ };
+ static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = {
+ "7922f40216c78a40abaf675667e79493",
+ "55d20588240171df2e24d105ee1563ad",
+ "674b4d8f4dbf514d22e21cc4baeda1d3",
+ };
+ static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = {
+ "32d4d7e256d3b304026ddb5430cf6a09",
+ "72f4be2569f4e067c252d51ff4030de3",
+ "6779a132e1bac0ac43c2373f56553ed8",
+ };
+ static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = {
+ "1be2e0efc1403f9e22cfb8aeb28763d9",
+ "558c8a5418ac91d21a5839c454a9391f",
+ "7693ebef9b86416ebd6e78e98fcafba7",
+ };
+ static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = {
+ "e6217ed1c673ae42e84f8757316b580d",
+ "028aa582c11a9733f0cd693211a067c5",
+ "082de9fc7c4bc80a8ec8522b5a5cb52c",
+ };
+ static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = {
+ "e3b293c09bdc9c5c543ad046a3f0d64f",
+ "2de5803a6ed497c1039c8e6d675c1dd3",
+ "05742f807560f5d5206e54b70097dc4a",
+ };
+ static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = {
+ "57f2ca4ba56be253eff7e6b73df5003d",
+ "ef8bea00437e01fb798a22cda59f0191",
+ "989ff38c96600c2f108d6e6fa381fd13",
+ };
+ static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = {
+ "f5540f4874c02aa2222a3ba75106f841",
+ "17e5d20f798a96c39abc8a81e7aa7bc6",
+ "0fe9ea14c9dcae466b4a36f1c7db6978",
+ };
+ static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = {
+ "aff9429951ab1885c0d9ed29aa1b6a9f",
+ "4b686e2a879bf0b4aadd06b412e0eb48",
+ "39325d71cddc272bfa1dd2dc80d09ffe",
+ };
+ static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = {
+ "b83dffdf8bad2b7c3808925b6138ca1e",
+ "3656b58c7aaf2025979b4a3ed8a2841e",
+ "cfcc0c6ae3fa5e7d45dec581479459f6",
+ };
+ static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = {
+ "3c91b3b9e2df73ffb718e0bf53c5a5c2",
+ "0dbe27603e111158e70d99e181befb83",
+ "edecbffb32ae1e49b66b6e55ad0af6c6",
+ };
+ static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = {
+ "a3290917f755c7ccdc7b77eb3c6c89a7",
+ "42f89db41fbb366ddb78ef79a043f3e3",
+ "7f7bcbe33aa003b166677c68d12490e9",
+ };
+ static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = {
+ "d4f4c6b70a82695f843e9227bd7d9cc8",
+ "550a0bd87936801651d552e229b683e9",
+ "a4c730ad71f566a930c5672e1b2f48f1",
+ };
+ static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = {
+ "2087c9264c4c5fea9a6fe20dcedbe2b9",
+ "d4dd51d9578a3fc2eb75086fba867c22",
+ "6121a67d63e40107e780d0938aeb3d21",
+ };
+ static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = {
+ "09c3818a07bc54467634c2bfce66f58f",
+ "8da453b8d72d73d71ba15a14ddd59db4",
+ "9bc939aa54445722469b120b8a505cb3",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize16x64:
+ return kDigests16x64;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ case kTransformSize32x64:
+ return kDigests32x64;
+ case kTransformSize64x16:
+ return kDigests64x16;
+ case kTransformSize64x32:
+ return kDigests64x32;
+ case kTransformSize64x64:
+ return kDigests64x64;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(DirectionalIntraPredTest12bpp, DISABLED_Speed) {
+#if LIBGAV1_ENABLE_NEON
+ const int num_runs = static_cast<int>(2e7 / (block_width_ * block_height_));
+#else
+ const int num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
+#endif
+ for (int i = kZone1; i < kNumZones; ++i) {
+ TestSpeed(GetDirectionalIntraPredDigests12bpp(tx_size_),
+ static_cast<Zone>(i), num_runs);
+ }
+}
+
+TEST_P(DirectionalIntraPredTest12bpp, FixedInput) {
+ for (int i = kZone1; i < kNumZones; ++i) {
+ TestSpeed(GetDirectionalIntraPredDigests12bpp(tx_size_),
+ static_cast<Zone>(i), 1);
+ }
+}
+
+TEST_P(DirectionalIntraPredTest12bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(DirectionalIntraPredTest12bpp, Random) { TestRandomValues(); }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+constexpr TransformSize kTransformSizes[] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+ kTransformSize64x64};
+
+INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, DirectionalIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, DirectionalIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, DirectionalIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, DirectionalIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_NEON
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest12bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) {
+ return os << ToString(tx_size);
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_C
+
+// The recursive filter produces each pixel of a 4x2 sub-block with a distinct
+// 7-tap filter over the top-left, four top and two left neighboring pixels.
+// Each successive 4x2 uses the prediction output of the blocks above and to
+// the left, unless they are adjacent to the |top_row| or |left_column|. The
+// set of 8 filters is selected according to |pred|.
+template <int bitdepth, typename Pixel>
+void FilterIntraPredictor_C(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column,
+ const FilterIntraPredictor pred, const int width,
+ const int height) {
+ const int kMaxPixel = (1 << bitdepth) - 1;
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+
+ assert(width <= 32 && height <= 32);
+
+ Pixel buffer[3][33]; // cache 2 rows + top & left boundaries
+ memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0]));
+
+ auto* dst = static_cast<Pixel*>(dest);
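+  // |stride| is given in bytes; convert it to a count of Pixels.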
+ stride /= sizeof(Pixel);
+ int row0 = 0, row2 = 2;
+ int ystep = 1;
+ int y = 0;
+ do {
+ buffer[1][0] = left[y];
+ buffer[row2][0] = left[y + 1];
+ int x = 1;
+ do {
+ const Pixel p0 = buffer[row0][x - 1]; // top-left
+ const Pixel p1 = buffer[row0][x + 0]; // top 0
+ const Pixel p2 = buffer[row0][x + 1]; // top 1
+ const Pixel p3 = buffer[row0][x + 2]; // top 2
+ const Pixel p4 = buffer[row0][x + 3]; // top 3
+ const Pixel p5 = buffer[1][x - 1]; // left 0
+ const Pixel p6 = buffer[row2][x - 1]; // left 1
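+      // Each iteration of |i| produces one pixel of the 4x2 sub-block: i in
+      // [0, 3] fills the row at buffer[1], i in [4, 7] the row at
+      // buffer[1 + ystep].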
+ for (int i = 0; i < 8; ++i) {
+ const int xoffset = i & 0x03;
+ const int yoffset = (i >> 2) * ystep;
+ const int value = kFilterIntraTaps[pred][i][0] * p0 +
+ kFilterIntraTaps[pred][i][1] * p1 +
+ kFilterIntraTaps[pred][i][2] * p2 +
+ kFilterIntraTaps[pred][i][3] * p3 +
+ kFilterIntraTaps[pred][i][4] * p4 +
+ kFilterIntraTaps[pred][i][5] * p5 +
+ kFilterIntraTaps[pred][i][6] * p6;
+ // Section 7.11.2.3 specifies the right-hand side of the assignment as
+ // Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ).
+ // Since Clip1() clips a negative value to 0, it is safe to replace
+ // Round2Signed() with Round2().
+ buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>(
+ Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel));
+ }
+ x += 4;
+ } while (x < width);
+ memcpy(dst, &buffer[1][1], width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, &buffer[row2][1], width * sizeof(dst[0]));
+ dst += stride;
+
+ // The final row becomes the top for the next pass.
+ row0 ^= 2;
+ row2 ^= 2;
+ ystep = -ystep;
+ y += 2;
+ } while (y < height);
+}
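+
+// A minimal usage sketch (with hypothetical |top| and |left| buffers holding
+// the required neighboring pixels); note that |stride| is in bytes:
+//   uint8_t dst[8 * 8];
+//   FilterIntraPredictor_C<8, uint8_t>(dst, /*stride=*/8, top, left,
+//                                      kFilterIntraPredictorDc,
+//                                      /*width=*/8, /*height=*/8);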
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
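+  // Install the C implementation unconditionally when all C functions are
+  // forced on; otherwise install it only if no SIMD specialization has
+  // claimed the slot via LIBGAV1_Dsp8bpp_FilterIntraPredictor.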
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_FilterIntraPredictor
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<12, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+void IntraPredFilterInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_filter_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc. The
+// order of includes is important, as each header tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_filter_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor. This function is not thread-safe.
+void IntraPredFilterInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+
+const char* const kFilterIntraPredNames[kNumFilterIntraPredictors] = {
+ "kFilterIntraPredictorDc", "kFilterIntraPredictorVertical",
+ "kFilterIntraPredictorHorizontal", "kFilterIntraPredictorD157",
+ "kFilterIntraPredictorPaeth",
+};
+
+template <int bitdepth, typename Pixel>
+class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ IntraPredTestBase() {
+ switch (tx_size_) {
+ case kNumTransformSizes:
+ EXPECT_NE(tx_size_, kNumTransformSizes);
+ break;
+ default:
+ block_width_ = kTransformWidth[tx_size_];
+ block_height_ = kTransformHeight[tx_size_];
+ break;
+ }
+ }
+
+ IntraPredTestBase(const IntraPredTestBase&) = delete;
+ IntraPredTestBase& operator=(const IntraPredTestBase&) = delete;
+ ~IntraPredTestBase() override = default;
+
+ protected:
+ struct IntraPredMem {
+ void Reset(libvpx_test::ACMRandom* rnd) {
+ ASSERT_NE(rnd, nullptr);
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ const int mask = (1 << bitdepth) - 1;
+ for (auto& r : ref_src) r = rnd->Rand16() & mask;
+ for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask;
+ for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask;
+
+ // Some directional predictors require top-right, bottom-left.
+ for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = rnd->Rand16() & mask;
+ top[i] = rnd->Rand16() & mask;
+ }
+ // TODO(jzern): reorder this and regenerate the digests after switching
+ // random number generators.
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ left[-1] = rnd->Rand16() & mask;
+ left[-2] = rnd->Rand16() & mask;
+ top[-2] = rnd->Rand16() & mask;
+ memset(left_mem, 0, sizeof(left_mem[0]) * 14);
+ memset(top_mem, 0, sizeof(top_mem[0]) * 14);
+ memset(top_mem + kMaxBlockSize * 2 + 16, 0,
+ sizeof(top_mem[0]) * kTopMemPadding);
+ }
+
+ // Set ref_src, top-left, top and left to |pixel|.
+ void Set(const Pixel pixel) {
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ for (auto& r : ref_src) r = pixel;
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = top[i] = pixel;
+ }
+ }
+
+ // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+ static constexpr int kTopMemPadding = 7;
+ alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+ alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
+ alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+ alignas(
+ kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
+ };
+
+ void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+ const TransformSize tx_size_ = GetParam();
+ int block_width_;
+ int block_height_;
+ IntraPredMem intra_pred_mem_;
+};
+
+//------------------------------------------------------------------------------
+// FilterIntraPredTest
+
+template <int bitdepth, typename Pixel>
+class FilterIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ FilterIntraPredTest() = default;
+ FilterIntraPredTest(const FilterIntraPredTest&) = delete;
+ FilterIntraPredTest& operator=(const FilterIntraPredTest&) = delete;
+ ~FilterIntraPredTest() override = default;
+
+ protected:
+ using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+ using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+ void SetUp() override {
+ IntraPredTestBase<bitdepth, Pixel>::SetUp();
+ IntraPredFilterInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_filter_intra_pred_ = dsp->filter_intra_predictor;
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ // No need to compare C with itself.
+ base_filter_intra_pred_ = nullptr;
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ IntraPredFilterInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ IntraPredFilterInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ // Put the current architecture-specific implementation up for testing and
+ // comparison against C version.
+ cur_filter_intra_pred_ = dsp->filter_intra_predictor;
+ }
+
+ // These tests modify intra_pred_mem_.
+ void TestSpeed(const char* const digests[kNumFilterIntraPredictors],
+ int num_runs);
+ void TestSaturatedValues();
+ void TestRandomValues();
+
+ FilterIntraPredictorFunc base_filter_intra_pred_;
+ FilterIntraPredictorFunc cur_filter_intra_pred_;
+};
+
+template <int bitdepth, typename Pixel>
+void FilterIntraPredTest<bitdepth, Pixel>::TestSpeed(
+ const char* const digests[kNumFilterIntraPredictors], const int num_runs) {
+ ASSERT_NE(digests, nullptr);
+ const Pixel* const left = intra_pred_mem_.left_mem + 16;
+ const Pixel* const top = intra_pred_mem_.top_mem + 16;
+
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ intra_pred_mem_.Reset(&rnd);
+
+  // IntraPredFilterInit_C() leaves the function empty when a SIMD
+  // specialization is available.
+ if (cur_filter_intra_pred_ == nullptr) return;
+ for (int i = 0; i < kNumFilterIntraPredictors; ++i) {
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const absl::Time start = absl::Now();
+ for (int run = 0; run < num_runs; ++run) {
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left,
+ static_cast<FilterIntraPredictor>(i), block_width_,
+ block_height_);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(ToString(tx_size_), kFilterIntraPredNames[i],
+ digests[i], intra_pred_mem_.dst,
+ sizeof(intra_pred_mem_.dst), elapsed_time);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void FilterIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+ Pixel* const left = intra_pred_mem_.left_mem + 16;
+ Pixel* const top = intra_pred_mem_.top_mem + 16;
+ const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1);
+ intra_pred_mem_.Set(kMaxPixel);
+
+  // IntraPredFilterInit_C() leaves the function empty when a SIMD
+  // specialization is available.
+ if (cur_filter_intra_pred_ == nullptr) return;
+ for (int i = 0; i < kNumFilterIntraPredictors; ++i) {
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left,
+ static_cast<FilterIntraPredictor>(i), block_width_,
+ block_height_);
+ if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ block_width_, block_height_, kMaxBlockSize,
+ kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Expected " << kFilterIntraPredNames[i]
+ << " to produce a block containing '"
+ << static_cast<int>(kMaxPixel) << "'";
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void FilterIntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+ // Skip the 'C' test case as this is used as the reference.
+ if (base_filter_intra_pred_ == nullptr) return;
+
+ // Use an alternate seed to differentiate this test from TestSpeed().
+ libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+ for (int i = 0; i < kNumFilterIntraPredictors; ++i) {
+ // It may be worthwhile to temporarily increase this loop size when testing
+ // changes that specifically affect this test.
+ for (int n = 0; n < 10000; ++n) {
+ intra_pred_mem_.Reset(&rnd);
+
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const Pixel* const top = intra_pred_mem_.top_mem + 16;
+ const Pixel* const left = intra_pred_mem_.left_mem + 16;
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ base_filter_intra_pred_(intra_pred_mem_.ref_src, stride, top, left,
+ static_cast<FilterIntraPredictor>(i),
+ block_width_, block_height_);
+ cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left,
+ static_cast<FilterIntraPredictor>(i), block_width_,
+ block_height_);
+ if (!test_utils::CompareBlocks(
+ intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_,
+ block_height_, kMaxBlockSize, kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Result from optimized version of "
+ << kFilterIntraPredNames[i]
+ << " differs from reference in iteration #" << n;
+ break;
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+using FilterIntraPredTest8bpp = FilterIntraPredTest<8, uint8_t>;
+
+const char* const* GetFilterIntraPredDigests8bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumFilterIntraPredictors] = {
+ "a2486efcfb351d60a8941203073e89c6", "240716ae5ecaedc19edae1bdef49e05d",
+ "dacf4af66a966aca7c75abe24cd9ba99", "311888773676f3c2ae3334c4e0f141e5",
+ "2d3711616c8d8798f608e313cb07a72a",
+ };
+ static const char* const kDigests4x8[kNumFilterIntraPredictors] = {
+ "1cb74ba1abc68d936e87c13511ed5fbf", "d64c2c08586a762dbdfa8e1150bede06",
+ "73e9d1a9b6fa3e96fbd65c7dce507529", "e3ae17d9338e5aa3420d31d0e2d7ee87",
+ "750dbfe3bc5508b7031957a1d315b8bc",
+ };
+ static const char* const kDigests4x16[kNumFilterIntraPredictors] = {
+ "48a1060701bf68ec6342d6e24c10ef17", "0c91ff7988814d192ed95e840a87b4bf",
+ "efe586b891c8828c4116c9fbf50850cc", "a3bfa10be2b155826f107e9256ac3ba1",
+ "976273745b94a561fd52f5aa96fb280f",
+ };
+ static const char* const kDigests8x4[kNumFilterIntraPredictors] = {
+ "73f82633aeb28db1d254d077edefd8a9", "8eee505cdb5828e33b67ff5572445dac",
+ "9b0f101c28c66a916079fe5ed33b4021", "47fd44a7e5a5b55f067908192698e25c",
+ "eab59a3710d9bdeca8fa03a15d3f95d6",
+ };
+ static const char* const kDigests8x8[kNumFilterIntraPredictors] = {
+ "aa07b7a007c4c1d494ddb44a23c27bcd", "d27eee43f15dfcfe4c46cd46b681983b",
+ "1015d26022cf57acfdb11fd3f6b9ccb0", "4f0e00ef556fbcac2fb31e3b18869070",
+ "918c2553635763a0756b20154096bca6",
+ };
+ static const char* const kDigests8x16[kNumFilterIntraPredictors] = {
+ "a8ac58b2efb02092035cca206dbf5fbe", "0b22b000b7f124b32545bc86dd9f0142",
+ "cd6a08e023cad301c084b6ec2999da63", "c017f5f4fa5c05e7638ae4db98512b13",
+ "893e6995522e23ed3d613ef3797ca580",
+ };
+ static const char* const kDigests8x32[kNumFilterIntraPredictors] = {
+ "b3d5d4f09b778ae2b8cc0e9014c22320", "e473874a1e65228707489be9ca6477aa",
+ "91bda5a2d32780af345bb3d49324732f", "20f2ff26f004f02e8e2be49e6cadc32f",
+ "00c909b749e36142b133a7357271e83e",
+ };
+ static const char* const kDigests16x4[kNumFilterIntraPredictors] = {
+ "ef252f074fc3f5367748436e676e78ca", "cd436d8803ea40db3a849e7c869855c7",
+ "9cd8601b5d66e61fd002f8b11bfa58d9", "b982f17ee36ef0d1c2cfea20197d5666",
+ "9e350d1cd65d520194281633f566810d",
+ };
+ static const char* const kDigests16x8[kNumFilterIntraPredictors] = {
+ "9a7e0cf9b023a89ee619ee672ba2a219", "c20186bc642912ecd4d48bc4924a79b1",
+ "77de044f4c7f717f947a36fc0aa17946", "3f2fc68f11e6ee0220adb8d1ee085c8e",
+ "2f37e586769dfb88d9d4116b9c28c5ab",
+ };
+ static const char* const kDigests16x16[kNumFilterIntraPredictors] = {
+ "36c5b85b9a6b1d2e8f44f09c81adfe9c", "78494ce3a6a78aa2879ad2e24d43a005",
+ "aa30cd29a74407dbec80161745161eb2", "ae2a0975ef166e05e5e8c3701bd19e93",
+ "6322fba6f3bcb1f6c8e78160d200809c",
+ };
+ static const char* const kDigests16x32[kNumFilterIntraPredictors] = {
+ "82d54732c37424946bc73f5a78f64641", "071773c82869bb103c31e05f14ed3c2f",
+ "3a0094c150bd6e21ce1f17243b21e76b", "998ffef26fc65333ae407bbe9d41a252",
+ "6491add6b665aafc364c8c104a6a233d",
+ };
+ static const char* const kDigests32x8[kNumFilterIntraPredictors] = {
+ "c60062105dd727e94f744c35f0d2156e", "36a9e4d543701c4c546016e35e9c4337",
+ "05a8d07fe271023e63febfb44814d114", "0a28606925519d1ed067d64761619dc8",
+ "bb8c34b143910ba49b01d13e94d936ac",
+ };
+ static const char* const kDigests32x16[kNumFilterIntraPredictors] = {
+ "60e6caeec9194fcb409469e6e1393128", "5d764ead046443eb14f76822a569b056",
+ "b1bf22fcc282614354166fa1eb6e5f8b", "4b188e729fe49ae24100b3ddd8f17313",
+ "75f430fdea0b7b5b66866fd68a795a6a",
+ };
+ static const char* const kDigests32x32[kNumFilterIntraPredictors] = {
+ "5bb91a37b1979866eb23b59dd352229d", "589aa983109500749609d7be1cb79711",
+ "5e8fb1927cdbe21143494b56b5d400f6", "9e28f741d19c64b2a0577d83546d32d9",
+ "73c73237a5d891096066b186abf96854",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(FilterIntraPredTest8bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.5e8 / (block_width_ * block_height_));
+ TestSpeed(GetFilterIntraPredDigests8bpp(tx_size_), num_runs);
+}
+
+TEST_P(FilterIntraPredTest8bpp, FixedInput) {
+ TestSpeed(GetFilterIntraPredDigests8bpp(tx_size_), 1);
+}
+
+TEST_P(FilterIntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(FilterIntraPredTest8bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using FilterIntraPredTest10bpp = FilterIntraPredTest<10, uint16_t>;
+
+const char* const* GetFilterIntraPredDigests10bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumFilterIntraPredictors] = {
+ "13a9014d9e255cde8e3e85abf6ef5151", "aee33aa3f3baec87a8c019743fff40f1",
+ "fdd8ca2be424501f51fcdb603c2e757c", "aed00c082d1980d4bab45e9318b939f0",
+ "1b363db246aa5400f49479b7d5d41799",
+ };
+ static const char* const kDigests4x8[kNumFilterIntraPredictors] = {
+ "e718b9e31ba3da0392fd4b6cfba5d882", "31ba22989cdc3bb80749685f42c6c697",
+ "6bc5b3a55b94018117569cfdced17bf9", "ec29979fb4936116493dfa1cfc93901c",
+ "c6bcf564e63c42148d9917f089566432",
+ };
+ static const char* const kDigests4x16[kNumFilterIntraPredictors] = {
+ "404bddd88dff2c0414b5398287e54f18", "ff4fb3039cec6c9ffed6d259cbbfd854",
+ "7d6fa3ed9e728ff056a73c40bb6edeb6", "82845d942ad8048578e0037336905146",
+ "f3c07ea65db08c639136a5a9270f95ff",
+ };
+ static const char* const kDigests8x4[kNumFilterIntraPredictors] = {
+ "2008981638f27ba9123973a733e46c3d", "47efecf1f7628cbd8c22e168fcceb5ce",
+ "04c857ffbd1edd6e2788b17410a4a39c", "deb0236c4277b4d7b174fba407e1c9d7",
+ "5b58567f94ae9fa930f700c68c17399d",
+ };
+ static const char* const kDigests8x8[kNumFilterIntraPredictors] = {
+ "d9bab44a6d1373e758bfa0ee88239093", "29b10ddb32d9de2ff0cad6126f010ff6",
+ "1a03f9a18bdbab0811138cd969bf1f93", "e3273c24e77095ffa033a073f5bbcf7b",
+ "5187bb3df943d154cb01fb2f244ff86f",
+ };
+ static const char* const kDigests8x16[kNumFilterIntraPredictors] = {
+ "a2199f792634a56f1c4e88510e408773", "8fd8a98969d19832975ee7131cca9dbb",
+ "d897380941f75b04b1327e63f136d7d6", "d36f52a157027d53b15b7c02a7983436",
+ "0a8c23047b0364f5687b62b01f043359",
+ };
+ static const char* const kDigests8x32[kNumFilterIntraPredictors] = {
+ "5b74ea8e4f60151cf2db9b23d803a2e2", "e0d6bb5fa7d181589c31fcf2755d7c0b",
+ "42e590ffc88b8940b7aade22e13bbb6a", "e47c39ec1761aa7b5a9b1368ede7cfdc",
+ "6e963a89beac6f3a362c269d1017f9a8",
+ };
+ static const char* const kDigests16x4[kNumFilterIntraPredictors] = {
+ "9eaa079622b5dd95ad3a8feb68fa9bbb", "17e3aa6a0034e9eedcfc65b8ce6e7205",
+ "eac5a5337dbaf9bcbc3d320745c8e190", "c6ba9a7e518be04f725bc1dbd399c204",
+ "19020b82ce8bb49a511820c7e1d58e99",
+ };
+ static const char* const kDigests16x8[kNumFilterIntraPredictors] = {
+ "2d2c3255d5dfc1479a5d82a7d5a0d42e", "0fbb4ee851b4ee58c6d30dd820d19e38",
+ "fa77a1b056e8dc8efb702c7832531b32", "186269ca219dc663ad9b4a53e011a54b",
+ "c12180a6dcde0c3579befbb5304ff70b",
+ };
+ static const char* const kDigests16x16[kNumFilterIntraPredictors] = {
+ "dbb81d7ee7d3c83c271400d0160b2e83", "4da656a3ef238d90bb8339471a6fdb7e",
+ "d95006bf299b84a1b04e38d5fa8fb4f7", "742a03331f0fbd66c57df0ae31104aca",
+ "4d20aa440e38b6b7ac83c8c54d313169",
+ };
+ static const char* const kDigests16x32[kNumFilterIntraPredictors] = {
+ "6247730c93789cc25bcb837781dfa05b", "9a93e14b06dd145e35ab21a0353bdebe",
+ "6c5866353e30296a67d9bd7a65d6998d", "389d7f038d7997871745bb1305156ff9",
+ "e7640d81f891e1d06e7da75c6ae74d93",
+ };
+ static const char* const kDigests32x8[kNumFilterIntraPredictors] = {
+ "68f3a603b7c25dd78deffe91aef22834", "48c735e4aa951d6333d99e571bfeadc8",
+ "35239df0993a429fc599a3037c731e4b", "ba7dd72e04af1a1fc1b30784c11df783",
+ "78e9017f7434665d32ec59795aed0012",
+ };
+ static const char* const kDigests32x16[kNumFilterIntraPredictors] = {
+ "8cf2f11f7f77901cb0c522ad191eb998", "204c76d68c5117b89b5c3a05d5548883",
+ "f3751e41e7a595f43d8aaf9a40644e05", "81ea1a7d608d7b91dd3ede0f87e750ee",
+ "b5951334dfbe6229d828e03cd2d98538",
+ };
+ static const char* const kDigests32x32[kNumFilterIntraPredictors] = {
+ "9d8630188c3d1a4f28a6106e343c9380", "c6c92e059faa17163522409b7bf93230",
+ "62e4c959cb06ec661d98769981fbd555", "01e61673f11011571246668e36cc61c5",
+ "4530222ea1de546e202630fcf43f4526",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(FilterIntraPredTest10bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.5e8 / (block_width_ * block_height_));
+ TestSpeed(GetFilterIntraPredDigests10bpp(tx_size_), num_runs);
+}
+
+TEST_P(FilterIntraPredTest10bpp, FixedInput) {
+ TestSpeed(GetFilterIntraPredDigests10bpp(tx_size_), 1);
+}
+
+TEST_P(FilterIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using FilterIntraPredTest12bpp = FilterIntraPredTest<12, uint16_t>;
+
+const char* const* GetFilterIntraPredDigests12bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumFilterIntraPredictors] = {
+ "27682e2763f742e0c7156a263af54fe1", "f6fe9b73d8a2024b3125d25a42028be3",
+ "8a232b8caa41f8c4f0b547f0aa072fd7", "411b24dc872e91de3a607f18b51c4e34",
+ "9a106b70ca2df5317afc90aba0316a98",
+ };
+ static const char* const kDigests4x8[kNumFilterIntraPredictors] = {
+ "a0d3f3a8f498727af0844a6df90da971", "bb02998e3d5d7b4643db616a5ce75c51",
+ "eaa39425427c155dea1836c37fc14f7e", "747cc4fa0c9e3418f4a15ded9f846599",
+ "c1a2aeaa01dd3edac4c26f74e01d8d57",
+ };
+ static const char* const kDigests4x16[kNumFilterIntraPredictors] = {
+ "80c01fdef14e3db28987e323801c998e", "de5a2f59384a096324eebe843d4b8ba5",
+ "f85e18efc9297793392607cdd84d8bc4", "d84bf2d9d4996c2f7fd82b6bbd52577b",
+ "9d73771de09c17bd494f1f5f75ab1111",
+ };
+ static const char* const kDigests8x4[kNumFilterIntraPredictors] = {
+ "7df2b038c4d816eb4949de6b933f0632", "0f1c45dd6e8d5534de0c9a279087ea8b",
+ "1b79f3b10facd9ffc404cbafdd73aa43", "e19adec4f14d72c5157f9faf7fc9b23e",
+ "a30ed988ea6ed797d4bf0945ffe7e330",
+ };
+ static const char* const kDigests8x8[kNumFilterIntraPredictors] = {
+ "097a0c14d89ece69e779fa755a2b75c0", "ebadfc559b20246dcd8d74413ff4d088",
+ "097c91bedc1e703b3eb54361d94df59a", "765bbad37b91e644292beac5f06811be",
+ "f3c809461fa3325f0d33087ca79c47d0",
+ };
+ static const char* const kDigests8x16[kNumFilterIntraPredictors] = {
+ "36464af48b38005b61f7f528a0b0c8ba", "47fa0868224c71d28d3cdcf247282c13",
+ "ca34bb57a37ee3e5428814ec63f52117", "420bdca6b643f4421d465345cc264167",
+ "339c124c07a611a65952dc9996ba6e12",
+ };
+ static const char* const kDigests8x32[kNumFilterIntraPredictors] = {
+ "99ca0d3b3fbdd4661a2c07bdb2752a70", "6fedae1dbfe721210b65e08dc77847dd",
+ "956810089f81dc9334103111afec2fbb", "ede4f0bee06def6d8a2037939415d845",
+ "ca146dfe0edbdac3066a0ca387fb6277",
+ };
+ static const char* const kDigests16x4[kNumFilterIntraPredictors] = {
+ "b0f7d5dbf7f9aa3f0ab13273de80dc9d", "a3537f2b60426e9f83aeef973161fcfd",
+ "d4f868f793ab232bee17b49afcfc28a0", "fc43429761d10723b5f377eb6513e59a",
+ "f59aabb06574ce24e1d1113753edb098",
+ };
+ static const char* const kDigests16x8[kNumFilterIntraPredictors] = {
+ "0b539f1e2ecf0300bf3838ab1d80952c", "44f01a4324cda8d27ea44a8bd3620526",
+ "a57819a22b422e7da9d85f09504a2c57", "dbff6a417a8f3606575acb3c98efe091",
+ "534e8e8cd4b73cb4f6ec22f903727efa",
+ };
+ static const char* const kDigests16x16[kNumFilterIntraPredictors] = {
+ "247192bd6a5c2821b8694e4669361103", "1935044a6220ac6315a58b402465b6da",
+ "bdce29a3e988b804d429da1446a34c2a", "4697132c20395fabac2662cb8b1ce35a",
+ "3d07a7beaff6925175fcd9a8e69542e6",
+ };
+ static const char* const kDigests16x32[kNumFilterIntraPredictors] = {
+ "3429b83b7ba723bdd2e3e368979b51b0", "cd099d0eb7f4a20547f91d9402e3394a",
+ "a6a7cc4e0f8ed34424264107b3657fb8", "0125ace62bec7c7ff7240bf5b6f689c5",
+ "a0722dba921b078a6d569ecb81777bf8",
+ };
+ static const char* const kDigests32x8[kNumFilterIntraPredictors] = {
+ "44b1b086ee37a93406e5db95dca825d7", "fdeed5c4644dc288f6dcc148e8d2867a",
+ "b241d112f6fa7a24c44706fb76e49132", "a782dcf01a16231276dbd20121bad640",
+ "4da9c0efd0bcb31f911af52779317fb9",
+ };
+ static const char* const kDigests32x16[kNumFilterIntraPredictors] = {
+ "bf9704995a0a868c45280cac3415c0a7", "373626072ade7c8d709ab732149fd3ae",
+ "9e4a2062aa86ac8dc5164002c953c7ca", "62eede30996d0e55afcf513fe9ad3c58",
+ "a5f3bb32688d5189341304d12e4e6449",
+ };
+ static const char* const kDigests32x32[kNumFilterIntraPredictors] = {
+ "bd93c4ddbe0f06e3f12be25ce490f68c", "bfe772b203b83c982f35a8ed0682cd16",
+ "d357ae05ce215f4c5af650ae82909081", "bd640d3c511edaac1753b64c81afb75d",
+ "4d05d67e02a7c4af7ae981b0eb8a4d7b",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(FilterIntraPredTest12bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.5e8 / (block_width_ * block_height_));
+ TestSpeed(GetFilterIntraPredDigests12bpp(tx_size_), num_runs);
+}
+
+TEST_P(FilterIntraPredTest12bpp, FixedInput) {
+ TestSpeed(GetFilterIntraPredDigests12bpp(tx_size_), 1);
+}
+
+TEST_P(FilterIntraPredTest12bpp, Overflow) { TestSaturatedValues(); }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+// Filter-intra and Cfl predictors are available only for transform sizes
+// with max(width, height) <= 32.
+constexpr TransformSize kTransformSizesSmallerThan32x32[] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize32x8,
+ kTransformSize32x16, kTransformSize32x32};
+
+INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, FilterIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, FilterIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, FilterIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_ENABLE_NEON
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest12bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os,
+                                const TransformSize tx_size) {
+  return os << ToString(tx_size);
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_smooth.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int block_width, int block_height, typename Pixel>
+struct SmoothFuncs_C {
+ SmoothFuncs_C() = delete;
+
+ static void Smooth(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void SmoothHorizontal(void* dest, ptrdiff_t stride,
+ const void* top_row, const void* left_column);
+};
+
+constexpr uint8_t kSmoothWeights[] = {
+#include "src/dsp/smooth_weights.inc"
+};
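+
+// |kSmoothWeights| concatenates the weight arrays for block sizes 4 through
+// 64 in order. Because the preceding array lengths sum to
+// 4 + 8 + ... + n / 2 == n - 4, the weights for size |n| begin at offset
+// n - 4, which is why the predictors below index the table with
+// block_width - 4 and block_height - 4.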
+
+// SmoothFuncs_C::Smooth
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::Smooth(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel top_right = top[block_width - 1];
+ const Pixel bottom_left = left[block_height - 1];
+ static_assert(
+ block_width >= 4 && block_height >= 4,
+ "Weights for smooth predictor undefined for block width/height < 4");
+ const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+ const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
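+  // Each output pixel is the average of a vertical interpolation between
+  // top[x] and bottom_left and a horizontal interpolation between left[y]
+  // and top_right; the "+ 1" in the final shift performs the division by 2
+  // that averages the two estimates.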
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]);
+ uint32_t pred = weights_y[y] * top[x];
+ pred += weights_x[x] * left[y];
+ pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+ pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+ // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1)
+ // + 256. With the descale there's no need for saturation.
+ dst[x] = static_cast<Pixel>(
+ RightShiftWithRounding(pred, kSmoothWeightScale + 1));
+ }
+ dst += stride;
+ }
+}
+
+// SmoothFuncs_C::SmoothVertical
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothVertical(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel bottom_left = left[block_height - 1];
+ static_assert(block_height >= 4,
+ "Weights for smooth predictor undefined for block height < 4");
+ const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_y[y]);
+ uint32_t pred = weights_y[y] * top[x];
+ pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+ dst[x] =
+ static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+ }
+ dst += stride;
+ }
+}
+
+// SmoothFuncs_C::SmoothHorizontal
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel top_right = top[block_width - 1];
+ static_assert(block_width >= 4,
+ "Weights for smooth predictor undefined for block width < 4");
+ const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_x[x]);
+ uint32_t pred = weights_x[x] * left[y];
+ pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+ dst[x] =
+ static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+ }
+ dst += stride;
+ }
+}
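+
+// Unlike Smooth(), the single-direction variants blend only one weighted
+// pair per pixel, so they descale by kSmoothWeightScale alone instead of
+// kSmoothWeightScale + 1.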
+
+// -----------------------------------------------------------------------------
+
+template <typename Pixel>
+struct SmoothDefs {
+ SmoothDefs() = delete;
+
+ using _4x4 = SmoothFuncs_C<4, 4, Pixel>;
+ using _4x8 = SmoothFuncs_C<4, 8, Pixel>;
+ using _4x16 = SmoothFuncs_C<4, 16, Pixel>;
+ using _8x4 = SmoothFuncs_C<8, 4, Pixel>;
+ using _8x8 = SmoothFuncs_C<8, 8, Pixel>;
+ using _8x16 = SmoothFuncs_C<8, 16, Pixel>;
+ using _8x32 = SmoothFuncs_C<8, 32, Pixel>;
+ using _16x4 = SmoothFuncs_C<16, 4, Pixel>;
+ using _16x8 = SmoothFuncs_C<16, 8, Pixel>;
+ using _16x16 = SmoothFuncs_C<16, 16, Pixel>;
+ using _16x32 = SmoothFuncs_C<16, 32, Pixel>;
+ using _16x64 = SmoothFuncs_C<16, 64, Pixel>;
+ using _32x8 = SmoothFuncs_C<32, 8, Pixel>;
+ using _32x16 = SmoothFuncs_C<32, 16, Pixel>;
+ using _32x32 = SmoothFuncs_C<32, 32, Pixel>;
+ using _32x64 = SmoothFuncs_C<32, 64, Pixel>;
+ using _64x16 = SmoothFuncs_C<64, 16, Pixel>;
+ using _64x32 = SmoothFuncs_C<64, 32, Pixel>;
+ using _64x64 = SmoothFuncs_C<64, 64, Pixel>;
+};
+
+using Defs = SmoothDefs<uint8_t>;
+
+// Initializes dsp entries for kTransformSize|W|x|H| from |DEFS| of
+// the same size.
+#define INIT_SMOOTH_WxH(DEFS, W, H) \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] = \
+ DEFS::_##W##x##H::Smooth; \
+ dsp->intra_predictors[kTransformSize##W##x##H] \
+ [kIntraPredictorSmoothVertical] = \
+ DEFS::_##W##x##H::SmoothVertical; \
+ dsp->intra_predictors[kTransformSize##W##x##H] \
+ [kIntraPredictorSmoothHorizontal] = \
+ DEFS::_##W##x##H::SmoothHorizontal
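+
+// For example, INIT_SMOOTH_WxH(Defs, 4, 4) expands to three assignments that
+// install Defs::_4x4::Smooth, Defs::_4x4::SmoothVertical and
+// Defs::_4x4::SmoothHorizontal into dsp->intra_predictors[kTransformSize4x4].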
+
+#define INIT_SMOOTH(DEFS) \
+ INIT_SMOOTH_WxH(DEFS, 4, 4); \
+ INIT_SMOOTH_WxH(DEFS, 4, 8); \
+ INIT_SMOOTH_WxH(DEFS, 4, 16); \
+ INIT_SMOOTH_WxH(DEFS, 8, 4); \
+ INIT_SMOOTH_WxH(DEFS, 8, 8); \
+ INIT_SMOOTH_WxH(DEFS, 8, 16); \
+ INIT_SMOOTH_WxH(DEFS, 8, 32); \
+ INIT_SMOOTH_WxH(DEFS, 16, 4); \
+ INIT_SMOOTH_WxH(DEFS, 16, 8); \
+ INIT_SMOOTH_WxH(DEFS, 16, 16); \
+ INIT_SMOOTH_WxH(DEFS, 16, 32); \
+ INIT_SMOOTH_WxH(DEFS, 16, 64); \
+ INIT_SMOOTH_WxH(DEFS, 32, 8); \
+ INIT_SMOOTH_WxH(DEFS, 32, 16); \
+ INIT_SMOOTH_WxH(DEFS, 32, 32); \
+ INIT_SMOOTH_WxH(DEFS, 32, 64); \
+ INIT_SMOOTH_WxH(DEFS, 64, 16); \
+ INIT_SMOOTH_WxH(DEFS, 64, 32); \
+ INIT_SMOOTH_WxH(DEFS, 64, 64)
+
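+// When LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is off, each #ifndef below installs
+// the C implementation only if no
+// LIBGAV1_Dsp8bpp_TransformSizeWxH_IntraPredictorSmooth* define (pulled in
+// via intrapred_smooth.h) reports that a specialized version covers the
+// slot.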
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_SMOOTH(Defs);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ Defs::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ Defs::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ Defs::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ Defs::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ Defs::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ Defs::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ Defs::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ Defs::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ Defs::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ Defs::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ Defs::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ Defs::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ Defs::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ Defs::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ Defs::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ Defs::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ Defs::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ Defs::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ Defs::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ Defs::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ Defs::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ Defs::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ Defs::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ Defs::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ Defs::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ Defs::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ Defs::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ Defs::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ Defs::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ Defs::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ Defs::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ Defs::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ Defs::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ Defs::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ Defs::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ Defs::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ Defs::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ Defs::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x64::SmoothHorizontal;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+} // NOLINT(readability/fn_size)
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using DefsHbd = SmoothDefs<uint16_t>;
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_SMOOTH(DefsHbd);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ DefsHbd::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ DefsHbd::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ DefsHbd::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ DefsHbd::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ DefsHbd::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ DefsHbd::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ DefsHbd::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ DefsHbd::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ DefsHbd::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ DefsHbd::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ DefsHbd::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ DefsHbd::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ DefsHbd::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ DefsHbd::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ DefsHbd::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ DefsHbd::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ DefsHbd::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ DefsHbd::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ DefsHbd::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x64::SmoothHorizontal;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using DefsHbd = SmoothDefs<uint16_t>;
+
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_SMOOTH(DefsHbd);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ DefsHbd::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ DefsHbd::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ DefsHbd::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ DefsHbd::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ DefsHbd::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ DefsHbd::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ DefsHbd::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ DefsHbd::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ DefsHbd::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ DefsHbd::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ DefsHbd::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ DefsHbd::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ DefsHbd::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ DefsHbd::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ DefsHbd::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ DefsHbd::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ DefsHbd::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ DefsHbd::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ DefsHbd::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x64::SmoothHorizontal;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+#undef INIT_SMOOTH_WxH
+#undef INIT_SMOOTH
+} // namespace
+
+void IntraPredSmoothInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. Each module can use these values to determine, at
+// compile time, whether an implementation is needed.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_smooth_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes matters because each header tests for a superior
+// version before setting the base implementation.
+// clang-format off
+#include "src/dsp/x86/intrapred_smooth_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+enum {
+ // Weights are quadratic from '1' to '1 / block_size', scaled by
+ // 2^kSmoothWeightScale.
+ kSmoothWeightScale = 8,
+};
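+
+// Each complementary weight pair (w, (1 << kSmoothWeightScale) - w) sums to
+// 256, so a two-pixel blend returns to pixel range after
+// RightShiftWithRounding() by kSmoothWeightScale (or kSmoothWeightScale + 1
+// when two directional estimates are averaged, as in the Smooth predictor).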
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_C();
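+
+// A usage sketch (hypothetical caller code; |dest|, |stride|, |top_row| and
+// |left_column| are assumed to be set up by the caller):
+//   dsp::IntraPredSmoothInit_C();  // once, before any threads are started
+//   const dsp::Dsp* const table = dsp::GetDspTable(/*bitdepth=*/8);
+//   table->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth](
+//       dest, stride, top_row, left_column);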
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/intrapred_smooth.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+
+template <int bitdepth, typename Pixel>
+class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ IntraPredTestBase() {
+ switch (tx_size_) {
+ case kNumTransformSizes:
+ EXPECT_NE(tx_size_, kNumTransformSizes);
+ break;
+ default:
+ block_width_ = kTransformWidth[tx_size_];
+ block_height_ = kTransformHeight[tx_size_];
+ break;
+ }
+ }
+
+ IntraPredTestBase(const IntraPredTestBase&) = delete;
+ IntraPredTestBase& operator=(const IntraPredTestBase&) = delete;
+ ~IntraPredTestBase() override = default;
+
+ protected:
+ struct IntraPredMem {
+ void Reset(libvpx_test::ACMRandom* rnd) {
+ ASSERT_NE(rnd, nullptr);
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ const int mask = (1 << bitdepth) - 1;
+ for (auto& r : ref_src) r = rnd->Rand16() & mask;
+ for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask;
+ for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask;
+
+      // Some directional predictors require the top-right and bottom-left
+      // pixels.
+ for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = rnd->Rand16() & mask;
+ top[i] = rnd->Rand16() & mask;
+ }
+ // TODO(jzern): reorder this and regenerate the digests after switching
+ // random number generators.
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ left[-1] = rnd->Rand16() & mask;
+ left[-2] = rnd->Rand16() & mask;
+ top[-2] = rnd->Rand16() & mask;
+ memset(left_mem, 0, sizeof(left_mem[0]) * 14);
+ memset(top_mem, 0, sizeof(top_mem[0]) * 14);
+ memset(top_mem + kMaxBlockSize * 2 + 16, 0,
+ sizeof(top_mem[0]) * kTopMemPadding);
+ }
+
+ // Set ref_src, top-left, top and left to |pixel|.
+ void Set(const Pixel pixel) {
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ for (auto& r : ref_src) r = pixel;
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = top[i] = pixel;
+ }
+ }
+
+ // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+ static constexpr int kTopMemPadding = 7;
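+    // |left_mem| and |top_mem| reserve 16 pixels ahead of the |left|/|top|
+    // pointers handed to the predictors so that reads of left[-1]/top[-1]
+    // (extended to [-2] by upsampling) stay within the allocation; see
+    // Reset() and Set() above.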
+ alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+ alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
+ alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+ alignas(
+ kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
+ };
+
+ void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+ const TransformSize tx_size_ = GetParam();
+ int block_width_;
+ int block_height_;
+ IntraPredMem intra_pred_mem_;
+};
+
+//------------------------------------------------------------------------------
+// IntraPredTest
+
+template <int bitdepth, typename Pixel>
+class IntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ IntraPredTest() = default;
+ IntraPredTest(const IntraPredTest&) = delete;
+ IntraPredTest& operator=(const IntraPredTest&) = delete;
+ ~IntraPredTest() override = default;
+
+ protected:
+ using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+ using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+ void SetUp() override {
+ IntraPredTestBase<bitdepth, Pixel>::SetUp();
+ IntraPredInit_C();
+ IntraPredSmoothInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ memcpy(base_intrapreds_, dsp->intra_predictors[tx_size_],
+ sizeof(base_intrapreds_));
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ memset(base_intrapreds_, 0, sizeof(base_intrapreds_));
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ IntraPredInit_SSE4_1();
+ IntraPredSmoothInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ IntraPredInit_NEON();
+ IntraPredSmoothInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ memcpy(cur_intrapreds_, dsp->intra_predictors[tx_size_],
+ sizeof(cur_intrapreds_));
+
+ for (int i = 0; i < kNumIntraPredictors; ++i) {
+      // Skip functions that haven't been specialized for this particular
+      // architecture.
+ if (cur_intrapreds_[i] == base_intrapreds_[i]) {
+ cur_intrapreds_[i] = nullptr;
+ }
+ }
+ }
+
+ // These tests modify intra_pred_mem_.
+ void TestSpeed(const char* const digests[kNumIntraPredictors], int num_runs);
+ void TestSaturatedValues();
+ void TestRandomValues();
+
+ IntraPredictorFunc base_intrapreds_[kNumIntraPredictors];
+ IntraPredictorFunc cur_intrapreds_[kNumIntraPredictors];
+};
+
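+// In addition to timing, TestSpeed() verifies correctness: each output
+// block's MD5 digest is compared against the expected per-predictor digest
+// regardless of |num_runs|, which is why the FixedInput tests below call it
+// with a single run.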
+template <int bitdepth, typename Pixel>
+void IntraPredTest<bitdepth, Pixel>::TestSpeed(
+ const char* const digests[kNumIntraPredictors], const int num_runs) {
+ ASSERT_NE(digests, nullptr);
+ const auto* const left =
+ reinterpret_cast<const uint8_t*>(intra_pred_mem_.left_mem + 16);
+ const auto* const top =
+ reinterpret_cast<const uint8_t*>(intra_pred_mem_.top_mem + 16);
+
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ intra_pred_mem_.Reset(&rnd);
+
+ for (int i = 0; i < kNumIntraPredictors; ++i) {
+ if (cur_intrapreds_[i] == nullptr) continue;
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const absl::Time start = absl::Now();
+ for (int run = 0; run < num_runs; ++run) {
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(ToString(tx_size_),
+ ToString(static_cast<IntraPredictor>(i)),
+ digests[i], intra_pred_mem_.dst,
+ sizeof(intra_pred_mem_.dst), elapsed_time);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void IntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+ Pixel* const left = intra_pred_mem_.left_mem + 16;
+ Pixel* const top = intra_pred_mem_.top_mem + 16;
+ const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1);
+ intra_pred_mem_.Set(kMaxPixel);
+
+  // Skip DcFill.
+ for (int i = 1; i < kNumIntraPredictors; ++i) {
+ if (cur_intrapreds_[i] == nullptr) continue;
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left);
+ if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ block_width_, block_height_, kMaxBlockSize,
+ kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Expected " << ToString(static_cast<IntraPredictor>(i))
+ << " to produce a block containing '"
+ << static_cast<int>(kMaxPixel) << "'";
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void IntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+ // Use an alternate seed to differentiate this test from TestSpeed().
+ libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+ for (int i = 0; i < kNumIntraPredictors; ++i) {
+ // Skip the 'C' test case as this is used as the reference.
+ if (base_intrapreds_[i] == nullptr) continue;
+ if (cur_intrapreds_[i] == nullptr) continue;
+ // It may be worthwhile to temporarily increase this loop size when testing
+ // changes that specifically affect this test.
+ for (int n = 0; n < 10000; ++n) {
+ intra_pred_mem_.Reset(&rnd);
+
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const Pixel* const top = intra_pred_mem_.top_mem + 16;
+ const Pixel* const left = intra_pred_mem_.left_mem + 16;
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ base_intrapreds_[i](intra_pred_mem_.ref_src, stride, top, left);
+ cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left);
+ if (!test_utils::CompareBlocks(
+ intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_,
+ block_height_, kMaxBlockSize, kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Result from optimized version of "
+ << ToString(static_cast<IntraPredictor>(i))
+ << " differs from reference in iteration #" << n;
+ break;
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+using IntraPredTest8bpp = IntraPredTest<8, uint8_t>;
+
+const char* const* GetIntraPredDigests8bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumIntraPredictors] = {
+ "7b1c762e28747f885d2b7d83cb8aa75c", "73353f179207f1432d40a132809e3a50",
+ "80c9237c838b0ec0674ccb070df633d5", "1cd79116b41fda884e7fa047f5eb14df",
+ "33211425772ee539a59981a2e9dc10c1", "d6f5f65a267f0e9a2752e8151cc1dcd7",
+ "7ff8c762cb766eb0665682152102ce4b", "2276b861ae4599de15938651961907ec",
+ "766982bc69f4aaaa8e71014c2dc219bc", "e2c31b5fd2199c49e17c31610339ab3f",
+ };
+ static const char* const kDigests4x8[kNumIntraPredictors] = {
+ "0a0d8641ecfa0e82f541acdc894d5574", "1a40371af6cff9c278c5b0def9e4b3e7",
+ "3631a7a99569663b514f15b590523822", "646c7b592136285bd31501494e7393e7",
+ "ecbe89cc64dc2688123d3cfe865b5237", "79048e70ecbb7d43a4703f62718588c0",
+ "f3de11bf1198a00675d806d29c41d676", "32bb6cd018f6e871c342fcc21c7180cf",
+ "6f076a1e5ab3d69cf08811d62293e4be", "2a84460a8b189b4589824cf6b3b39954",
+ };
+ static const char* const kDigests4x16[kNumIntraPredictors] = {
+ "cb8240be98444ede5ae98ca94afc1557", "460acbcf825a1fa0d8f2aa6bf2d6a21c",
+ "7896fdbbfe538dce1dc3a5b0873d74b0", "504aea29c6b27f21555d5516b8de2d8a",
+ "c5738e7fa82b91ea0e39232120da56ea", "19abbd934c243a6d9df7585d81332dd5",
+ "9e42b7b342e45c842dfa8aedaddbdfaa", "0e9eb07a89f8bf96bc219d5d1c3d9f6d",
+ "659393c31633e0f498bae384c9df5c7b", "bee3a28312da99dd550ec309ae4fff25",
+ };
+ static const char* const kDigests8x4[kNumIntraPredictors] = {
+ "5950744064518f77867c8e14ebd8b5d7", "46b6cbdc76efd03f4ac77870d54739f7",
+ "efe21fd1b98cb1663950e0bf49483b3b", "3c647b64760b298092cbb8e2f5c06bfd",
+ "c3595929687ffb04c59b128d56e2632f", "d89ad2ddf8a74a520fdd1d7019fd75b4",
+ "53907cb70ad597ee5885f6c58201f98b", "09d2282a29008b7fb47eb60ed6653d06",
+ "e341fc1c910d7cb2dac5dbc58b9c9af9", "a8fabd4c259b607a90a2e4d18cae49de",
+ };
+ static const char* const kDigests8x8[kNumIntraPredictors] = {
+ "06fb7cb52719855a38b4883b4b241749", "2013aafd42a4303efb553e42264ab8b0",
+ "2f070511d5680c12ca73a20e47fd6e23", "9923705af63e454392625794d5459fe0",
+ "04007a0d39778621266e2208a22c4fac", "2d296c202d36b4a53f1eaddda274e4a1",
+ "c87806c220d125c7563c2928e836fbbd", "339b49710a0099087e51ab5afc8d8713",
+ "c90fbc020afd9327bf35dccae099bf77", "95b356a7c346334d29294a5e2d13cfd9",
+ };
+ static const char* const kDigests8x16[kNumIntraPredictors] = {
+ "3c5a4574d96b5bb1013429636554e761", "8cf56b17c52d25eb785685f2ab48b194",
+ "7911e2e02abfbe226f17529ac5db08fc", "064e509948982f66a14293f406d88d42",
+ "5c443aa713891406d5be3af4b3cf67c6", "5d2cb98e532822ca701110cda9ada968",
+ "3d58836e17918b8890012dd96b95bb9d", "20e8d61ddc451b9e553a294073349ffd",
+ "a9aa6cf9d0dcf1977a1853ccc264e40b", "103859f85750153f47b81f68ab7881f2",
+ };
+ static const char* const kDigests8x32[kNumIntraPredictors] = {
+ "b393a2db7a76acaccc39e04d9dc3e8ac", "bbda713ee075a7ef095f0f479b5a1f82",
+ "f337dce3980f70730d6f6c2c756e3b62", "796189b05dc026e865c9e95491b255d1",
+ "ea932c21e7189eeb215c1990491320ab", "a9fffdf9455eba5e3b01317cae140289",
+ "9525dbfdbf5fba61ef9c7aa5fe887503", "8c6a7e3717ff8a459f415c79bb17341c",
+ "3761071bfaa2363a315fe07223f95a2d", "0e5aeb9b3f485b90df750469f60c15aa",
+ };
+ static const char* const kDigests16x4[kNumIntraPredictors] = {
+ "1c0a950b3ac500def73b165b6a38467c", "95e7f7300f19da280c6a506e40304462",
+ "28a6af15e31f76d3ff189012475d78f5", "e330d67b859bceef62b96fc9e1f49a34",
+ "36eca3b8083ce2fb5f7e6227dfc34e71", "08f567d2abaa8e83e4d9b33b3f709538",
+ "dc2d0ba13aa9369446932f03b53dc77d", "9ab342944c4b1357aa79d39d7bebdd3a",
+ "77ec278c5086c88b91d68eef561ed517", "60fbe11bfe216c182aaacdec326c4dae",
+ };
+ static const char* const kDigests16x8[kNumIntraPredictors] = {
+ "053a2bc4b5b7287fee524af4e77f077a", "619b720b13f14f32391a99ea7ff550d5",
+ "728d61c11b06baf7fe77881003a918b9", "889997b89a44c9976cb34f573e2b1eea",
+ "b43bfc31d1c770bb9ca5ca158c9beec4", "9d3fe9f762e0c6e4f114042147c50c7f",
+ "c74fdd7c9938603b01e7ecf9fdf08d61", "870c7336db1102f80f74526bd5a7cf4e",
+ "3fd5354a6190903d6a0b661fe177daf6", "409ca6b0b2558aeadf5ef2b8a887e67a",
+ };
+ static const char* const kDigests16x16[kNumIntraPredictors] = {
+ "1fa9e2086f6594bda60c30384fbf1635", "2098d2a030cd7c6be613edc74dc2faf8",
+ "f3c72b0c8e73f1ddca04d14f52d194d8", "6b31f2ee24cf88d3844a2fc67e1f39f3",
+ "d91a22a83575e9359c5e4871ab30ddca", "24c32a0d38b4413d2ef9bf1f842c8634",
+ "6e9e47bf9da9b2b9ae293e0bbd8ff086", "968b82804b5200b074bcdba9718140d4",
+ "4e6d7e612c5ae0bbdcc51a453cd1db3f", "ce763a41977647d072f33e277d69c7b9",
+ };
+ static const char* const kDigests16x32[kNumIntraPredictors] = {
+ "01afd04432026ff56327d6226b720be2", "a6e7be906cc6f1e7a520151bfa7c303d",
+ "bc05c46f18d0638f0228f1de64f07cd5", "204e613e429935f721a5b29cec7d44bb",
+ "aa0a7c9a7482dfc06d9685072fc5bafd", "ffb60f090d83c624bb4f7dc3a630ac4f",
+ "36bcb9ca9bb5eac520b050409de25da5", "34d9a5dd3363668391bc3bd05b468182",
+ "1e149c28db8b234e43931c347a523794", "6e8aff02470f177c3ff4416db79fc508",
+ };
+ static const char* const kDigests16x64[kNumIntraPredictors] = {
+ "727797ef15ccd8d325476fe8f12006a3", "f77c544ac8035e01920deae40cee7b07",
+ "12b0c69595328c465e0b25e0c9e3e9fc", "3b2a053ee8b05a8ac35ad23b0422a151",
+ "f3be77c0fe67eb5d9d515e92bec21eb7", "f1ece6409e01e9dd98b800d49628247d",
+ "efd2ec9bfbbd4fd1f6604ea369df1894", "ec703de918422b9e03197ba0ed60a199",
+ "739418efb89c07f700895deaa5d0b3e3", "9943ae1bbeeebfe1d3a92dc39e049d63",
+ };
+ static const char* const kDigests32x8[kNumIntraPredictors] = {
+ "4da55401331ed98acec0c516d7307513", "0ae6f3974701a5e6c20baccd26b4ca52",
+ "79b799f1eb77d5189535dc4e18873a0e", "90e943adf3de4f913864dce4e52b4894",
+ "5e1b9cc800a89ef45f5bdcc9e99e4e96", "3103405df20d254cbf32ac30872ead4b",
+ "648550e369b77687bff3c7d6f249b02f", "f9f73bcd8aadfc059fa260325df957a1",
+ "204cef70d741c25d4fe2b1d10d2649a5", "04c05e18488496eba64100faa25e8baf",
+ };
+ static const char* const kDigests32x16[kNumIntraPredictors] = {
+ "86ad1e1047abaf9959150222e8f19593", "1908cbe04eb4e5c9d35f1af7ffd7ee72",
+ "6ad3bb37ebe8374b0a4c2d18fe3ebb6a", "08d3cfe7a1148bff55eb6166da3378c6",
+ "656a722394764d17b6c42401b9e0ad3b", "4aa00c192102efeb325883737e562f0d",
+ "9881a90ca88bca4297073e60b3bb771a", "8cd74aada398a3d770fc3ace38ecd311",
+ "0a927e3f5ff8e8338984172cc0653b13", "d881d68b4eb3ee844e35e04ad6721f5f",
+ };
+ static const char* const kDigests32x32[kNumIntraPredictors] = {
+ "1303ca680644e3d8c9ffd4185bb2835b", "2a4d9f5cc8da307d4cf7dc021df10ba9",
+ "ced60d3f4e4b011a6a0314dd8a4b1fd8", "ced60d3f4e4b011a6a0314dd8a4b1fd8",
+ "1464b01aa928e9bd82c66bad0f921693", "90deadfb13d7c3b855ba21b326c1e202",
+ "af96a74f8033dff010e53a8521bc6f63", "9f1039f2ef082aaee69fcb7d749037c2",
+ "3f82893e478e204f2d254b34222d14dc", "ddb2b95ffb65b84dd4ff1f7256223305",
+ };
+ static const char* const kDigests32x64[kNumIntraPredictors] = {
+ "e1e8ed803236367821981500a3d9eebe", "0f46d124ba9f48cdd5d5290acf786d6d",
+ "4e2a2cfd8f56f15939bdfc753145b303", "0ce332b343934b34cd4417725faa85cb",
+ "1d2f8e48e3adb7c448be05d9f66f4954", "9fb2e176636a5689b26f73ca73fcc512",
+ "e720ebccae7e25e36f23da53ae5b5d6a", "86fe4364734169aaa4520d799890d530",
+ "b1870290764bb1b100d1974e2bd70f1d", "ce5b238e19d85ef69d85badfab4e63ae",
+ };
+ static const char* const kDigests64x16[kNumIntraPredictors] = {
+ "de1b736e9d99129609d6ef3a491507a0", "516d8f6eb054d74d150e7b444185b6b9",
+ "69e462c3338a9aaf993c3f7cfbc15649", "821b76b1494d4f84d20817840f719a1a",
+ "fd9b4276e7affe1e0e4ce4f428058994", "cd82fd361a4767ac29a9f406b480b8f3",
+ "2792c2f810157a4a6cb13c28529ff779", "1220442d90c4255ba0969d28b91e93a6",
+ "c7253e10b45f7f67dfee3256c9b94825", "879792198071c7e0b50b9b5010d8c18f",
+ };
+ static const char* const kDigests64x32[kNumIntraPredictors] = {
+ "e48e1ac15e97191a8fda08d62fff343e", "80c15b303235f9bc2259027bb92dfdc4",
+ "538424b24bd0830f21788e7238ca762f", "a6c5aeb722615089efbca80b02951ceb",
+ "12604b37875533665078405ef4582e35", "0048afa17bd3e1632d68b96048836530",
+ "07a0cfcb56a5eed50c4bd6c26814336b", "529d8a070de5bc6531fa3ee8f450c233",
+ "33c50a11c7d78f72434064f634305e95", "e0ef7f0559c1a50ec5a8c12011b962f7",
+ };
+ static const char* const kDigests64x64[kNumIntraPredictors] = {
+ "a1650dbcd56e10288c3e269eca37967d", "be91585259bc37bf4dc1651936e90b3e",
+ "afe020786b83b793c2bbd9468097ff6e", "6e1094fa7b50bc813aa2ba29f5df8755",
+ "9e5c34f3797e0cdd3cd9d4c05b0d8950", "bc87be7ac899cc6a28f399d7516c49fe",
+ "9811fd0d2dd515f06122f5d1bd18b784", "3c140e466f2c2c0d9cb7d2157ab8dc27",
+ "9543de76c925a8f6adc884cc7f98dc91", "df1df0376cc944afe7e74e94f53e575a",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize16x64:
+ return kDigests16x64;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ case kTransformSize32x64:
+ return kDigests32x64;
+ case kTransformSize64x16:
+ return kDigests64x16;
+ case kTransformSize64x32:
+ return kDigests64x32;
+ case kTransformSize64x64:
+ return kDigests64x64;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(IntraPredTest8bpp, DISABLED_Speed) {
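+  // Scale the run count inversely with block area so every size does roughly
+  // the same amount of pixel work (~2 billion pixels): 125 million runs for
+  // 4x4 down to ~488k runs for 64x64.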
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetIntraPredDigests8bpp(tx_size_), num_runs);
+}
+
+TEST_P(IntraPredTest8bpp, FixedInput) {
+ TestSpeed(GetIntraPredDigests8bpp(tx_size_), 1);
+}
+
+TEST_P(IntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(IntraPredTest8bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using IntraPredTest10bpp = IntraPredTest<10, uint16_t>;
+
+const char* const* GetIntraPredDigests10bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumIntraPredictors] = {
+ "432bf9e762416bec582cb3654cbc4545", "8b9707ff4d506e0cb326f2d9a8d78705",
+ "a076275258cc5af87ed8b075136fb219", "f9587004012a8d2cecaa347331ccdf96",
+ "1c4e6890c5e6eed495fe54a6b6df8d6f", "0ae15fae8969a3c972ee895f325955a3",
+ "97db177738b831da8066df4f3fb7adbd", "4add5685b8a56991c9dce4ff7086ec25",
+ "75c6a655256188e378e70658b8f1631f", "14a27db20f9d5594ef74a7ea10c3e5ef",
+ };
+ static const char* const kDigests4x8[kNumIntraPredictors] = {
+ "9cbd7c18aca2737fa41db27150798819", "13d1e734692e27339c10b07da33c1113",
+ "0617cf74e2dd5d34ea517af1767fa47e", "c6a7b01228ccdf74af8528ef8f5f55c6",
+ "13b05d87b3d566b2f7a4b332cd8a762e", "b26ae0e8da1fe8989dfe2900fa2c3847",
+ "c30f3acdd386bdac91028fe48b751810", "04d2baf5192c5af97ca18d3b9b0d5968",
+ "a0ef82983822fc815bf1e8326cd41e33", "20bf218bae5f6b5c6d56b85f3f9bbadb",
+ };
+ static const char* const kDigests4x16[kNumIntraPredictors] = {
+ "d9b47bdddaa5e22312ff9ece7a3cae08", "cb76c79971b502dd8999a7047b3e2f86",
+ "3b09a3ff431d03b379acfdc444602540", "88608f6fcd687831e871053723cf76c3",
+ "a7bd2a17de1cf19c9a4b2c550f277a5c", "29b389f564f266a67687b8d2bc750418",
+ "4680847c30fe93c06f87e2ee1da544d6", "0e4eda11e1fe6ebe8526c2a2c5390bbb",
+ "bf3e20197282885acabb158f3a77ba59", "fccea71d1a253316b905f4a073c84a36",
+ };
+ static const char* const kDigests8x4[kNumIntraPredictors] = {
+ "05ba0ed96aac48cd94e7597f12184320", "d97d04e791904d3cedc34d5430a4d1d2",
+ "49217081a169c2d30b0a43f816d0b58b", "09e2a6a6bfe35b83e9434ee9c8dcf417",
+ "4b03c8822169ee4fa058513d65f0e32f", "cabdeebc923837ee3f2d3480354d6a81",
+ "957eda610a23a011ed25976aee94eaf0", "4a197e3dfce1f0d3870138a9b66423aa",
+ "18c0d0fbe0e96a0baf2f98fa1908cbb9", "21114e5737328cdbba9940e4f85a0855",
+ };
+ static const char* const kDigests8x8[kNumIntraPredictors] = {
+ "430e99eecda7e6434e1973dbdcc2a29d", "88864d7402c09b57735db49c58707304",
+ "8312f80b936380ceb51375e29a4fd75d", "472a7ed9c68bdbd9ecca197b7a8b3f01",
+ "4f66ee4dc0cb752c3b65d576cd06bb5c", "36383d6f61799143470129e2d5241a6f",
+ "c96279406c8d2d02771903e93a4e8d37", "4fb64f9700ed0bf08fbe7ab958535348",
+ "c008c33453ac9cf8c42ae6ec88f9941c", "39c401a9938b23e318ae7819e458daf1",
+ };
+ static const char* const kDigests8x16[kNumIntraPredictors] = {
+ "bda6b75fedfe0705f9732ff84c918672", "4ff130a47429e0762386557018ec10b2",
+ "8156557bf938d8e3a266318e57048fc5", "bdfa8e01a825ec7ae2d80519e3c94eec",
+ "108fc8e5608fe09f9cc30d7a52cbc0c1", "a2271660af5424b64c6399ca5509dee1",
+ "b09af9729f39516b28ff62363f8c0cb2", "4fe67869dac99048dfcf4d4e621884ec",
+ "311f498369a9c98f77a961bf91e73e65", "d66e78b9f41d5ee6a4b25e37ec9af324",
+ };
+ static const char* const kDigests8x32[kNumIntraPredictors] = {
+ "26c45325f02521e7e5c66c0aa0819329", "79dfb68513d4ccd2530c485f0367858e",
+ "8288e99b4d738b13956882c3ad3f03fe", "7c4993518b1620b8be8872581bb72239",
+ "2b1c3126012d981f787ed0a2601ee377", "051ba9f0c4d4fecb1fcd81fdea94cae4",
+ "320362239ad402087303a4df39512bb1", "210df35b2055c9c01b9e3e5ae24e524b",
+ "f8536db74ce68c0081bbd8799dac25f9", "27f2fe316854282579906d071af6b705",
+ };
+ static const char* const kDigests16x4[kNumIntraPredictors] = {
+ "decff67721ff7e9e65ec641e78f5ccf3", "99e3b2fbdabfa9b76b749cfb6530a9fd",
+ "accdb3d25629916963a069f1e1c0e061", "ad42855e9146748b0e235b8428487b4b",
+ "53025e465f267e7af2896ebd028447a0", "577d26fcd2d655cc77a1f1f875648699",
+ "7a61a3619267221b448b20723840e9f0", "fb4ccc569bdae3614e87bc5be1e84284",
+ "b866095d8a3e6910cc4f92f8d8d6075a", "6ba9013cba1624872bfbac111e8d344a",
+ };
+ static const char* const kDigests16x8[kNumIntraPredictors] = {
+ "2832156bd076c75f8be5622f34cb3efe", "da70e516f5a8842dd4965b80cd8d2a76",
+ "c3e137c6d79c57be2073d1eda22c8d1e", "8c5d28c7b3301b50326582dd7f89a175",
+ "9d8558775155b201cd178ab61458b642", "ecbddb9c6808e0c609c8fe537b7f7408",
+ "29a123c22cb4020170f9a80edf1208da", "653d0cd0688aa682334156f7b4599b34",
+ "1bfa66ae92a22a0346511db1713fe7df", "1802ad1e657e7fc08fc063342f471ca1",
+ };
+ static const char* const kDigests16x16[kNumIntraPredictors] = {
+ "2270c626de9d49769660ae9184a6428f", "9f069625cdcdd856e2e7ec19ff4fcd50",
+ "34167b9c413362a377aa7b1faf92ae6d", "3cec2b23d179765daea8dfb87c9efdd5",
+ "daa8f0863a5df2aef2b20999961cc8f8", "d9e4dd4bc63991e4f09cb97eb25f4db4",
+ "4e1a182fc3fcf5b9f5a73898f81c2004", "c58e4275406c9fd1c2a74b40c27afff0",
+ "b8092796fd4e4dd9d2b92afb770129ba", "75424d1f18ff00c4093743d033c6c9b6",
+ };
+ static const char* const kDigests16x32[kNumIntraPredictors] = {
+ "5aa050947f3d488537f5a68c23bb135b", "9e66143a2c3863b6fe171275a192d378",
+ "86b0c4777625e84d52913073d234f860", "9e2144fcf2107c76cec4241416bbecd5",
+ "c72be592efc72c3c86f2359b6f622aba", "c4e0e735545f78f43e21e9c39eab7b8f",
+ "52122e7c84a4bab67a8a359efb427023", "7b5fd8bb7e0744e81fd6fa4ed4c2e0fb",
+ "a9950d110bffb0411a8fcd1262dceef0", "2a2dd496f01f5d87f257ed202a703cbe",
+ };
+ static const char* const kDigests16x64[kNumIntraPredictors] = {
+ "eeb1b873e81ca428b11f162bd5b28843", "39ce7d22791f82562b0ca1e0afdf1604",
+ "6bd6bdac8982a4b84613f9963d35d5e9", "a9ac2438e87522621c7e6fe6d02c01ab",
+ "a8b9c471fe6c66ed0717e77fea77bba1", "e050b6aa38aee6e951d3be5a94a8abd0",
+ "3c5ecc31aa45e8175d37e90af247bca6", "30c0f9e412ea726970f575f910edfb94",
+ "f3d96395816ce58fb98480a5b4c32ab2", "9c14811957e013fb009dcd4a3716b338",
+ };
+ static const char* const kDigests32x8[kNumIntraPredictors] = {
+ "d6560d7fc9ae9bd7c25e2983b4a825e3", "90a67154bbdc26cd06ab0fa25fff3c53",
+ "c42d37c5a634e68fafc982626842db0b", "ecc8646d258cfa431facbc0dba168f80",
+ "9f3c167b790b52242dc8686c68eac389", "62dc3bc34406636ccec0941579461f65",
+ "5c0f0ebdb3c936d4decc40d5261aec7c", "dbfc0f056ca25e0331042da6d292e10a",
+ "14fa525d74e6774781198418d505c595", "5f95e70db03da9ed70cd79e23f19199c",
+ };
+ static const char* const kDigests32x16[kNumIntraPredictors] = {
+ "dfe3630aa9eeb1adcc8604269a309f26", "ba6180227d09f5a573f69dc6ee1faf80",
+ "03edea9d71ca3d588e1a0a69aecdf555", "2c8805415f44b4fac6692090dc1b1ddd",
+ "18efd17ed72a6e92ef8b0a692cf7a2e3", "63a6e0abfb839b43c68c23b2c43c8918",
+ "be15479205bb60f5a17baaa81a6b47ad", "243d21e1d9f9dd2b981292ac7769315a",
+ "21de1cb5269e0e1d08930c519e676bf7", "73065b3e27e9c4a3a6d043712d3d8b25",
+ };
+ static const char* const kDigests32x32[kNumIntraPredictors] = {
+ "c3136bb829088e33401b1affef91f692", "68bbcf93d17366db38bbc7605e07e322",
+ "2786be5fb7c25eeec4d2596c4154c3eb", "25ac7468e691753b8291be859aac7493",
+ "a6805ce21bfd26760e749efc8f590fa3", "5a38fd324b466e8ac43f5e289d38107e",
+ "dd0628fc5cc920b82aa941378fa907c8", "8debadbdb2dec3dc7eb43927e9d36998",
+ "61e1bc223c9e04c64152cc4531b6c099", "900b00ac1f20c0a8d22f8b026c0ee1cc",
+ };
+ static const char* const kDigests32x64[kNumIntraPredictors] = {
+ "5a591b2b83f0a6cce3c57ce164a5f983", "f42167ec516102b83b2c5176df57316b",
+ "58f3772d3df511c8289b340beb178d96", "c24166e7dc252d34ac6f92712956d751",
+ "7dca3acfe2ea09e6292a9ece2078b827", "5c029235fc0820804e40187d2b22a96e",
+ "375572944368afbc04ca97dab7fb3328", "8867235908736fd99c4022e4ed604e6e",
+ "63ec336034d62846b75558c49082870f", "46f35d85eb8499d61bfeac1c49e52531",
+ };
+ static const char* const kDigests64x16[kNumIntraPredictors] = {
+ "67755882209304659a0e6bfc324e16b9", "cd89b272fecb5f23431b3f606f590722",
+ "9bcff7d971a4af0a2d1cac6d66d83482", "d8d6bb55ebeec4f03926908d391e15ba",
+ "0eb5b5ced3e7177a1dd6a1e72e7a7d21", "92b47fe431d9cf66f9e601854f0f3017",
+ "7dc599557eddb2ea480f86fc89c76b30", "4f40175676c164320fe8005440ad9217",
+ "b00eacb24081a041127f136e9e5983ec", "cb0ab76a5e90f2eb75c38b99b9833ff8",
+ };
+ static const char* const kDigests64x32[kNumIntraPredictors] = {
+ "21d873011d1b4ef1daedd9aa8c6938ea", "4866da21db0261f738903d97081cb785",
+ "a722112233a82595a8d001a4078b834d", "24c7a133c6fcb59129c3782ef908a6c1",
+ "490e40505dd255d3a909d8a72c280cbc", "2afe719fb30bf2a664829bb74c8f9e2a",
+ "623adad2ebb8f23e355cd77ace4616cd", "d6092541e9262ad009bef79a5d350a86",
+ "ae86d8fba088683ced8abfd7e1ddf380", "32aa8aa21f2f24333d31f99e12b95c53",
+ };
+ static const char* const kDigests64x64[kNumIntraPredictors] = {
+ "6d88aeb40dfe3ac43c68808ca3c00806", "6a75d88ac291d6a3aaf0eec0ddf2aa65",
+ "30ef52d7dc451affdd587c209f5cb2dd", "e073f7969f392258eaa907cf0636452a",
+ "de10f07016a2343bcd3a9deb29f4361e", "dc35ff273fea4355d2c8351c2ed14e6e",
+ "01b9a545968ac75c3639ddabb837fa0b", "85c98ed9c0ea1523a15281bc9a909b8c",
+ "4c255f7ef7fd46db83f323806d79dca4", "fe2fe6ffb19cb8330e2f2534271d6522",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize16x64:
+ return kDigests16x64;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ case kTransformSize32x64:
+ return kDigests32x64;
+ case kTransformSize64x16:
+ return kDigests64x16;
+ case kTransformSize64x32:
+ return kDigests64x32;
+ case kTransformSize64x64:
+ return kDigests64x64;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(IntraPredTest10bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetIntraPredDigests10bpp(tx_size_), num_runs);
+}
+
+TEST_P(IntraPredTest10bpp, FixedInput) {
+ TestSpeed(GetIntraPredDigests10bpp(tx_size_), 1);
+}
+
+TEST_P(IntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(IntraPredTest10bpp, Random) { TestRandomValues(); }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using IntraPredTest12bpp = IntraPredTest<12, uint16_t>;
+
+const char* const* GetIntraPredDigests12bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumIntraPredictors] = {
+ "f7008e0f65bdeed97375ae5e98e3309b", "a34cc5d9d1ef875df4ee2ce010d0a80a",
+ "74f615beeb217ad317ced813851be36a", "b3312e86313805b061c66a08e09de653",
+ "2db47240c95530b39084bdacccf4bb8e", "76bb839cac394b5777c64b6d4b570a27",
+ "a74ee60527be86059e822f7463f49ad5", "b157a40aaa14391c237471ba6d148a50",
+ "d4f7bd2e97e2b23f7a6a059837a10b2a", "8a9bcb30e9aff59b6feef5d1bf546d28",
+ };
+ static const char* const kDigests4x8[kNumIntraPredictors] = {
+ "4c2a59e1d4a58c129c709f05d1a83f4a", "5fbedd99a90a20727195dfbe8f9969ad",
+ "d4645e21ccf5f6d3c4ca7a3d9b0156ba", "98aa17ea5423192c81a04afd2d2669ed",
+ "67dad5b5eefdeb2af1e4d3875b282c6c", "881dcafd6323509fb80cd5bbdf2870c4",
+ "03ece373dfd56bd2fd86ad00ad6f5000", "41b28f2578d2ed7f38e708d57b231948",
+ "9f935505190f52ff4da9556e43f607be", "815700d2abb055bce6902d130e77416d",
+ };
+ static const char* const kDigests4x16[kNumIntraPredictors] = {
+ "bfc47cd4eef143a6ebf517730756a718", "ef07a3af3e353f9dfaebc48c8ac92c82",
+ "ceec5d9d24254efd3c6a00cbf11dd24d", "4e07f512a69cf95608c3c0c3013ed808",
+ "cedb7c900bb6839026bf79d054edb4fc", "48d958a18a019809f12eb2ad2eb358bc",
+ "8f296f4b9fb621a910368609cc2cccdf", "073a6f2ca8a23d6131ff97e2a3b736e1",
+ "f4772cc60b68c4f958c08c0fd8eb8d48", "2f8946cf19abecf0fda3addbfb8f9dcf",
+ };
+ static const char* const kDigests8x4[kNumIntraPredictors] = {
+ "4f245b07a91e6d604da9f22cf277d6f1", "a6dc25d1e24ba9e842c312f67eea211d",
+ "0475204441f44ea95bfd69c6e04eaed8", "313bcf1e2fc762d31ff765d3c18a6f67",
+ "7e9223ece684a1885c2108741052c6c8", "79f1e6f070d9b1d0f1de2ff77bccc0dc",
+ "63adca1101ee4799b1cfa26d88aa0657", "e8b940a5e39ea5313930c903464de843",
+ "42a8e470d3b000f4f57c44c632f0051b", "e8a57663f73da3d4320f8e82a3fecfc2",
+ };
+ static const char* const kDigests8x8[kNumIntraPredictors] = {
+ "7fa3c8bdd9ce04dc4df27863499cf4d4", "83f1312edd9af928a1cef60613730bc3",
+ "ceb35042adc6095a545b490f20e5d81b", "73aa503f329a055ff59a24093e682c41",
+ "14a9a427525ec38d2eb13e698728e911", "9143ddf66234e74acc156565d684fcac",
+ "05182bbe4fd90f3b496033ee5b7c54f9", "d9c6184c23af1f5a903a4a00539b883a",
+ "c4c2d4000ca2defc7a8169215121d9fc", "0b938bc7782b32796bffece28d17bb69",
+ };
+ static const char* const kDigests8x16[kNumIntraPredictors] = {
+ "50197f063138616c37ef09f8bf8a3016", "ef2008f6d9f2176feb17b7d4312022e2",
+ "0d243ffbba0a2e65738d7ee768620c36", "51b52564a2733c2c56ba319db5d8e3b8",
+ "0e2b41482ac1347c3bb6d0e394fe7bec", "edb43c19850452e6b20dfb2c001adb0b",
+ "6cd29f537b5e4180f5aaefd9140b65ef", "6808f618bdff33e0f3d6db60ea487bc1",
+ "0303c17746192b0c52b4d75ea97ca24d", "225d1debd7828fa01bc9a610a443cda9",
+ };
+ static const char* const kDigests8x32[kNumIntraPredictors] = {
+ "dc047c402c6ac4014d621fbd41b460d5", "49eb33c3a112f059e02d6d4b99da8b41",
+ "c906c9105a406ae6c63e69f57ed2fc7c", "2ead452591ddd2455660f96ce79314ab",
+ "437a2a78562752ee8291227f88e0323a", "51834dbdcf1e89667ffbb931bec9006c",
+ "959c1778e11a7c61a5a97176c79ecb6a", "2e51e44dd1953fc6fccc3b1c1ca602ed",
+ "7f94114cddb0ba780cc0c8d00db3f8d2", "b5b3770e6061249a3206915a3f9464e7",
+ };
+ static const char* const kDigests16x4[kNumIntraPredictors] = {
+ "9deb173fa962d9adde8a9ae256708c32", "264624b41e43cfe9378ee9b4fb5028a6",
+ "404919a41bdc7f1a1f9d089223373bb8", "5294ed9fcc16eaf5f9a1f66a2a36ae7c",
+ "a2ed1fa4262bca265dcc62eb1586f0ac", "58494af62f86464dbe471130b2bc4ab0",
+ "fe1f25f7096fc3426cc7964326cc46ad", "cf7f6c8f7257436b9934cecf3b7523e1",
+ "6325036f243abfcd7777754e6a7bdacc", "9dce11a98e18422b04dd9d7be7d420da",
+ };
+ static const char* const kDigests16x8[kNumIntraPredictors] = {
+ "92d5b7d4033dcd8cb729bf8e166e339a", "6cbd9f198828fd3422c9bfaf8c2f1c1d",
+ "2b204014b6dc477f67b36818bcdab1ca", "2ce0b9cf224d4654168c559d7c1424c2",
+ "ec70341b9dd57b379f5283820c9461c7", "3fe1e2a20e44171c90ebca5a45b83460",
+ "0305852b25351ff472a45f45ec1638fa", "565c78271fbe3b25b0eee542095be005",
+ "8bc15e98659cef6236bcb072541bb2ca", "875c87bf4daba7cb436ea2fdb5a427dd",
+ };
+ static const char* const kDigests16x16[kNumIntraPredictors] = {
+ "c9d12bce78d8846f081345906e1315f4", "0b57c8fde6dec15458b1c289245100cb",
+ "1c11978c4e6bbc77767395c63d2f70a8", "e749f26b26b46d8cb7cb13c1c777db94",
+ "40459af05e865e94ff7adcdec1685c15", "f3ae419e99a60dbde3afa24ba6588a36",
+ "fe3912418bca24cee3132de2c193d1fc", "cdc8e3ce27a12f1cbfe01d1adf2eb6bd",
+ "ce354b30ce15a6918172dea55a292b93", "e762d01726d641194982a5fb8c148eb7",
+ };
+ static const char* const kDigests16x32[kNumIntraPredictors] = {
+ "ad8f118b07e053df3887215449633a07", "e8979aa743aef82937d93d87fc9fdb85",
+ "a8afb62cbf602cfcd4b570832afe1d55", "404183cf003764a4f032f0f4810cd42c",
+ "4afcf1bc5589a13b11679571aa953b86", "202df8f5a2d7eb3816de172608115f2b",
+ "ce42bca92d6d7f9df85dbaac72e35064", "61c463c8070b78ca2bdf578044fec440",
+ "3abf6e4d779208e15e3f9a0dfc0254f9", "13df5504084105af7c66a1b013fe44e1",
+ };
+ static const char* const kDigests16x64[kNumIntraPredictors] = {
+ "3ac1f642019493dec1b737d7a3a1b4e5", "cbf69d5d157c9f3355a4757b1d6e3414",
+ "96d00ddc7537bf7f196006591b733b4e", "8cba1b70a0bde29e8ef235cedc5faa7d",
+ "35f9ee300d7fa3c97338e81a6f21dcd4", "aae335442e77c8ebc280f16ea50ba9c7",
+ "a6140fdac2278644328be094d88731db", "2df93621b6ff100f7008432d509f4161",
+ "c77bf5aee39e7ed4a3dd715f816f452a", "02109bd63557d90225c32a8f1338258e",
+ };
+ static const char* const kDigests32x8[kNumIntraPredictors] = {
+ "155688dec409ff50f2333c14a6367247", "cf935e78abafa6ff7258c5af229f55b6",
+ "b4bf83a28ba319c597151a041ff838c3", "fe97f3e6cd5fe6c5979670c11d940dda",
+ "b898c9a989e1e72461a6f47e913d5383", "bb73baa6476ce90118e83e2fd08f2299",
+ "c93be6d8ec318bd805899466821bb779", "ab366991ef842e9d417d52241f6966e6",
+ "9e7e4c96a271e9e40771eac39c21f661", "9459f2e6d1291b8b8a2fe0635ce1a33d",
+ };
+ static const char* const kDigests32x16[kNumIntraPredictors] = {
+ "48374c1241409e26d81e5106c73da420", "97c918bdba2ece52156dbc776b9b70d4",
+ "a44ce9c03f6622a3e93bfe3b928eb6f1", "2384ad95e3e7302f20857121e187aa48",
+ "47e72c6dc0087b6fd99e91cff854c269", "142dc3cbb05b82a496780f7fc3d66ccc",
+ "4a39fb768efcd4f30d6eae816e6a68c4", "d0c31f9d52d984a0335557eafe2b47fa",
+ "81b3af5c7893729b837e4d304917f7cd", "941cbcd411887dc7fa3a5c7395690d1a",
+ };
+ static const char* const kDigests32x32[kNumIntraPredictors] = {
+ "00892ee43a1bbb11347c1f44fb94b1a2", "d66397ba868e62cec99daf5ea73bebd0",
+ "65fe746e79ac1e779caae8abcc15eb6b", "8e308fe96b9845112d79c54f9d7981a0",
+ "47bc8847a7c9aed3417cd5250ba57875", "1a4008b7f0f61a3c73a2ee1d1452b414",
+ "24d25ef488bb457a5a4c4892e47a363d", "6d9d964f5317ab32a8edf57c23775238",
+ "544fc36c1a35c588359ae492cb5bc143", "ac170d94dbd944e9723de9c18bace1a3",
+ };
+ static const char* const kDigests32x64[kNumIntraPredictors] = {
+ "7d0bd7dea26226741dbca9a97f27fa74", "a8bdc852ef704dd4975c61893e8fbc3f",
+ "f29d6d03c143ddf96fef04c19f2c8333", "ad9cfc395a5c5644a21d958c7274ac14",
+ "45c27c5cca9a91b6ae8379feb0881c9f", "8a0b78df1e001b85c874d686eac4aa1b",
+ "ce9fa75fac54a3f6c0cc3f2083b938f1", "c0dca10d88762c954af18dc9e3791a39",
+ "61df229eddfccab913b8fda4bb02f9ac", "4f4df6bc8d50a5600b573f0e44d70e66",
+ };
+ static const char* const kDigests64x16[kNumIntraPredictors] = {
+ "e99d072de858094c98b01bd4a6772634", "525da4b187acd81b1ff1116b60461141",
+ "1348f249690d9eefe09d9ad7ead2c801", "a5e2f9fb685d5f4a048e9a96affd25a4",
+ "873bfa9dc24693f19721f7c8d527f7d3", "0acfc6507bd3468e9679efc127d6e4b9",
+ "57d03f8d079c7264854e22ac1157cfae", "6c2c4036f70c7d957a9399b5436c0774",
+ "42b8e4a97b7f8416c72a5148c031c0b1", "a38a2c5f79993dfae8530e9e25800893",
+ };
+ static const char* const kDigests64x32[kNumIntraPredictors] = {
+ "68bd283cfd1a125f6b2ee47cee874d36", "b4581311a0a73d95dfac7f8f44591032",
+ "5ecc7fdc52d2f575ad4f2d0e9e6b1e11", "db9d82921fd88b24fdff6f849f2f9c87",
+ "804179f05c032908a5e36077bb87c994", "fc5fd041a8ee779015394d0c066ee43c",
+ "68f5579ccadfe9a1baafb158334a3db2", "fe237e45e215ab06d79046da9ad71e84",
+ "9a8a938a6824551bf7d21b8fd1d70ea1", "eb7332f2017cd96882c76e7136aeaf53",
+ };
+ static const char* const kDigests64x64[kNumIntraPredictors] = {
+ "d9a906c0e692b22e1b4414e71a704b7e", "12ac11889ae5f55b7781454efd706a6a",
+ "3f1ef5f473a49eba743f17a3324adf9d", "a6baa0d4bfb2269a94c7a38f86a4bccf",
+ "47d4cadd56f70c11ff8f3e5d8df81161", "de997744cf24c16c5ac2a36b02b351cc",
+ "23781211ae178ddeb6c4bb97a6bd7d83", "a79d2e28340ca34b9e37daabbf030f63",
+ "0372bd3ddfc258750a6ac106b70587f4", "228ef625d9460cbf6fa253a16a730976",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize16x64:
+ return kDigests16x64;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ case kTransformSize32x64:
+ return kDigests32x64;
+ case kTransformSize64x16:
+ return kDigests64x16;
+ case kTransformSize64x32:
+ return kDigests64x32;
+ case kTransformSize64x64:
+ return kDigests64x64;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(IntraPredTest12bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetIntraPredDigests12bpp(tx_size_), num_runs);
+}
+
+TEST_P(IntraPredTest12bpp, FixedInput) {
+ TestSpeed(GetIntraPredDigests12bpp(tx_size_), 1);
+}
+
+TEST_P(IntraPredTest12bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(IntraPredTest12bpp, Random) { TestRandomValues(); }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+constexpr TransformSize kTransformSizes[] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+ kTransformSize64x64};
+
+INSTANTIATE_TEST_SUITE_P(C, IntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, IntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, IntraPredTest12bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) {
+ return os << ToString(tx_size);
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#undef LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+#endif
+
+#if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \
+ LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+#include <cinttypes>
+#endif
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+constexpr uint8_t kTransformColumnShift = 4;
+
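+// Returns |value| as an int32_t; when transform range checking is enabled it
+// asserts that |value| is representable in |range| bits. For example,
+// RangeCheckValue(x, 16) checks that x lies in [-32768, 32767].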
+template <typename T>
+int32_t RangeCheckValue(T value, int8_t range) {
+#if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \
+ LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+  static_assert(
+      std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value, "");
+ assert(range <= 32);
+ const auto min = static_cast<int32_t>(-(uint32_t{1} << (range - 1)));
+ const auto max = static_cast<int32_t>((uint32_t{1} << (range - 1)) - 1);
+ if (min > value || value > max) {
+ LIBGAV1_DLOG(ERROR,
+ "coeff out of bit range, value: %" PRId64 " bit range %d",
+ static_cast<int64_t>(value), range);
+ assert(min <= value && value <= max);
+ }
+#endif // LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+ static_cast<void>(range);
+ return static_cast<int32_t>(value);
+}
+
+template <typename Residual>
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_C(Residual* const dst, int a,
+ int b, int angle, bool flip,
+ int8_t range) {
+ // Note that we multiply in 32 bits and then add/subtract the products in 64
+ // bits. The 32-bit multiplications do not overflow. Please see the comment
+ // and assert() in Cos128().
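+  // Conceptually this is a plane rotation by angle * pi / 128:
+  //   | x |   | cos  -sin | | dst[a] |
+  //   | y | = | sin   cos | | dst[b] |
+  // with Cos128()/Sin128() returning values scaled by 2^12, which the
+  // RightShiftWithRounding(..., 12) calls below remove.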
+ const int64_t x = static_cast<int64_t>(dst[a] * Cos128(angle)) -
+ static_cast<int64_t>(dst[b] * Sin128(angle));
+ const int64_t y = static_cast<int64_t>(dst[a] * Sin128(angle)) +
+ static_cast<int64_t>(dst[b] * Cos128(angle));
+ // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+ // values saved into the array T by this function are representable by a
+ // signed integer using |range| bits of precision.
+ dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+ dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
+
+template <typename Residual>
+void ButterflyRotationFirstIsZero_C(Residual* const dst, int a, int b,
+ int angle, bool flip, int8_t range) {
+ // Note that we multiply in 32 bits and then add/subtract the products in 64
+ // bits. The 32-bit multiplications do not overflow. Please see the comment
+ // and assert() in Cos128().
+ const auto x = static_cast<int64_t>(dst[b] * -Sin128(angle));
+ const auto y = static_cast<int64_t>(dst[b] * Cos128(angle));
+ // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+ // values saved into the array T by this function are representable by a
+ // signed integer using |range| bits of precision.
+ dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+ dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
+
+template <typename Residual>
+void ButterflyRotationSecondIsZero_C(Residual* const dst, int a, int b,
+ int angle, bool flip, int8_t range) {
+ // Note that we multiply in 32 bits and then add/subtract the products in 64
+ // bits. The 32-bit multiplications do not overflow. Please see the comment
+ // and assert() in Cos128().
+ const auto x = static_cast<int64_t>(dst[a] * Cos128(angle));
+ const auto y = static_cast<int64_t>(dst[a] * Sin128(angle));
+
+ // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+ // values saved into the array T by this function are representable by a
+ // signed integer using |range| bits of precision.
+ dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+ dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
+
+template <typename Residual>
+void HadamardRotation_C(Residual* const dst, int a, int b, bool flip,
+ int8_t range) {
+ if (flip) std::swap(a, b);
+ --range;
+ // For Adst and Dct, the maximum possible value for range is 20. So min and
+ // max should always fit into int32_t.
+ const int32_t min = -(1 << range);
+ const int32_t max = (1 << range) - 1;
+ const int32_t x = dst[a] + dst[b];
+ const int32_t y = dst[a] - dst[b];
+ dst[a] = Clip3(x, min, max);
+ dst[b] = Clip3(y, min, max);
+}
+
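+// For example, at bitdepth 12 the clamp below is [-2^17, 2^17 - 1]; at
+// bitdepth 10 it is the int16_t range [-2^15, 2^15 - 1]. 8-bit content uses
+// int16_t residuals and needs no explicit clamp.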
+template <int bitdepth, typename Residual>
+void ClampIntermediate(Residual* const dst, int size) {
+ // If Residual is int16_t (which implies bitdepth is 8), we don't need to
+ // clip residual[i][j] to 16 bits.
+ if (sizeof(Residual) > 2) {
+ const Residual intermediate_clamp_max =
+ (1 << (std::max(bitdepth + 6, 16) - 1)) - 1;
+ const Residual intermediate_clamp_min = -intermediate_clamp_max - 1;
+ for (int j = 0; j < size; ++j) {
+ dst[j] = Clip3(dst[j], intermediate_clamp_min, intermediate_clamp_max);
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+// The value at index (i, j) is computed by bit-reversing j and interpreting
+// the result as an integer with bit-length i + 2.
+// For example, index (2, 3) is computed as follows:
+//   * bitreverse(3) = bitreverse(..000011) = 110000...
+//   * interpreting that as an integer with bit-length 2+2 = 4 gives 1100 = 12
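+// A sketch of the equivalent computation (illustrative only; the table below
+// is kept precomputed):
+//   int BitReverse(int value, int bits) {
+//     int result = 0;
+//     for (int k = 0; k < bits; ++k) {
+//       result |= ((value >> k) & 1) << (bits - 1 - k);
+//     }
+//     return result;
+//   }
+//   // kBitReverseLookup[i][j] == BitReverse(j % (1 << (i + 2)), i + 2)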
+constexpr uint8_t kBitReverseLookup[kNumTransform1dSizes][64] = {
+ {0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2,
+ 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3,
+ 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3},
+ {0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5,
+ 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6,
+ 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7},
+ {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15},
+ {0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31},
+ {0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63}};
+
+template <typename Residual, int size_log2>
+void Dct_C(void* dest, int8_t range) {
+ static_assert(size_log2 >= 2 && size_log2 <= 6, "");
+ auto* const dst = static_cast<Residual*>(dest);
+ // stage 1.
+ const int size = 1 << size_log2;
+ Residual temp[size];
+ memcpy(temp, dst, sizeof(temp));
+ for (int i = 0; i < size; ++i) {
+ dst[i] = temp[kBitReverseLookup[size_log2 - 2][i]];
+ }
+  // stages 2-31 are dependent on the value of size_log2.
+ // stage 2.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 16; ++i) {
+ ButterflyRotation_C(dst, i + 32, 63 - i,
+ 63 - MultiplyBy4(kBitReverseLookup[2][i]), false,
+ range);
+ }
+ }
+  // stage 3.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 8; ++i) {
+ ButterflyRotation_C(dst, i + 16, 31 - i,
+ 6 + MultiplyBy8(kBitReverseLookup[1][7 - i]), false,
+ range);
+ }
+ }
+ // stage 4.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 16; ++i) {
+ HadamardRotation_C(dst, MultiplyBy2(i) + 32, MultiplyBy2(i) + 33,
+ static_cast<bool>(i & 1), range);
+ }
+ }
+ // stage 5.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(dst, i + 8, 15 - i,
+ 12 + MultiplyBy16(kBitReverseLookup[0][3 - i]), false,
+ range);
+ }
+ }
+ // stage 6.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 8; ++i) {
+ HadamardRotation_C(dst, MultiplyBy2(i) + 16, MultiplyBy2(i) + 17,
+ static_cast<bool>(i & 1), range);
+ }
+ }
+ // stage 7.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ButterflyRotation_C(
+ dst, 62 - MultiplyBy4(i) - j, MultiplyBy4(i) + j + 33,
+ 60 - MultiplyBy16(kBitReverseLookup[0][i]) + MultiplyBy64(j), true,
+ range);
+ }
+ }
+ }
+ // stage 8.
+ if (size_log2 >= 3) {
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(dst, i + 4, 7 - i, 56 - 32 * i, false, range);
+ }
+ }
+ // stage 9.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 4; ++i) {
+ HadamardRotation_C(dst, MultiplyBy2(i) + 8, MultiplyBy2(i) + 9,
+ static_cast<bool>(i & 1), range);
+ }
+ }
+ // stage 10.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ButterflyRotation_C(
+ dst, 30 - MultiplyBy4(i) - j, MultiplyBy4(i) + j + 17,
+ 24 + MultiplyBy64(j) + MultiplyBy32(1 - i), true, range);
+ }
+ }
+ }
+ // stage 11.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 8; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(dst, MultiplyBy4(i) + j + 32,
+ MultiplyBy4(i) - j + 35, static_cast<bool>(i & 1),
+ range);
+ }
+ }
+ }
+ // stage 12.
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(dst, MultiplyBy2(i), MultiplyBy2(i) + 1, 32 + 16 * i,
+ i == 0, range);
+ }
+ // stage 13.
+ if (size_log2 >= 3) {
+ for (int i = 0; i < 2; ++i) {
+ HadamardRotation_C(dst, MultiplyBy2(i) + 4, MultiplyBy2(i) + 5,
+ /*flip=*/i != 0, range);
+ }
+ }
+ // stage 14.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(dst, 14 - i, i + 9, 48 + 64 * i, true, range);
+ }
+ }
+ // stage 15.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(dst, MultiplyBy4(i) + j + 16,
+ MultiplyBy4(i) - j + 19, static_cast<bool>(i & 1),
+ range);
+ }
+ }
+ }
+ // stage 16.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ ButterflyRotation_C(
+ dst, 61 - MultiplyBy8(i) - j, MultiplyBy8(i) + j + 34,
+ 56 - MultiplyBy32(i) + MultiplyBy64(DivideBy2(j)), true, range);
+ }
+ }
+ }
+ // stage 17.
+ for (int i = 0; i < 2; ++i) {
+ HadamardRotation_C(dst, i, 3 - i, false, range);
+ }
+ // stage 18.
+ if (size_log2 >= 3) {
+ ButterflyRotation_C(dst, 6, 5, 32, true, range);
+ }
+ // stage 19.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(dst, MultiplyBy4(i) + j + 8, MultiplyBy4(i) - j + 11,
+ /*flip=*/i != 0, range);
+ }
+ }
+ }
+ // stage 20.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(dst, 29 - i, i + 18, 48 + 64 * DivideBy2(i), true,
+ range);
+ }
+ }
+ // stage 21.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ HadamardRotation_C(dst, MultiplyBy8(i) + j + 32,
+ MultiplyBy8(i) - j + 39, static_cast<bool>(i & 1),
+ range);
+ }
+ }
+ }
+ // stage 22.
+ if (size_log2 >= 3) {
+ for (int i = 0; i < 4; ++i) {
+ HadamardRotation_C(dst, i, 7 - i, false, range);
+ }
+ }
+ // stage 23.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(dst, 13 - i, i + 10, 32, true, range);
+ }
+ }
+ // stage 24.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ HadamardRotation_C(dst, MultiplyBy8(i) + j + 16,
+ MultiplyBy8(i) - j + 23, i == 1, range);
+ }
+ }
+ }
+ // stage 25.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 8; ++i) {
+ ButterflyRotation_C(dst, 59 - i, i + 36, (i < 4) ? 48 : 112, true, range);
+ }
+ }
+ // stage 26.
+ if (size_log2 >= 4) {
+ for (int i = 0; i < 8; ++i) {
+ HadamardRotation_C(dst, i, 15 - i, false, range);
+ }
+ }
+ // stage 27.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(dst, 27 - i, i + 20, 32, true, range);
+ }
+ }
+ // stage 28.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 8; ++i) {
+ HadamardRotation_C(dst, i + 32, 47 - i, false, range);
+ HadamardRotation_C(dst, i + 48, 63 - i, true, range);
+ }
+ }
+ // stage 29.
+ if (size_log2 >= 5) {
+ for (int i = 0; i < 16; ++i) {
+ HadamardRotation_C(dst, i, 31 - i, false, range);
+ }
+ }
+ // stage 30.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 8; ++i) {
+ ButterflyRotation_C(dst, 55 - i, i + 40, 32, true, range);
+ }
+ }
+ // stage 31.
+ if (size_log2 == 6) {
+ for (int i = 0; i < 32; ++i) {
+ HadamardRotation_C(dst, i, 63 - i, false, range);
+ }
+ }
+}
+
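+// When only the DC coefficient is nonzero, the full DCT above reduces to the
+// single rotation touching dst[0] (angle 32, i.e. cos(pi / 4) in the 2^12
+// fixed-point scale) followed by replicating the result across all
+// 1 << size_log2 outputs.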
+template <int bitdepth, typename Residual, int size_log2>
+void DctDcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row && should_round) {
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ }
+
+ ButterflyRotationSecondIsZero_C(dst, 0, 1, 32, true, range);
+
+ if (is_row && row_shift > 0) {
+ dst[0] = RightShiftWithRounding(dst[0], row_shift);
+ }
+
+ ClampIntermediate<bitdepth, Residual>(dst, 1);
+
+ const int size = 1 << size_log2;
+ for (int i = 1; i < size; ++i) {
+ dst[i] = dst[0];
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+
+/*
+ * Row transform max range in bits for bitdepths 8/10/12: 28/30/32.
+ * Column transform max range in bits for bitdepths 8/10/12: 28/28/30.
+ */
+template <typename Residual>
+void Adst4_C(void* dest, int8_t range) {
+ auto* const dst = static_cast<Residual*>(dest);
+ if ((dst[0] | dst[1] | dst[2] | dst[3]) == 0) {
+ return;
+ }
+
+ // stage 1.
+ // Section 7.13.2.6: It is a requirement of bitstream conformance that all
+ // values stored in the s and x arrays by this process are representable by
+ // a signed integer using range + 12 bits of precision.
+  // Note the intermediate value can only exceed INT32_MAX with invalid 12-bit
+  // content. For simplicity in unoptimized code, int64_t is used for both
+  // 10-bit and 12-bit content. SIMD implementations can allow these to roll
+  // over on platforms where this has defined behavior.
+ using Intermediate =
+ typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+ Intermediate s[7];
+ s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12);
+ s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12);
+ s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[1], range + 12);
+ s[3] = RangeCheckValue(kAdst4Multiplier[3] * dst[2], range + 12);
+ s[4] = RangeCheckValue(kAdst4Multiplier[0] * dst[2], range + 12);
+ s[5] = RangeCheckValue(kAdst4Multiplier[1] * dst[3], range + 12);
+ s[6] = RangeCheckValue(kAdst4Multiplier[3] * dst[3], range + 12);
+ // stage 2.
+ // Section 7.13.2.6: It is a requirement of bitstream conformance that
+ // values stored in the variable a7 by this process are representable by a
+ // signed integer using range + 1 bits of precision.
+ const int32_t a7 = RangeCheckValue(dst[0] - dst[2], range + 1);
+ // Section 7.13.2.6: It is a requirement of bitstream conformance that
+ // values stored in the variable b7 by this process are representable by a
+ // signed integer using |range| bits of precision.
+ const int32_t b7 = RangeCheckValue(a7 + dst[3], range);
+ // stage 3.
+ s[0] = RangeCheckValue(s[0] + s[3], range + 12);
+ s[1] = RangeCheckValue(s[1] - s[4], range + 12);
+ s[3] = s[2];
+  // With range checking enabled, b7 would be trapped above. This prevents an
+  // integer sanitizer warning. In SIMD implementations the multiply can be
+  // allowed to roll over on platforms where this has defined behavior.
+ const auto adst2_b7 = static_cast<Intermediate>(kAdst4Multiplier[2]) * b7;
+ s[2] = RangeCheckValue(adst2_b7, range + 12);
+ // stage 4.
+ s[0] = RangeCheckValue(s[0] + s[5], range + 12);
+ s[1] = RangeCheckValue(s[1] - s[6], range + 12);
+ // stages 5 and 6.
+ const Intermediate x0 = RangeCheckValue(s[0] + s[3], range + 12);
+ const Intermediate x1 = RangeCheckValue(s[1] + s[3], range + 12);
+ Intermediate x3 = RangeCheckValue(s[0] + s[1], range + 12);
+ x3 = RangeCheckValue(x3 - s[3], range + 12);
+ auto dst_0 = static_cast<int32_t>(RightShiftWithRounding(x0, 12));
+ auto dst_1 = static_cast<int32_t>(RightShiftWithRounding(x1, 12));
+ auto dst_2 = static_cast<int32_t>(RightShiftWithRounding(s[2], 12));
+ auto dst_3 = static_cast<int32_t>(RightShiftWithRounding(x3, 12));
+ if (sizeof(Residual) == 2) {
+ // If the first argument to RightShiftWithRounding(..., 12) is only
+ // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it
+ // in RightShiftWithRounding(..., 12) will cause the function to return
+ // 0x8000, which cannot be represented as an int16_t. Change it to 0x7fff.
+ dst_0 -= (dst_0 == 0x8000);
+ dst_1 -= (dst_1 == 0x8000);
+ dst_3 -= (dst_3 == 0x8000);
+ }
+ dst[0] = dst_0;
+ dst[1] = dst_1;
+ dst[2] = dst_2;
+ dst[3] = dst_3;
+}
+
+template <int bitdepth, typename Residual>
+void Adst4DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row && should_round) {
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ }
+
+ // stage 1.
+ // Section 7.13.2.6: It is a requirement of bitstream conformance that all
+ // values stored in the s and x arrays by this process are representable by
+ // a signed integer using range + 12 bits of precision.
+ int32_t s[3];
+ s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12);
+ s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12);
+ s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[0], range + 12);
+ // stage 3.
+ // stage 4.
+ // stages 5 and 6.
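+  // With only the DC coefficient nonzero, the general Adst4_C path collapses
+  // (illustrative restatement): b7 reduces to dst[0], stage 3 makes s[2] =
+  // kAdst4Multiplier[2] * dst[0], and the stage 5/6 outputs become x0 = s[0],
+  // x1 = s[1], and x3 = s[0] + s[1].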
+ int32_t dst_0 = RightShiftWithRounding(s[0], 12);
+ int32_t dst_1 = RightShiftWithRounding(s[1], 12);
+ int32_t dst_2 = RightShiftWithRounding(s[2], 12);
+ int32_t dst_3 =
+ RightShiftWithRounding(RangeCheckValue(s[0] + s[1], range + 12), 12);
+ if (sizeof(Residual) == 2) {
+ // If the first argument to RightShiftWithRounding(..., 12) is only
+ // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it
+ // in RightShiftWithRounding(..., 12) will cause the function to return
+ // 0x8000, which cannot be represented as an int16_t. Change it to 0x7fff.
+ dst_0 -= (dst_0 == 0x8000);
+ dst_1 -= (dst_1 == 0x8000);
+ dst_3 -= (dst_3 == 0x8000);
+ }
+ dst[0] = dst_0;
+ dst[1] = dst_1;
+ dst[2] = dst_2;
+ dst[3] = dst_3;
+
+ const int size = 4;
+ if (is_row && row_shift > 0) {
+ for (int j = 0; j < size; ++j) {
+ dst[j] = RightShiftWithRounding(dst[j], row_shift);
+ }
+ }
+
+ ClampIntermediate<bitdepth, Residual>(dst, 4);
+}
+
+template <typename Residual>
+void AdstInputPermutation(int32_t* LIBGAV1_RESTRICT const dst,
+ const Residual* LIBGAV1_RESTRICT const src, int n) {
+ assert(n == 8 || n == 16);
+ for (int i = 0; i < n; ++i) {
+ dst[i] = src[((i & 1) == 0) ? n - i - 1 : i - 1];
+ }
+}
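+
+// Illustrative expansion for n = 8: even i reads src[n - i - 1] (the odd
+// source positions in reverse) and odd i reads src[i - 1] (the even source
+// positions), so
+//   temp = { src[7], src[0], src[5], src[2], src[3], src[4], src[1], src[6] }.
+// In particular, the DC coefficient src[0] lands in temp[1].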
+
+constexpr int8_t kAdstOutputPermutationLookup[16] = {
+ 0, 8, 12, 4, 6, 14, 10, 2, 3, 11, 15, 7, 5, 13, 9, 1};
+
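+// For n = 8, the function below shifts each looked-up entry right by 1,
+// yielding the 8-point output order { 0, 4, 6, 2, 3, 7, 5, 1 } (illustrative;
+// only the first n lookup entries are used).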
+template <typename Residual>
+void AdstOutputPermutation(Residual* LIBGAV1_RESTRICT const dst,
+ const int32_t* LIBGAV1_RESTRICT const src, int n) {
+ assert(n == 8 || n == 16);
+ const auto shift = static_cast<int8_t>(n == 8);
+ for (int i = 0; i < n; ++i) {
+ const int8_t index = kAdstOutputPermutationLookup[i] >> shift;
+ int32_t dst_i = ((i & 1) == 0) ? src[index] : -src[index];
+ if (sizeof(Residual) == 2) {
+ // If i is odd and src[index] is -32768, dst_i will be 32768, which
+ // cannot be represented as an int16_t.
+ dst_i -= (dst_i == 0x8000);
+ }
+ dst[i] = dst_i;
+ }
+}
+
+template <typename Residual>
+void Adst8_C(void* dest, int8_t range) {
+ auto* const dst = static_cast<Residual*>(dest);
+ // stage 1.
+ int32_t temp[8];
+ AdstInputPermutation(temp, dst, 8);
+ // stage 2.
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 60 - 16 * i,
+ true, range);
+ }
+ // stage 3.
+ for (int i = 0; i < 4; ++i) {
+ HadamardRotation_C(temp, i, i + 4, false, range);
+ }
+ // stage 4.
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(temp, i * 3 + 4, i + 5, 48 - 32 * i, true, range);
+ }
+ // stage 5.
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(temp, i + MultiplyBy4(j), i + MultiplyBy4(j) + 2,
+ false, range);
+ }
+ }
+ // stage 6.
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true,
+ range);
+ }
+ // stage 7.
+ AdstOutputPermutation(dst, temp, 8);
+}
+
+template <int bitdepth, typename Residual>
+void Adst8DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ // stage 1.
+ int32_t temp[8];
+  // After the permutation, the dc value is in temp[1]. The remaining values
+  // are zero.
+ AdstInputPermutation(temp, dst, 8);
+
+ if (is_row && should_round) {
+ temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12);
+ }
+
+ // stage 2.
+ ButterflyRotationFirstIsZero_C(temp, 0, 1, 60, true, range);
+
+ // stage 3.
+ temp[4] = temp[0];
+ temp[5] = temp[1];
+
+ // stage 4.
+ ButterflyRotation_C(temp, 4, 5, 48, true, range);
+
+ // stage 5.
+ temp[2] = temp[0];
+ temp[3] = temp[1];
+ temp[6] = temp[4];
+ temp[7] = temp[5];
+
+ // stage 6.
+ ButterflyRotation_C(temp, 2, 3, 32, true, range);
+ ButterflyRotation_C(temp, 6, 7, 32, true, range);
+
+ // stage 7.
+ AdstOutputPermutation(dst, temp, 8);
+
+ const int size = 8;
+ if (is_row && row_shift > 0) {
+ for (int j = 0; j < size; ++j) {
+ dst[j] = RightShiftWithRounding(dst[j], row_shift);
+ }
+ }
+
+ ClampIntermediate<bitdepth, Residual>(dst, 8);
+}
+
+template <typename Residual>
+void Adst16_C(void* dest, int8_t range) {
+ auto* const dst = static_cast<Residual*>(dest);
+ // stage 1.
+ int32_t temp[16];
+ AdstInputPermutation(temp, dst, 16);
+ // stage 2.
+ for (int i = 0; i < 8; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 62 - 8 * i,
+ true, range);
+ }
+ // stage 3.
+ for (int i = 0; i < 8; ++i) {
+ HadamardRotation_C(temp, i, i + 8, false, range);
+ }
+ // stage 4.
+ for (int i = 0; i < 2; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy2(i) + 8, MultiplyBy2(i) + 9,
+ 56 - 32 * i, true, range);
+ ButterflyRotation_C(temp, MultiplyBy2(i) + 13, MultiplyBy2(i) + 12,
+ 8 + 32 * i, true, range);
+ }
+ // stage 5.
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ HadamardRotation_C(temp, i + MultiplyBy8(j), i + MultiplyBy8(j) + 4,
+ false, range);
+ }
+ }
+ // stage 6.
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ButterflyRotation_C(temp, i * 3 + MultiplyBy8(j) + 4,
+ i + MultiplyBy8(j) + 5, 48 - 32 * i, true, range);
+ }
+ }
+ // stage 7.
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ HadamardRotation_C(temp, i + MultiplyBy4(j), i + MultiplyBy4(j) + 2,
+ false, range);
+ }
+ }
+ // stage 8.
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true,
+ range);
+ }
+ // stage 9.
+ AdstOutputPermutation(dst, temp, 16);
+}
+
+template <int bitdepth, typename Residual>
+void Adst16DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
+ auto* const dst = static_cast<Residual*>(dest);
+
+ // stage 1.
+ int32_t temp[16];
+  // After the permutation, the dc value is in temp[1]. The remaining values
+  // are zero.
+ AdstInputPermutation(temp, dst, 16);
+
+ if (is_row && should_round) {
+ temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12);
+ }
+
+ // stage 2.
+ ButterflyRotationFirstIsZero_C(temp, 0, 1, 62, true, range);
+
+ // stage 3.
+ temp[8] = temp[0];
+ temp[9] = temp[1];
+
+ // stage 4.
+ ButterflyRotation_C(temp, 8, 9, 56, true, range);
+
+ // stage 5.
+ temp[4] = temp[0];
+ temp[5] = temp[1];
+ temp[12] = temp[8];
+ temp[13] = temp[9];
+
+ // stage 6.
+ ButterflyRotation_C(temp, 4, 5, 48, true, range);
+ ButterflyRotation_C(temp, 12, 13, 48, true, range);
+
+ // stage 7.
+ temp[2] = temp[0];
+ temp[3] = temp[1];
+ temp[10] = temp[8];
+ temp[11] = temp[9];
+
+ temp[6] = temp[4];
+ temp[7] = temp[5];
+ temp[14] = temp[12];
+ temp[15] = temp[13];
+
+ // stage 8.
+ for (int i = 0; i < 4; ++i) {
+ ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true,
+ range);
+ }
+
+ // stage 9.
+ AdstOutputPermutation(dst, temp, 16);
+
+ const int size = 16;
+ if (is_row && row_shift > 0) {
+ for (int j = 0; j < size; ++j) {
+ dst[j] = RightShiftWithRounding(dst[j], row_shift);
+ }
+ }
+
+ ClampIntermediate<bitdepth, Residual>(dst, 16);
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+//
+// In the spec, the inverse identity transform is followed by a Round2() call:
+// The row transforms with i = 0..(h-1) are applied as follows:
+// ...
+// * Otherwise, invoke the inverse identity transform process specified in
+// section 7.13.2.15 with the input variable n equal to log2W.
+// * Set Residual[ i ][ j ] equal to Round2( T[ j ], rowShift )
+// for j = 0..(w-1).
+// ...
+// The column transforms with j = 0..(w-1) are applied as follows:
+// ...
+// * Otherwise, invoke the inverse identity transform process specified in
+// section 7.13.2.15 with the input variable n equal to log2H.
+// * Residual[ i ][ j ] is set equal to Round2( T[ i ], colShift )
+// for i = 0..(h-1).
+//
+// Therefore, we define the identity transform functions to perform both the
+// inverse identity transform and the Round2() call. This has two advantages:
+// 1. The outputs of the inverse identity transform do not need to be stored
+// in the Residual array. They can be stored in int32_t local variables,
+// which have a larger range if Residual is an int16_t array.
+// 2. The inverse identity transform and the Round2() call can be jointly
+// optimized.
+//
+// The identity transform functions have the following prototype:
+// void Identity_C(void* dest, int8_t shift);
+//
+// The |shift| parameter is the amount of shift for the Round2() call. For row
+// transforms, |shift| is 0, 1, or 2. For column transforms, |shift| is always
+// 4. Therefore, an identity transform function can detect whether it is being
+// invoked as a row transform or a column transform by checking whether |shift|
+// is equal to 4.
+//
+// Input Range
+//
+// The inputs of row transforms, stored in the 2D array Dequant, are
+// representable by a signed integer using 8 + BitDepth bits of precision:
+// f. Dequant[ i ][ j ] is set equal to
+// Clip3( - ( 1 << ( 7 + BitDepth ) ), ( 1 << ( 7 + BitDepth ) ) - 1, dq2 ).
+//
+// The inputs of column transforms are representable by a signed integer using
+// Max( BitDepth + 6, 16 ) bits of precision:
+// Set the variable colClampRange equal to Max( BitDepth + 6, 16 ).
+// ...
+// Between the row and column transforms, Residual[ i ][ j ] is set equal to
+// Clip3( - ( 1 << ( colClampRange - 1 ) ),
+// ( 1 << (colClampRange - 1 ) ) - 1,
+// Residual[ i ][ j ] )
+// for i = 0..(h-1), for j = 0..(w-1).
+//
+// Output Range
+//
+// The outputs of row transforms are representable by a signed integer using
+// 8 + BitDepth + 1 = 9 + BitDepth bits of precision, because the net effect
+// of the multiplicative factor of inverse identity transforms minus the
+// smallest row shift is an increase of at most one bit.
+//
+// Transform | Multiplicative factor | Smallest row | Net increase
+// width | (in bits) | shift | in bits
+// ---------------------------------------------------------------
+// 4 | sqrt(2) (0.5 bits) | 0 | +0.5
+// 8 | 2 (1 bit) | 0 | +1
+// 16 | 2*sqrt(2) (1.5 bits) | 1 | +0.5
+// 32 | 4 (2 bits) | 1 | +1
+//
+// If BitDepth is 8 and Residual is an int16_t array, to avoid truncation we
+// clip the outputs (which have 17 bits of precision) to the range of int16_t
+// before storing them in the Residual array. This clipping happens to be the
+// same as the required clipping after the row transform (see the spec quoted
+// above), so we remain compliant with the spec. (In this case,
+// TransformLoop_C() skips clipping the outputs of row transforms to avoid
+// duplication of effort.)
+//
+// The outputs of column transforms are representable by a signed integer using
+// Max( BitDepth + 6, 16 ) + 2 - 4 = Max( BitDepth + 4, 14 ) bits of precision,
+// because the multiplicative factor of inverse identity transforms is at most
+// 4 (2 bits) and |shift| is always 4.
+
+template <typename Residual>
+void Identity4Row_C(void* dest, int8_t shift) {
+ // Note the intermediate value can only exceed 32 bits with 12-bit content.
+ // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+ using Intermediate =
+ typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+ assert(shift == 0 || shift == 1);
+ auto* const dst = static_cast<Residual*>(dest);
+ // If |shift| is 0, |rounding| should be 1 << 11. If |shift| is 1, |rounding|
+ // should be (1 + (1 << 1)) << 11. The following expression works for both
+ // values of |shift|.
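+  // Derivation (illustrative): this merges Round2(Round2(v, 12), shift) into
+  // a single shift by 12 + shift. The inner bias is 1 << 11; for shift == 1
+  // the outer bias of 1 scales up to 1 << 12, so the combined bias is
+  // (1 << 11) + (1 << 12) = (1 + (1 << 1)) << 11.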
+ const int32_t rounding = (1 + (shift << 1)) << 11;
+ for (int i = 0; i < 4; ++i) {
+ const auto intermediate =
+ static_cast<Intermediate>(dst[i]) * kIdentity4Multiplier;
+ int32_t dst_i =
+ static_cast<int32_t>((intermediate + rounding) >> (12 + shift));
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[i] = static_cast<Residual>(dst_i);
+ }
+}
+
+template <typename Residual>
+void Identity4Column_C(void* dest, int8_t /*shift*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+ for (int i = 0; i < 4; ++i) {
+ // The intermediate value here will have to fit into an int32_t for it to be
+ // bitstream conformant. The multiplication is promoted to int32_t by
+ // defining kIdentity4Multiplier as int32_t.
+ dst[i] = static_cast<Residual>((dst[i] * kIdentity4Multiplier + rounding) >>
+ (12 + kTransformColumnShift));
+ }
+}
+
+template <int bitdepth, typename Residual>
+void Identity4DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
+ // Note the intermediate value can only exceed 32 bits with 12-bit content.
+ // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+ using Intermediate =
+ typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row) {
+ if (should_round) {
+ const auto intermediate =
+ static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+ dst[0] = RightShiftWithRounding(intermediate, 12);
+ }
+
+ const int32_t rounding = (1 + (row_shift << 1)) << 11;
+ const auto intermediate =
+ static_cast<Intermediate>(dst[0]) * kIdentity4Multiplier;
+ int32_t dst_i =
+ static_cast<int32_t>((intermediate + rounding) >> (12 + row_shift));
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[0] = static_cast<Residual>(dst_i);
+
+ ClampIntermediate<bitdepth, Residual>(dst, 1);
+ return;
+ }
+
+ const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+ dst[0] = static_cast<Residual>((dst[0] * kIdentity4Multiplier + rounding) >>
+ (12 + kTransformColumnShift));
+}
+
+template <typename Residual>
+void Identity8Row_C(void* dest, int8_t shift) {
+ assert(shift == 0 || shift == 1 || shift == 2);
+ auto* const dst = static_cast<Residual*>(dest);
+ for (int i = 0; i < 8; ++i) {
+ int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[i]), shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[i] = static_cast<Residual>(dst_i);
+ }
+}
+
+template <typename Residual>
+void Identity8Column_C(void* dest, int8_t /*shift*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ for (int i = 0; i < 8; ++i) {
+ dst[i] = static_cast<Residual>(
+ RightShiftWithRounding(dst[i], kTransformColumnShift - 1));
+ }
+}
+
+template <int bitdepth, typename Residual>
+void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
+ // Note the intermediate value can only exceed 32 bits with 12-bit content.
+ // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+ using Intermediate =
+ typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row) {
+ if (should_round) {
+ const auto intermediate =
+ static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+ dst[0] = RightShiftWithRounding(intermediate, 12);
+ }
+
+ int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[0]), row_shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[0] = static_cast<Residual>(dst_i);
+
+ // If Residual is int16_t (which implies bitdepth is 8), we don't need to
+ // clip residual[i][j] to 16 bits.
+ if (sizeof(Residual) > 2) {
+ const Residual intermediate_clamp_max =
+ (1 << (std::max(bitdepth + 6, 16) - 1)) - 1;
+ const Residual intermediate_clamp_min = -intermediate_clamp_max - 1;
+ dst[0] = Clip3(dst[0], intermediate_clamp_min, intermediate_clamp_max);
+ }
+ return;
+ }
+
+ dst[0] = static_cast<Residual>(
+ RightShiftWithRounding(dst[0], kTransformColumnShift - 1));
+}
+
+template <typename Residual>
+void Identity16Row_C(void* dest, int8_t shift) {
+ assert(shift == 1 || shift == 2);
+ // Note the intermediate value can only exceed 32 bits with 12-bit content.
+ // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+ using Intermediate =
+ typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+ auto* const dst = static_cast<Residual*>(dest);
+ const int32_t rounding = (1 + (1 << shift)) << 11;
+ for (int i = 0; i < 16; ++i) {
+ const auto intermediate =
+ static_cast<Intermediate>(dst[i]) * kIdentity16Multiplier;
+ int32_t dst_i =
+ static_cast<int32_t>((intermediate + rounding) >> (12 + shift));
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[i] = static_cast<Residual>(dst_i);
+ }
+}
+
+template <typename Residual>
+void Identity16Column_C(void* dest, int8_t /*shift*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+ for (int i = 0; i < 16; ++i) {
+ // The intermediate value here will have to fit into an int32_t for it to be
+ // bitstream conformant. The multiplication is promoted to int32_t by
+ // defining kIdentity16Multiplier as int32_t.
+ dst[i] =
+ static_cast<Residual>((dst[i] * kIdentity16Multiplier + rounding) >>
+ (12 + kTransformColumnShift));
+ }
+}
+
+template <int bitdepth, typename Residual>
+void Identity16DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
+ // Note the intermediate value can only exceed 32 bits with 12-bit content.
+ // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+ using Intermediate =
+ typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row) {
+ if (should_round) {
+ const auto intermediate =
+ static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+ dst[0] = RightShiftWithRounding(intermediate, 12);
+ }
+
+ const int32_t rounding = (1 + (1 << row_shift)) << 11;
+ const auto intermediate =
+ static_cast<Intermediate>(dst[0]) * kIdentity16Multiplier;
+ int32_t dst_i =
+ static_cast<int32_t>((intermediate + rounding) >> (12 + row_shift));
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[0] = static_cast<Residual>(dst_i);
+
+ ClampIntermediate<bitdepth, Residual>(dst, 1);
+ return;
+ }
+
+ const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
+ dst[0] = static_cast<Residual>((dst[0] * kIdentity16Multiplier + rounding) >>
+ (12 + kTransformColumnShift));
+}
+
+template <typename Residual>
+void Identity32Row_C(void* dest, int8_t shift) {
+ assert(shift == 1 || shift == 2);
+ auto* const dst = static_cast<Residual*>(dest);
+ for (int i = 0; i < 32; ++i) {
+ int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[i]), shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[i] = static_cast<Residual>(dst_i);
+ }
+}
+
+template <typename Residual>
+void Identity32Column_C(void* dest, int8_t /*shift*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ for (int i = 0; i < 32; ++i) {
+ dst[i] = static_cast<Residual>(
+ RightShiftWithRounding(dst[i], kTransformColumnShift - 2));
+ }
+}
+
+template <int bitdepth, typename Residual>
+void Identity32DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
+ // Note the intermediate value can only exceed 32 bits with 12-bit content.
+ // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+ using Intermediate =
+ typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+ auto* const dst = static_cast<Residual*>(dest);
+
+ if (is_row) {
+ if (should_round) {
+ const auto intermediate =
+ static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+ dst[0] = RightShiftWithRounding(intermediate, 12);
+ }
+
+ int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[0]), row_shift);
+ if (sizeof(Residual) == 2) {
+ dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
+ }
+ dst[0] = static_cast<Residual>(dst_i);
+
+ ClampIntermediate<bitdepth, Residual>(dst, 1);
+ return;
+ }
+
+ dst[0] = static_cast<Residual>(
+ RightShiftWithRounding(dst[0], kTransformColumnShift - 2));
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+template <typename Residual>
+void Wht4_C(void* dest, int8_t shift) {
+ auto* const dst = static_cast<Residual*>(dest);
+ Residual temp[4];
+ temp[0] = dst[0] >> shift;
+ temp[2] = dst[1] >> shift;
+ temp[3] = dst[2] >> shift;
+ temp[1] = dst[3] >> shift;
+ temp[0] += temp[2];
+ temp[3] -= temp[1];
+ // This signed right shift must be an arithmetic shift.
+ Residual e = (temp[0] - temp[3]) >> 1;
+ dst[1] = e - temp[1];
+ dst[2] = e - temp[2];
+ dst[0] = temp[0] - dst[1];
+ dst[3] = temp[3] + dst[2];
+}
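+
+// Sanity example (illustrative, not from the original source): for the
+// DC-only input dst = { 4, 0, 0, 0 } with shift == 2, temp becomes
+// { 1, 0, 0, 0 }, e = (1 - 0) >> 1 = 0, and the outputs are { 1, 0, 0, 0 },
+// matching the Wht4DcOnly_C fast path below.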
+
+template <int bitdepth, typename Residual>
+void Wht4DcOnly_C(void* dest, int8_t range, bool /*should_round*/,
+ int /*row_shift*/, bool /*is_row*/) {
+ auto* const dst = static_cast<Residual*>(dest);
+ const int shift = range;
+
+ Residual temp = dst[0] >> shift;
+ // This signed right shift must be an arithmetic shift.
+ Residual e = temp >> 1;
+ dst[0] = temp - e;
+ dst[1] = e;
+ dst[2] = e;
+ dst[3] = e;
+
+ ClampIntermediate<bitdepth, Residual>(dst, 4);
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loop
+
+using InverseTransform1dFunc = void (*)(void* dst, int8_t range);
+using InverseTransformDcOnlyFunc = void (*)(void* dest, int8_t range,
+ bool should_round, int row_shift,
+ bool is_row);
+
+template <int bitdepth, typename Residual, typename Pixel,
+ Transform1d transform1d_type,
+ InverseTransformDcOnlyFunc dconly_transform1d,
+ InverseTransform1dFunc transform1d_func, bool is_row>
+void TransformLoop_C(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ constexpr bool lossless = transform1d_type == kTransform1dWht;
+ constexpr bool is_identity = transform1d_type == kTransform1dIdentity;
+ // The transform size of the WHT is always 4x4. Setting tx_width and
+ // tx_height to the constant 4 for the WHT speeds the code up.
+ assert(!lossless || tx_size == kTransformSize4x4);
+ const int tx_width = lossless ? 4 : kTransformWidth[tx_size];
+ const int tx_height = lossless ? 4 : kTransformHeight[tx_size];
+ const int tx_width_log2 = kTransformWidthLog2[tx_size];
+ const int tx_height_log2 = kTransformHeightLog2[tx_size];
+ auto* frame = static_cast<Array2DView<Pixel>*>(dst_frame);
+
+ // Initially this points to the dequantized values. After the transforms are
+ // applied, this buffer contains the residual.
+ Array2DView<Residual> residual(tx_height, tx_width,
+ static_cast<Residual*>(src_buffer));
+
+ if (is_row) {
+ // Row transform.
+ const uint8_t row_shift = lossless ? 0 : kTransformRowShift[tx_size];
+ // This is the |range| parameter of the InverseTransform1dFunc. For lossy
+ // transforms, this will be equal to the clamping range.
+ const int8_t row_clamp_range = lossless ? 2 : (bitdepth + 8);
+ // If the width:height ratio of the transform size is 2:1 or 1:2, multiply
+ // the input to the row transform by 1 / sqrt(2), which is approximated by
+ // the fraction 2896 / 2^12.
+ const bool should_round = std::abs(tx_width_log2 - tx_height_log2) == 1;
+
+ if (adjusted_tx_height == 1) {
+ dconly_transform1d(residual[0], row_clamp_range, should_round, row_shift,
+ true);
+ return;
+ }
+
+    // Row transforms need to be done only for the first 32 rows because the
+    // remaining rows are always zero when |tx_height| is 64. Otherwise, only
+    // the rows that contain nonzero coefficients are processed.
+ for (int i = 0; i < adjusted_tx_height; ++i) {
+ // If lossless, the transform size is 4x4, so should_round is false.
+ if (!lossless && should_round) {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ for (int j = 0; j < std::min(tx_width, 32); ++j) {
+ residual[i][j] = RightShiftWithRounding(
+ residual[i][j] * kTransformRowMultiplier, 12);
+ }
+ }
+ // For identity transform, |transform1d_func| also performs the
+ // Round2(T[j], rowShift) call in the spec.
+ transform1d_func(residual[i], is_identity ? row_shift : row_clamp_range);
+ if (!lossless && !is_identity && row_shift > 0) {
+ for (int j = 0; j < tx_width; ++j) {
+ residual[i][j] = RightShiftWithRounding(residual[i][j], row_shift);
+ }
+ }
+
+ ClampIntermediate<bitdepth, Residual>(residual[i], tx_width);
+ }
+ return;
+ }
+
+ assert(!is_row);
+ constexpr uint8_t column_shift = lossless ? 0 : kTransformColumnShift;
+ // This is the |range| parameter of the InverseTransform1dFunc. For lossy
+ // transforms, this will be equal to the clamping range.
+ const int8_t column_clamp_range = lossless ? 0 : std::max(bitdepth + 6, 16);
+ const bool flip_rows = transform1d_type == kTransform1dAdst &&
+ kTransformFlipRowsMask.Contains(tx_type);
+ const bool flip_columns =
+ !lossless && kTransformFlipColumnsMask.Contains(tx_type);
+ const int min_value = 0;
+ const int max_value = (1 << bitdepth) - 1;
+ // Note: 64 is the maximum size of a 1D transform buffer (the largest
+ // transform size is kTransformSize64x64).
+ Residual tx_buffer[64];
+ for (int j = 0; j < tx_width; ++j) {
+ const int flipped_j = flip_columns ? tx_width - j - 1 : j;
+ int i = 0;
+ do {
+ tx_buffer[i] = residual[i][flipped_j];
+ } while (++i != tx_height);
+ if (adjusted_tx_height == 1) {
+ dconly_transform1d(tx_buffer, column_clamp_range, false, 0, false);
+ } else {
+ // For identity transform, |transform1d_func| also performs the
+ // Round2(T[i], colShift) call in the spec.
+ transform1d_func(tx_buffer,
+ is_identity ? column_shift : column_clamp_range);
+ }
+ const int x = start_x + j;
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int index = flip_rows ? tx_height - i - 1 : i;
+ Residual residual_value = tx_buffer[index];
+ if (!lossless && !is_identity) {
+ residual_value = RightShiftWithRounding(residual_value, column_shift);
+ }
+ (*frame)[y][x] =
+ Clip3((*frame)[y][x] + residual_value, min_value, max_value);
+ }
+ }
+}
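+
+// Flow note (editorial): TransformLoop_C is instantiated twice per transform,
+// once with is_row = true and once with is_row = false. The row instance
+// rewrites |src_buffer| in place (dequantized coefficients become clamped
+// intermediate residuals); the column instance then reads that buffer and
+// adds the final residual into |dst_frame|.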
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+template <int bitdepth, typename Residual, typename Pixel>
+void InitAll(Dsp* const dsp) {
+ // Maximum transform size for Dct is 64.
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+ DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+ DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+ DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+ DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+ DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+ DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+ DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+ DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+ DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dDct,
+ DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>,
+ /*is_row=*/false>;
+
+ // Maximum transform size for Adst is 16.
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dAdst,
+ Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dAdst,
+ Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dAdst,
+ Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dAdst,
+ Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dAdst,
+ Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dAdst,
+ Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>,
+ /*is_row=*/false>;
+
+ // Maximum transform size for Identity transform is 32.
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+ Identity4DcOnly_C<bitdepth, Residual>,
+ Identity4Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+ Identity4DcOnly_C<bitdepth, Residual>,
+ Identity4Column_C<Residual>, /*is_row=*/false>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+ Identity8DcOnly_C<bitdepth, Residual>,
+ Identity8Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+ Identity8DcOnly_C<bitdepth, Residual>,
+ Identity8Column_C<Residual>, /*is_row=*/false>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+ Identity16DcOnly_C<bitdepth, Residual>,
+ Identity16Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+ Identity16DcOnly_C<bitdepth, Residual>,
+ Identity16Column_C<Residual>, /*is_row=*/false>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+ Identity32DcOnly_C<bitdepth, Residual>,
+ Identity32Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dIdentity,
+ Identity32DcOnly_C<bitdepth, Residual>,
+ Identity32Column_C<Residual>, /*is_row=*/false>;
+
+ // Maximum transform size for Wht is 4.
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dWht,
+ Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, kTransform1dWht,
+ Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>,
+ /*is_row=*/false>;
+}
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ InitAll<8, int16_t, uint8_t>(dsp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+ DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+ DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+ DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+ DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+ DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+ DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+ DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+ DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+ DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct,
+ DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst,
+ Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst,
+ Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst,
+ Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst,
+ Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst,
+ Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst,
+ Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+ Identity4DcOnly_C<8, int16_t>, Identity4Row_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+ Identity4DcOnly_C<8, int16_t>, Identity4Column_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+ Identity8DcOnly_C<8, int16_t>, Identity8Row_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+ Identity8DcOnly_C<8, int16_t>, Identity8Column_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+ Identity16DcOnly_C<8, int16_t>, Identity16Row_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+ Identity16DcOnly_C<8, int16_t>,
+ Identity16Column_C<int16_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+ Identity32DcOnly_C<8, int16_t>, Identity32Row_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity,
+ Identity32DcOnly_C<8, int16_t>,
+ Identity32Column_C<int16_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dWht,
+ Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, kTransform1dWht,
+ Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>,
+ /*is_row=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ InitAll<10, int32_t, uint16_t>(dsp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<10, int32_t, 2>, Dct_C<int32_t, 2>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<10, int32_t, 2>, Dct_C<int32_t, 2>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<10, int32_t, 3>, Dct_C<int32_t, 3>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<10, int32_t, 3>, Dct_C<int32_t, 3>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<10, int32_t, 4>, Dct_C<int32_t, 4>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<10, int32_t, 4>, Dct_C<int32_t, 4>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize32_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<10, int32_t, 5>, Dct_C<int32_t, 5>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<10, int32_t, 5>, Dct_C<int32_t, 5>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize64_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<10, int32_t, 6>, Dct_C<int32_t, 6>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<10, int32_t, 6>, Dct_C<int32_t, 6>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dAdst
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst,
+ Adst4DcOnly_C<10, int32_t>, Adst4_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst,
+ Adst4DcOnly_C<10, int32_t>, Adst4_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dAdst
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst,
+ Adst8DcOnly_C<10, int32_t>, Adst8_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst,
+ Adst8DcOnly_C<10, int32_t>, Adst8_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dAdst
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst,
+ Adst16DcOnly_C<10, int32_t>, Adst16_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst,
+ Adst16DcOnly_C<10, int32_t>, Adst16_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+ Identity4DcOnly_C<10, int32_t>, Identity4Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+ Identity4DcOnly_C<10, int32_t>,
+ Identity4Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+ Identity8DcOnly_C<10, int32_t>, Identity8Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+ Identity8DcOnly_C<10, int32_t>,
+ Identity8Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+ Identity16DcOnly_C<10, int32_t>, Identity16Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+ Identity16DcOnly_C<10, int32_t>,
+ Identity16Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize32_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+ Identity32DcOnly_C<10, int32_t>, Identity32Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity,
+ Identity32DcOnly_C<10, int32_t>,
+ Identity32Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dWht
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dWht,
+ Wht4DcOnly_C<10, int32_t>, Wht4_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, kTransform1dWht,
+ Wht4DcOnly_C<10, int32_t>, Wht4_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ InitAll<12, int32_t, uint16_t>(dsp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 2>, Dct_C<int32_t, 2>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 2>, Dct_C<int32_t, 2>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 3>, Dct_C<int32_t, 3>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 3>, Dct_C<int32_t, 3>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 4>, Dct_C<int32_t, 4>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 4>, Dct_C<int32_t, 4>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize32_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 5>, Dct_C<int32_t, 5>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 5>, Dct_C<int32_t, 5>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize64_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 6>, Dct_C<int32_t, 6>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 6>, Dct_C<int32_t, 6>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dAdst
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+ Adst4DcOnly_C<12, int32_t>, Adst4_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+ Adst4DcOnly_C<12, int32_t>, Adst4_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dAdst
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+ Adst8DcOnly_C<12, int32_t>, Adst8_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+ Adst8DcOnly_C<12, int32_t>, Adst8_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dAdst
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+ Adst16DcOnly_C<12, int32_t>, Adst16_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+ Adst16DcOnly_C<12, int32_t>, Adst16_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity4DcOnly_C<12, int32_t>, Identity4Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity4DcOnly_C<12, int32_t>,
+ Identity4Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity8DcOnly_C<12, int32_t>, Identity8Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity8DcOnly_C<12, int32_t>,
+ Identity8Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity16DcOnly_C<12, int32_t>, Identity16Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity16DcOnly_C<12, int32_t>,
+ Identity16Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize32_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity32DcOnly_C<12, int32_t>, Identity32Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity32DcOnly_C<12, int32_t>,
+ Identity32Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dWht
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dWht,
+ Wht4DcOnly_C<12, int32_t>, Wht4_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dWht,
+ Wht4DcOnly_C<12, int32_t>, Wht4_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+void InverseTransformInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+
+  // Local constants that may be unused depending on the optimizations
+  // available.
+ static_cast<void>(kBitReverseLookup);
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
+#define LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/inverse_transform_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/inverse_transform_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms. This function is not thread-safe.
+void InverseTransformInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for inverse transform implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
+
+// The value at index i is derived as: round(cos(pi * i / 128) * (1 << 12)).
+constexpr int16_t kCos128[65] = {
+ 4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973,
+ 3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564,
+ 3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896,
+ 2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019,
+ 1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995,
+ 897, 799, 700, 601, 501, 401, 301, 201, 101, 0};
+
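+// Spot check (illustrative): kCos128[32] corresponds to cos(pi / 4), and
+// round(0.70711 * 4096) = 2896, which matches the table entry above.
+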
+inline int16_t Cos128(int angle) {
+ angle &= 0xff;
+
+ // If |angle| is 128, this function returns -4096 (= -2^12), which will
+ // cause the 32-bit multiplications in ButterflyRotation() to overflow if
+ // dst[a] or dst[b] is -2^19 (a possible corner case when |range| is 20):
+ //
+ // (-2^12) * (-2^19) = 2^31, which cannot be represented as an int32_t.
+ //
+ // Note: |range| is 20 when bitdepth is 12 and a row transform is performed.
+ //
+ // Assert that this angle is never used by DCT or ADST.
+ assert(angle != 128);
+ if (angle <= 64) return kCos128[angle];
+ if (angle <= 128) return -kCos128[128 - angle];
+ if (angle <= 192) return -kCos128[angle - 128];
+ return kCos128[256 - angle];
+}
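+
+// For example, Cos128(160) takes the third branch above and returns
+// -kCos128[32] = -2896 = round(cos(160 * pi / 128) * (1 << 12)).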
+
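+// Sin128(angle) relies on sin(x) = cos(x - pi/2), with 64 representing pi/2
+// in this fixed-point angle scale. For example, Sin128(32) evaluates
+// Cos128(-32): -32 & 0xff is 224, which maps to kCos128[256 - 224] = 2896 =
+// round(sin(pi / 4) * (1 << 12)).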
+inline int16_t Sin128(int angle) { return Cos128(angle - 64); }
+
+// The value at index i is derived as:
+// round(sqrt(2) * sin((i + 1) * pi / 9) * 2 / 3 * (1 << 12)).
+constexpr int16_t kAdst4Multiplier[4] = {1321, 2482, 3344, 3803};
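+// For example, index 2: round(sqrt(2) * sin(3 * pi / 9) * 2 / 3 * (1 << 12))
+// = round(0.8165 * 4096) = 3344, the third entry above.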
+
+constexpr uint8_t kTransformRowShift[kNumTransformSizes] = {
+ 0, 0, 1, 0, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2};
+
+constexpr bool kShouldRound[kNumTransformSizes] = {
+ false, true, false, true, false, true, false, false, true, false,
+ true, false, false, true, false, true, false, true, false};
+
+constexpr int16_t kIdentity4Multiplier /* round(2^12 * sqrt(2)) */ = 0x16A1;
+constexpr int16_t kIdentity4MultiplierFraction /* round(2^12 * (sqrt(2) - 1))*/
+ = 0x6A1;
+constexpr int16_t kIdentity16Multiplier /* 2 * round(2^12 * sqrt(2)) */ = 11586;
+constexpr int16_t kTransformRowMultiplier /* round(2^12 / sqrt(2)) */ = 2896;
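+
+// A sketch (an assumption, not necessarily the library's exact code) of how
+// the fractional multiplier avoids overflow: x * sqrt(2) can be evaluated as
+//   x + RightShiftWithRounding(x * kIdentity4MultiplierFraction, 12)
+// which keeps the product smaller than x * kIdentity4Multiplier would be.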
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+
+const char* const kTransform1dSizeNames[kNumTransform1dSizes] = {
+ "kTransform1dSize4", "kTransform1dSize8", "kTransform1dSize16",
+ "kTransform1dSize32", "kTransform1dSize64"};
+
+constexpr Transform1dSize kRowTransform1dSizes[] = {
+ kTransform1dSize4, kTransform1dSize4, kTransform1dSize4,
+ kTransform1dSize8, kTransform1dSize8, kTransform1dSize8,
+ kTransform1dSize8, kTransform1dSize16, kTransform1dSize16,
+ kTransform1dSize16, kTransform1dSize16, kTransform1dSize16,
+ kTransform1dSize32, kTransform1dSize32, kTransform1dSize32,
+ kTransform1dSize32, kTransform1dSize64, kTransform1dSize64,
+ kTransform1dSize64};
+
+constexpr Transform1dSize kColTransform1dSizes[] = {
+ kTransform1dSize4, kTransform1dSize8, kTransform1dSize16,
+ kTransform1dSize4, kTransform1dSize8, kTransform1dSize16,
+ kTransform1dSize32, kTransform1dSize4, kTransform1dSize8,
+ kTransform1dSize16, kTransform1dSize32, kTransform1dSize64,
+ kTransform1dSize8, kTransform1dSize16, kTransform1dSize32,
+ kTransform1dSize64, kTransform1dSize16, kTransform1dSize32,
+ kTransform1dSize64};
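+// Both tables above are indexed by TransformSize; e.g. kTransformSize16x64
+// (index 11) uses a 16-point row transform and a 64-point column transform.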
+
+template <int bitdepth, typename SrcPixel, typename DstPixel>
+class InverseTransformTestBase : public testing::TestWithParam<TransformSize>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ InverseTransformTestBase() {
+ switch (tx_size_) {
+ case kNumTransformSizes:
+ EXPECT_NE(tx_size_, kNumTransformSizes);
+ break;
+ default:
+ block_width_ = kTransformWidth[tx_size_];
+ block_height_ = kTransformHeight[tx_size_];
+ break;
+ }
+ }
+
+ InverseTransformTestBase(const InverseTransformTestBase&) = delete;
+ InverseTransformTestBase& operator=(const InverseTransformTestBase&) = delete;
+ ~InverseTransformTestBase() override = default;
+
+ protected:
+ struct InverseTransformMem {
+ void Reset(libvpx_test::ACMRandom* rnd, int width, int height) {
+ ASSERT_NE(rnd, nullptr);
+ // Limit the residual values to bitdepth + sign bits in order to
+ // prevent out-of-range intermediate values in the transforms.
+ const int num_bits = bitdepth + 1;
+ const int sign_shift = (bitdepth == 8 ? 16 : 32) - num_bits;
+ const int mask = (1 << num_bits) - 1;
+ // Fill the residual with random data. Only the upper-left
+ // min(width, 32) x min(height, 32) region is filled; for 64-point
+ // transforms the remaining coefficients must be zero.
+ memset(ref_src, 0, sizeof(ref_src));
+ SrcPixel* r = ref_src;
+ const int stride = width;
+ for (int y = 0; y < std::min(height, 32); ++y) {
+ for (int x = 0; x < std::min(width, 32); ++x) {
+ r[x] = rnd->Rand16() & mask;
+ // The msb of num_bits is the sign bit, so force each value to the
+ // correct sign.
+ r[x] = (r[x] << sign_shift) >> sign_shift;
+ }
+ r += stride;
+ }
+
+ // Set frame data to random values.
+ for (int y = 0; y < kMaxBlockSize; ++y) {
+ for (int x = 0; x < kMaxBlockSize; ++x) {
+ const int mask = (1 << bitdepth) - 1;
+ cur_frame[y * kMaxBlockSize + x] = base_frame[y * kMaxBlockSize + x] =
+ rnd->Rand16() & mask;
+ }
+ }
+ }
+
+ // Set ref_src to |pixel|.
+ void Set(const SrcPixel pixel) {
+ for (auto& r : ref_src) r = pixel;
+ }
+
+ alignas(kMaxAlignment) DstPixel base_frame[kTotalPixels];
+ alignas(kMaxAlignment) DstPixel cur_frame[kTotalPixels];
+
+ alignas(kMaxAlignment) SrcPixel base_residual[kTotalPixels];
+ alignas(kMaxAlignment) SrcPixel cur_residual[kTotalPixels];
+
+ alignas(kMaxAlignment) SrcPixel ref_src[kTotalPixels];
+ };
+
+ void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+ const TransformSize tx_size_ = GetParam();
+ int block_width_;
+ int block_height_;
+ InverseTransformMem inverse_transform_mem_;
+};
+
+//------------------------------------------------------------------------------
+// InverseTransformTest
+
+template <int bitdepth, typename Pixel, typename DstPixel>
+class InverseTransformTest
+ : public InverseTransformTestBase<bitdepth, Pixel, DstPixel> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ InverseTransformTest() = default;
+ InverseTransformTest(const InverseTransformTest&) = delete;
+ InverseTransformTest& operator=(const InverseTransformTest&) = delete;
+ ~InverseTransformTest() override = default;
+
+ protected:
+ using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::tx_size_;
+ using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::block_width_;
+ using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::block_height_;
+ using InverseTransformTestBase<bitdepth, Pixel,
+ DstPixel>::inverse_transform_mem_;
+
+ void SetUp() override {
+ InverseTransformTestBase<bitdepth, Pixel, DstPixel>::SetUp();
+ InverseTransformInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+
+ tx_size_1d_row_ = kRowTransform1dSizes[tx_size_];
+ tx_size_1d_column_ = kColTransform1dSizes[tx_size_];
+
+ memcpy(base_inverse_transforms_, dsp->inverse_transforms,
+ sizeof(base_inverse_transforms_));
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ memset(base_inverse_transforms_, 0, sizeof(base_inverse_transforms_));
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ InverseTransformInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ InverseTransformInit_NEON();
+ InverseTransformInit10bpp_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ memcpy(cur_inverse_transforms_, dsp->inverse_transforms,
+ sizeof(cur_inverse_transforms_));
+
+ for (int i = 0; i < kNumTransform1ds; ++i) {
+ // Skip functions that haven't been specialized for this particular
+ // architecture.
+ if (cur_inverse_transforms_[i][tx_size_1d_row_][kRow] ==
+ base_inverse_transforms_[i][tx_size_1d_row_][kRow]) {
+ cur_inverse_transforms_[i][tx_size_1d_row_][kRow] = nullptr;
+ }
+ if (cur_inverse_transforms_[i][tx_size_1d_column_][kColumn] ==
+ base_inverse_transforms_[i][tx_size_1d_column_][kColumn]) {
+ cur_inverse_transforms_[i][tx_size_1d_column_][kColumn] = nullptr;
+ }
+ }
+
+ base_frame_buffer_.Reset(kMaxBlockSize, kMaxBlockSize,
+ inverse_transform_mem_.base_frame);
+
+ cur_frame_buffer_.Reset(kMaxBlockSize, kMaxBlockSize,
+ inverse_transform_mem_.cur_frame);
+ }
+
+ // These tests modify inverse_transform_mem_.
+ void TestRandomValues(int num_tests);
+ void TestDcOnlyRandomValue(int num_tests);
+
+ Array2DView<DstPixel> base_frame_buffer_;
+ Array2DView<DstPixel> cur_frame_buffer_;
+
+ Transform1dSize tx_size_1d_row_ = kTransform1dSize4;
+ Transform1dSize tx_size_1d_column_ = kTransform1dSize4;
+
+ InverseTransformAddFuncs base_inverse_transforms_;
+ InverseTransformAddFuncs cur_inverse_transforms_;
+};
+
+constexpr TransformType kLibgav1TxType[kNumTransformTypes] = {
+ kTransformTypeDctDct, kTransformTypeAdstDct,
+ kTransformTypeDctAdst, kTransformTypeAdstAdst,
+ kTransformTypeFlipadstDct, kTransformTypeDctFlipadst,
+ kTransformTypeFlipadstFlipadst, kTransformTypeAdstFlipadst,
+ kTransformTypeFlipadstAdst, kTransformTypeIdentityIdentity,
+ kTransformTypeIdentityDct, kTransformTypeDctIdentity,
+ kTransformTypeIdentityAdst, kTransformTypeAdstIdentity,
+ kTransformTypeIdentityFlipadst, kTransformTypeFlipadstIdentity};
+
+// Maps TransformType to dsp::Transform1d for the row transforms.
+constexpr Transform1d kRowTransform[kNumTransformTypes] = {
+ kTransform1dDct, kTransform1dAdst, kTransform1dDct,
+ kTransform1dAdst, kTransform1dAdst, kTransform1dDct,
+ kTransform1dAdst, kTransform1dAdst, kTransform1dAdst,
+ kTransform1dIdentity, kTransform1dIdentity, kTransform1dDct,
+ kTransform1dIdentity, kTransform1dAdst, kTransform1dIdentity,
+ kTransform1dAdst};
+
+// Maps TransformType to dsp::Transform1d for the column transforms.
+constexpr Transform1d kColumnTransform[kNumTransformTypes] = {
+ kTransform1dDct, kTransform1dDct, kTransform1dAdst,
+ kTransform1dAdst, kTransform1dDct, kTransform1dAdst,
+ kTransform1dAdst, kTransform1dAdst, kTransform1dAdst,
+ kTransform1dIdentity, kTransform1dDct, kTransform1dIdentity,
+ kTransform1dAdst, kTransform1dIdentity, kTransform1dAdst,
+ kTransform1dIdentity};
+
+// Mask indicating whether the transform sets contain a particular transform
+// type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set.
+constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = {
+ BitMaskSet(0x1), BitMaskSet(0xE0F), BitMaskSet(0x20F),
+ BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)};
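+// For example, BitMaskSet(0x201) covers bits 0 and 9, i.e.
+// kTransformTypeDctDct and kTransformTypeIdentityIdentity.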
+
+bool IsTxSizeTypeValid(TransformSize tx_size, TransformType tx_type) {
+ const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+ TransformSet tx_set;
+ if (tx_size_square_max > kTransformSize32x32) {
+ tx_set = kTransformSetDctOnly;
+ } else if (tx_size_square_max == kTransformSize32x32) {
+ tx_set = kTransformSetInter3;
+ } else if (tx_size_square_max == kTransformSize16x16) {
+ tx_set = kTransformSetInter2;
+ } else {
+ tx_set = kTransformSetInter1;
+ }
+ return kTransformTypeInSetMask[tx_set].Contains(tx_type);
+}
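+
+// For example, kTransformSize64x16 has square max kTransformSize64x64, which
+// selects kTransformSetDctOnly (mask 0x1) above, so only kTransformTypeDctDct
+// is treated as valid for 64-point transforms.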
+
+template <int bitdepth, typename Pixel, typename DstPixel>
+void InverseTransformTest<bitdepth, Pixel, DstPixel>::TestRandomValues(
+ int num_tests) {
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+
+ for (int tx_type_idx = -1; tx_type_idx < kNumTransformTypes; ++tx_type_idx) {
+ const TransformType tx_type = (tx_type_idx == -1)
+ ? kTransformTypeDctDct
+ : kLibgav1TxType[tx_type_idx];
+ const Transform1d row_transform =
+ (tx_type_idx == -1) ? kTransform1dWht : kRowTransform[tx_type];
+ const Transform1d column_transform =
+ (tx_type_idx == -1) ? kTransform1dWht : kColumnTransform[tx_type];
+
+ // Skip the 'C' test case as this is used as the reference.
+ if (base_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] ==
+ nullptr ||
+ cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] ==
+ nullptr ||
+ base_inverse_transforms_[column_transform][tx_size_1d_column_]
+ [kColumn] == nullptr ||
+ cur_inverse_transforms_[column_transform][tx_size_1d_column_]
+ [kColumn] == nullptr) {
+ continue;
+ }
+
+ // Only test valid tx_size for given tx_type. See 5.11.40.
+ if (!IsTxSizeTypeValid(tx_size_, tx_type)) continue;
+
+ absl::Duration base_elapsed_time[2];
+ absl::Duration cur_elapsed_time[2];
+
+ for (int n = 0; n < num_tests; ++n) {
+ const int tx_height = std::min(block_height_, 32);
+ const int start_x = 0;
+ const int start_y = 0;
+
+ inverse_transform_mem_.Reset(&rnd, block_width_, block_height_);
+ memcpy(inverse_transform_mem_.base_residual,
+ inverse_transform_mem_.ref_src,
+ sizeof(inverse_transform_mem_.ref_src));
+ memcpy(inverse_transform_mem_.cur_residual,
+ inverse_transform_mem_.ref_src,
+ sizeof(inverse_transform_mem_.ref_src));
+
+ const absl::Time base_row_start = absl::Now();
+ base_inverse_transforms_[row_transform][tx_size_1d_row_][kRow](
+ tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual,
+ start_x, start_y, &base_frame_buffer_);
+ base_elapsed_time[kRow] += absl::Now() - base_row_start;
+
+ const absl::Time cur_row_start = absl::Now();
+ cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow](
+ tx_type, tx_size_, tx_height, inverse_transform_mem_.cur_residual,
+ start_x, start_y, &cur_frame_buffer_);
+ cur_elapsed_time[kRow] += absl::Now() - cur_row_start;
+
+ const absl::Time base_column_start = absl::Now();
+ base_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn](
+ tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual,
+ start_x, start_y, &base_frame_buffer_);
+ base_elapsed_time[kColumn] += absl::Now() - base_column_start;
+
+ const absl::Time cur_column_start = absl::Now();
+ cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn](
+ tx_type, tx_size_, tx_height, inverse_transform_mem_.cur_residual,
+ start_x, start_y, &cur_frame_buffer_);
+ cur_elapsed_time[kColumn] += absl::Now() - cur_column_start;
+
+ if (!test_utils::CompareBlocks(inverse_transform_mem_.base_frame,
+ inverse_transform_mem_.cur_frame,
+ block_width_, block_height_, kMaxBlockSize,
+ kMaxBlockSize, false)) {
+ ADD_FAILURE() << "Result from optimized version of "
+ << ToString(
+ static_cast<Transform1dSize>(tx_size_1d_column_))
+ << " differs from reference in iteration #" << n
+ << " tx_type_idx:" << tx_type_idx;
+ break;
+ }
+ }
+
+ if (num_tests > 1) {
+ const auto base_row_elapsed_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time[kRow]));
+ const auto cur_row_elapsed_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(cur_elapsed_time[kRow]));
+ printf("TxType %30s[%19s]:: base_row: %5d us cur_row: %5d us %2.2fx \n",
+ (tx_type_idx == -1) ? ToString(row_transform) : ToString(tx_type),
+ kTransform1dSizeNames[tx_size_1d_row_], base_row_elapsed_time_us,
+ cur_row_elapsed_time_us,
+ static_cast<float>(base_row_elapsed_time_us) /
+ static_cast<float>(cur_row_elapsed_time_us));
+ const auto base_column_elapsed_time_us = static_cast<int>(
+ absl::ToInt64Microseconds(base_elapsed_time[kColumn]));
+ const auto cur_column_elapsed_time_us = static_cast<int>(
+ absl::ToInt64Microseconds(cur_elapsed_time[kColumn]));
+ printf(
+ "TxType %30s[%19s]:: base_col: %5d us cur_col: %5d us %2.2fx \n",
+ (tx_type_idx == -1) ? ToString(column_transform) : ToString(tx_type),
+ kTransform1dSizeNames[tx_size_1d_column_],
+ base_column_elapsed_time_us, cur_column_elapsed_time_us,
+ static_cast<float>(base_column_elapsed_time_us) /
+ static_cast<float>(cur_column_elapsed_time_us));
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel, typename DstPixel>
+void InverseTransformTest<bitdepth, Pixel, DstPixel>::TestDcOnlyRandomValue(
+ int num_tests) {
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+
+ for (int tx_type_idx = 0; tx_type_idx < kNumTransformTypes; ++tx_type_idx) {
+ const TransformType tx_type = kLibgav1TxType[tx_type_idx];
+ const Transform1d row_transform = kRowTransform[tx_type];
+ const Transform1d column_transform = kColumnTransform[tx_type];
+
+ if (cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] ==
+ nullptr ||
+ cur_inverse_transforms_[column_transform][tx_size_1d_column_]
+ [kColumn] == nullptr) {
+ continue;
+ }
+
+ // Only test valid tx_size for given tx_type. See 5.11.40.
+ if (!IsTxSizeTypeValid(tx_size_, tx_type)) continue;
+
+ absl::Duration base_elapsed_time[2];
+ absl::Duration cur_elapsed_time[2];
+
+ for (int n = 0; n < num_tests; ++n) {
+ const int tx_height = std::min(block_height_, 32);
+ const int start_x = 0;
+ const int start_y = 0;
+
+ // Using width == 1 and height == 1 will reset only the dc value.
+ inverse_transform_mem_.Reset(&rnd, 1, 1);
+ memcpy(inverse_transform_mem_.base_residual,
+ inverse_transform_mem_.ref_src,
+ sizeof(inverse_transform_mem_.ref_src));
+ memcpy(inverse_transform_mem_.cur_residual,
+ inverse_transform_mem_.ref_src,
+ sizeof(inverse_transform_mem_.ref_src));
+
+ // For this test, the "base" output is produced with tx_height set to
+ // the max for the given block size, while the "cur" output uses a
+ // passed-in tx_height of 1. The two outputs are then compared for a
+ // match.
+ const absl::Time base_row_start = absl::Now();
+ cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow](
+ tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual,
+ start_x, start_y, &base_frame_buffer_);
+ base_elapsed_time[kRow] += absl::Now() - base_row_start;
+
+ const absl::Time cur_row_start = absl::Now();
+ cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow](
+ tx_type, tx_size_, /*adjusted_tx_height=*/1,
+ inverse_transform_mem_.cur_residual, start_x, start_y,
+ &cur_frame_buffer_);
+ cur_elapsed_time[kRow] += absl::Now() - cur_row_start;
+
+ const absl::Time base_column_start = absl::Now();
+ cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn](
+ tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual,
+ start_x, start_y, &base_frame_buffer_);
+ base_elapsed_time[kColumn] += absl::Now() - base_column_start;
+
+ const absl::Time cur_column_start = absl::Now();
+ cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn](
+ tx_type, tx_size_, /*adjusted_tx_height=*/1,
+ inverse_transform_mem_.cur_residual, start_x, start_y,
+ &cur_frame_buffer_);
+ cur_elapsed_time[kColumn] += absl::Now() - cur_column_start;
+
+ if (!test_utils::CompareBlocks(inverse_transform_mem_.base_frame,
+ inverse_transform_mem_.cur_frame,
+ block_width_, block_height_, kMaxBlockSize,
+ kMaxBlockSize, false)) {
+ ADD_FAILURE() << "Result from dc only version of "
+ << ToString(
+ static_cast<Transform1dSize>(tx_size_1d_column_))
+ << " differs from reference in iteration #" << n
+ << "tx_type_idx:" << tx_type_idx;
+ break;
+ }
+ }
+
+ if (num_tests > 1) {
+ const auto base_row_elapsed_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time[kRow]));
+ const auto cur_row_elapsed_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(cur_elapsed_time[kRow]));
+ printf("TxType %30s[%19s]:: base_row: %5d us cur_row: %5d us %2.2fx \n",
+ ToString(tx_type), kTransform1dSizeNames[tx_size_1d_row_],
+ base_row_elapsed_time_us, cur_row_elapsed_time_us,
+ static_cast<float>(base_row_elapsed_time_us) /
+ static_cast<float>(cur_row_elapsed_time_us));
+ const auto base_column_elapsed_time_us = static_cast<int>(
+ absl::ToInt64Microseconds(base_elapsed_time[kColumn]));
+ const auto cur_column_elapsed_time_us = static_cast<int>(
+ absl::ToInt64Microseconds(cur_elapsed_time[kColumn]));
+ printf("TxType %30s[%19s]:: base_col: %5d us cur_col: %5d us %2.2fx \n",
+ ToString(tx_type), kTransform1dSizeNames[tx_size_1d_column_],
+ base_column_elapsed_time_us, cur_column_elapsed_time_us,
+ static_cast<float>(base_column_elapsed_time_us) /
+ static_cast<float>(cur_column_elapsed_time_us));
+ }
+ }
+}
+
+using InverseTransformTest8bpp = InverseTransformTest<8, int16_t, uint8_t>;
+
+TEST_P(InverseTransformTest8bpp, Random) { TestRandomValues(1); }
+
+TEST_P(InverseTransformTest8bpp, DISABLED_Speed) { TestRandomValues(10000); }
+
+TEST_P(InverseTransformTest8bpp, DcRandom) { TestDcOnlyRandomValue(1); }
+
+constexpr TransformSize kTransformSizesAll[] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+ kTransformSize64x64};
+
+INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest8bpp,
+ testing::ValuesIn(kTransformSizesAll));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, InverseTransformTest8bpp,
+ testing::ValuesIn(kTransformSizesAll));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, InverseTransformTest8bpp,
+ testing::ValuesIn(kTransformSizesAll));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using InverseTransformTest10bpp = InverseTransformTest<10, int32_t, uint16_t>;
+
+TEST_P(InverseTransformTest10bpp, Random) { TestRandomValues(1); }
+
+TEST_P(InverseTransformTest10bpp, DISABLED_Speed) { TestRandomValues(10000); }
+
+TEST_P(InverseTransformTest10bpp, DcRandom) { TestDcOnlyRandomValue(1); }
+
+INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest10bpp,
+ testing::ValuesIn(kTransformSizesAll));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, InverseTransformTest10bpp,
+ testing::ValuesIn(kTransformSizesAll));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using InverseTransformTest12bpp = InverseTransformTest<12, int32_t, uint16_t>;
+
+TEST_P(InverseTransformTest12bpp, Random) { TestRandomValues(1); }
+
+TEST_P(InverseTransformTest12bpp, DISABLED_Speed) { TestRandomValues(12000); }
+
+TEST_P(InverseTransformTest12bpp, DcRandom) { TestDcOnlyRandomValue(1); }
+
+INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest12bpp,
+ testing::ValuesIn(kTransformSizesAll));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize param) {
+ return os << ToString(param);
+}
+
+} // namespace libgav1
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_)
+ return()
+endif() # LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_
+set(LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_ 1)
+
+include("${libgav1_root}/cmake/libgav1_targets.cmake")
+
+list(APPEND libgav1_dsp_sources
+ "${libgav1_source}/dsp/average_blend.cc"
+ "${libgav1_source}/dsp/average_blend.h"
+ "${libgav1_source}/dsp/cdef.cc"
+ "${libgav1_source}/dsp/cdef.h"
+ "${libgav1_source}/dsp/cdef.inc"
+ "${libgav1_source}/dsp/common.h"
+ "${libgav1_source}/dsp/constants.cc"
+ "${libgav1_source}/dsp/constants.h"
+ "${libgav1_source}/dsp/convolve.cc"
+ "${libgav1_source}/dsp/convolve.h"
+ "${libgav1_source}/dsp/convolve.inc"
+ "${libgav1_source}/dsp/distance_weighted_blend.cc"
+ "${libgav1_source}/dsp/distance_weighted_blend.h"
+ "${libgav1_source}/dsp/dsp.cc"
+ "${libgav1_source}/dsp/dsp.h"
+ "${libgav1_source}/dsp/film_grain.cc"
+ "${libgav1_source}/dsp/film_grain.h"
+ "${libgav1_source}/dsp/film_grain_common.h"
+ "${libgav1_source}/dsp/intra_edge.cc"
+ "${libgav1_source}/dsp/intra_edge.h"
+ "${libgav1_source}/dsp/intrapred_cfl.cc"
+ "${libgav1_source}/dsp/intrapred_cfl.h"
+ "${libgav1_source}/dsp/intrapred_directional.cc"
+ "${libgav1_source}/dsp/intrapred_directional.h"
+ "${libgav1_source}/dsp/intrapred_filter.cc"
+ "${libgav1_source}/dsp/intrapred_filter.h"
+ "${libgav1_source}/dsp/intrapred.cc"
+ "${libgav1_source}/dsp/intrapred.h"
+ "${libgav1_source}/dsp/intrapred_smooth.cc"
+ "${libgav1_source}/dsp/intrapred_smooth.h"
+ "${libgav1_source}/dsp/inverse_transform.cc"
+ "${libgav1_source}/dsp/inverse_transform.h"
+ "${libgav1_source}/dsp/inverse_transform.inc"
+ "${libgav1_source}/dsp/loop_filter.cc"
+ "${libgav1_source}/dsp/loop_filter.h"
+ "${libgav1_source}/dsp/loop_restoration.cc"
+ "${libgav1_source}/dsp/loop_restoration.h"
+ "${libgav1_source}/dsp/mask_blend.cc"
+ "${libgav1_source}/dsp/mask_blend.h"
+ "${libgav1_source}/dsp/motion_field_projection.cc"
+ "${libgav1_source}/dsp/motion_field_projection.h"
+ "${libgav1_source}/dsp/motion_vector_search.cc"
+ "${libgav1_source}/dsp/motion_vector_search.h"
+ "${libgav1_source}/dsp/obmc.cc"
+ "${libgav1_source}/dsp/obmc.h"
+ "${libgav1_source}/dsp/obmc.inc"
+ "${libgav1_source}/dsp/smooth_weights.inc"
+ "${libgav1_source}/dsp/super_res.cc"
+ "${libgav1_source}/dsp/super_res.h"
+ "${libgav1_source}/dsp/warp.cc"
+ "${libgav1_source}/dsp/warp.h"
+ "${libgav1_source}/dsp/weight_mask.cc"
+ "${libgav1_source}/dsp/weight_mask.h")
+
+list(APPEND libgav1_dsp_sources_avx2
+ ${libgav1_dsp_sources_avx2}
+ "${libgav1_source}/dsp/x86/cdef_avx2.cc"
+ "${libgav1_source}/dsp/x86/cdef_avx2.h"
+ "${libgav1_source}/dsp/x86/convolve_avx2.cc"
+ "${libgav1_source}/dsp/x86/convolve_avx2.h"
+ "${libgav1_source}/dsp/x86/loop_restoration_10bit_avx2.cc"
+ "${libgav1_source}/dsp/x86/loop_restoration_avx2.cc"
+ "${libgav1_source}/dsp/x86/loop_restoration_avx2.h")
+
+list(APPEND libgav1_dsp_sources_neon
+ ${libgav1_dsp_sources_neon}
+ "${libgav1_source}/dsp/arm/average_blend_neon.cc"
+ "${libgav1_source}/dsp/arm/average_blend_neon.h"
+ "${libgav1_source}/dsp/arm/cdef_neon.cc"
+ "${libgav1_source}/dsp/arm/cdef_neon.h"
+ "${libgav1_source}/dsp/arm/common_neon.h"
+ "${libgav1_source}/dsp/arm/convolve_10bit_neon.cc"
+ "${libgav1_source}/dsp/arm/convolve_neon.cc"
+ "${libgav1_source}/dsp/arm/convolve_neon.h"
+ "${libgav1_source}/dsp/arm/distance_weighted_blend_neon.cc"
+ "${libgav1_source}/dsp/arm/distance_weighted_blend_neon.h"
+ "${libgav1_source}/dsp/arm/film_grain_neon.cc"
+ "${libgav1_source}/dsp/arm/film_grain_neon.h"
+ "${libgav1_source}/dsp/arm/intra_edge_neon.cc"
+ "${libgav1_source}/dsp/arm/intra_edge_neon.h"
+ "${libgav1_source}/dsp/arm/intrapred_cfl_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_cfl_neon.h"
+ "${libgav1_source}/dsp/arm/intrapred_directional_neon.h"
+ "${libgav1_source}/dsp/arm/intrapred_directional_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_filter_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_filter_neon.h"
+ "${libgav1_source}/dsp/arm/intrapred_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_neon.h"
+ "${libgav1_source}/dsp/arm/intrapred_smooth_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_smooth_neon.h"
+ "${libgav1_source}/dsp/arm/inverse_transform_10bit_neon.cc"
+ "${libgav1_source}/dsp/arm/inverse_transform_neon.cc"
+ "${libgav1_source}/dsp/arm/inverse_transform_neon.h"
+ "${libgav1_source}/dsp/arm/loop_filter_10bit_neon.cc"
+ "${libgav1_source}/dsp/arm/loop_filter_neon.cc"
+ "${libgav1_source}/dsp/arm/loop_filter_neon.h"
+ "${libgav1_source}/dsp/arm/loop_restoration_10bit_neon.cc"
+ "${libgav1_source}/dsp/arm/loop_restoration_neon.cc"
+ "${libgav1_source}/dsp/arm/loop_restoration_neon.h"
+ "${libgav1_source}/dsp/arm/mask_blend_neon.cc"
+ "${libgav1_source}/dsp/arm/mask_blend_neon.h"
+ "${libgav1_source}/dsp/arm/motion_field_projection_neon.cc"
+ "${libgav1_source}/dsp/arm/motion_field_projection_neon.h"
+ "${libgav1_source}/dsp/arm/motion_vector_search_neon.cc"
+ "${libgav1_source}/dsp/arm/motion_vector_search_neon.h"
+ "${libgav1_source}/dsp/arm/obmc_neon.cc"
+ "${libgav1_source}/dsp/arm/obmc_neon.h"
+ "${libgav1_source}/dsp/arm/super_res_neon.cc"
+ "${libgav1_source}/dsp/arm/super_res_neon.h"
+ "${libgav1_source}/dsp/arm/warp_neon.cc"
+ "${libgav1_source}/dsp/arm/warp_neon.h"
+ "${libgav1_source}/dsp/arm/weight_mask_neon.cc"
+ "${libgav1_source}/dsp/arm/weight_mask_neon.h")
+
+list(APPEND libgav1_dsp_sources_sse4
+ ${libgav1_dsp_sources_sse4}
+ "${libgav1_source}/dsp/x86/average_blend_sse4.cc"
+ "${libgav1_source}/dsp/x86/average_blend_sse4.h"
+ "${libgav1_source}/dsp/x86/common_sse4.h"
+ "${libgav1_source}/dsp/x86/cdef_sse4.cc"
+ "${libgav1_source}/dsp/x86/cdef_sse4.h"
+ "${libgav1_source}/dsp/x86/convolve_sse4.cc"
+ "${libgav1_source}/dsp/x86/convolve_sse4.h"
+ "${libgav1_source}/dsp/x86/convolve_sse4.inc"
+ "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.cc"
+ "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.h"
+ "${libgav1_source}/dsp/x86/film_grain_sse4.cc"
+ "${libgav1_source}/dsp/x86/film_grain_sse4.h"
+ "${libgav1_source}/dsp/x86/intra_edge_sse4.cc"
+ "${libgav1_source}/dsp/x86/intra_edge_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_directional_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_directional_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_filter_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_filter_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.h"
+ "${libgav1_source}/dsp/x86/inverse_transform_sse4.cc"
+ "${libgav1_source}/dsp/x86/inverse_transform_sse4.h"
+ "${libgav1_source}/dsp/x86/loop_filter_sse4.cc"
+ "${libgav1_source}/dsp/x86/loop_filter_sse4.h"
+ "${libgav1_source}/dsp/x86/loop_restoration_10bit_sse4.cc"
+ "${libgav1_source}/dsp/x86/loop_restoration_sse4.cc"
+ "${libgav1_source}/dsp/x86/loop_restoration_sse4.h"
+ "${libgav1_source}/dsp/x86/mask_blend_sse4.cc"
+ "${libgav1_source}/dsp/x86/mask_blend_sse4.h"
+ "${libgav1_source}/dsp/x86/motion_field_projection_sse4.cc"
+ "${libgav1_source}/dsp/x86/motion_field_projection_sse4.h"
+ "${libgav1_source}/dsp/x86/motion_vector_search_sse4.cc"
+ "${libgav1_source}/dsp/x86/motion_vector_search_sse4.h"
+ "${libgav1_source}/dsp/x86/obmc_sse4.cc"
+ "${libgav1_source}/dsp/x86/obmc_sse4.h"
+ "${libgav1_source}/dsp/x86/super_res_sse4.cc"
+ "${libgav1_source}/dsp/x86/super_res_sse4.h"
+ "${libgav1_source}/dsp/x86/transpose_sse4.h"
+ "${libgav1_source}/dsp/x86/warp_sse4.cc"
+ "${libgav1_source}/dsp/x86/warp_sse4.h"
+ "${libgav1_source}/dsp/x86/weight_mask_sse4.cc"
+ "${libgav1_source}/dsp/x86/weight_mask_sse4.h")
+
+macro(libgav1_add_dsp_targets)
+ unset(dsp_sources)
+ list(APPEND dsp_sources ${libgav1_dsp_sources}
+ ${libgav1_dsp_sources_neon}
+ ${libgav1_dsp_sources_avx2}
+ ${libgav1_dsp_sources_sse4})
+
+ libgav1_add_library(NAME
+ libgav1_dsp
+ TYPE
+ OBJECT
+ SOURCES
+ ${dsp_sources}
+ DEFINES
+ ${libgav1_defines}
+ $<$<CONFIG:Debug>:LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS>
+ INCLUDES
+ ${libgav1_include_paths})
+endmacro()
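+
+# A hypothetical call site (illustrative only): a top-level listfile that has
+# included this file would invoke
+#   libgav1_add_dsp_targets()
+# to create the libgav1_dsp OBJECT library from the source lists above.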
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// 7.14.6.1.
+template <int bitdepth, typename Pixel>
+struct LoopFilterFuncs_C {
+ LoopFilterFuncs_C() = delete;
+
+ static constexpr int kMaxPixel = (1 << bitdepth) - 1;
+ static constexpr int kMinSignedPixel = -(1 << (bitdepth - 1));
+ static constexpr int kMaxSignedPixel = (1 << (bitdepth - 1)) - 1;
+ static constexpr int kFlatThresh = 1 << (bitdepth - 8);
+
+ static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+};
+
+inline void AdjustThresholds(const int bitdepth, int* const outer_thresh,
+ int* const inner_thresh, int* const hev_thresh) {
+ assert(*outer_thresh >= 7 && *outer_thresh <= 3 * kMaxLoopFilterValue + 4);
+ assert(*inner_thresh >= 1 && *inner_thresh <= kMaxLoopFilterValue);
+ assert(*hev_thresh >= 0 && *hev_thresh <= 3);
+ *outer_thresh <<= bitdepth - 8;
+ *inner_thresh <<= bitdepth - 8;
+ *hev_thresh <<= bitdepth - 8;
+}
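+
+// For example, at 10 bpp an 8-bit outer_thresh of 28 is scaled to
+// 28 << 2 = 112 so comparisons happen in the 10-bit pixel domain.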
+
+//------------------------------------------------------------------------------
+// 4-tap filters
+
+// 7.14.6.2.
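+// The outer check weighs the edge difference |p0 - q0| at 2x and the
+// second-pixel difference |p1 - q1| at 1/2 when comparing to outer_thresh.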
+template <typename Pixel>
+inline bool NeedsFilter4(const Pixel* p, ptrdiff_t step, int outer_thresh,
+ int inner_thresh) {
+ const int p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step];
+ return std::abs(p1 - p0) <= inner_thresh &&
+ std::abs(q1 - q0) <= inner_thresh &&
+ std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool Hev(const Pixel* p, ptrdiff_t step, int thresh) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ return (std::abs(p1 - p0) > thresh) || (std::abs(q1 - q0) > thresh);
+}
+
+// 7.14.6.3.
+// 4 pixels in, 2 pixels out.
+template <int bitdepth, typename Pixel>
+inline void Filter2_C(Pixel* p, ptrdiff_t step) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ const int min_signed_val =
+ LoopFilterFuncs_C<bitdepth, Pixel>::kMinSignedPixel;
+ const int max_signed_val =
+ LoopFilterFuncs_C<bitdepth, Pixel>::kMaxSignedPixel;
+ // 8bpp: [-893,892], 10bpp: [-3581,3580], 12bpp: [-14333,14332]
+ const int a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+ // 8bpp: [-16,15], 10bpp: [-64,63], 12bpp: [-256,255]
+ const int a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+ const int a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+ const int max_unsigned_val = LoopFilterFuncs_C<bitdepth, Pixel>::kMaxPixel;
+ p[-step] = Clip3(p0 + a2, 0, max_unsigned_val);
+ p[0] = Clip3(q0 - a1, 0, max_unsigned_val);
+}
+
+// 7.14.6.3.
+// 4 pixels in, 4 pixels out.
+template <int bitdepth, typename Pixel>
+inline void Filter4_C(Pixel* p, ptrdiff_t step) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ const int a = 3 * (q0 - p0);
+ const int min_signed_val =
+ LoopFilterFuncs_C<bitdepth, Pixel>::kMinSignedPixel;
+ const int max_signed_val =
+ LoopFilterFuncs_C<bitdepth, Pixel>::kMaxSignedPixel;
+ const int a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+ const int a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+ const int a3 = (a1 + 1) >> 1;
+ const int max_unsigned_val = LoopFilterFuncs_C<bitdepth, Pixel>::kMaxPixel;
+ p[-2 * step] = Clip3(p1 + a3, 0, max_unsigned_val);
+ p[-1 * step] = Clip3(p0 + a2, 0, max_unsigned_val);
+ p[0 * step] = Clip3(q0 - a1, 0, max_unsigned_val);
+ p[1 * step] = Clip3(q1 - a3, 0, max_unsigned_val);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical4(void* dest, ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter4(dst, 1, outer_thresh, inner_thresh)) {
+ if (Hev(dst, 1, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, 1);
+ } else {
+ Filter4_C<bitdepth>(dst, 1);
+ }
+ }
+ dst += stride;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal4(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter4(dst, stride, outer_thresh, inner_thresh)) {
+ if (Hev(dst, stride, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, stride);
+ } else {
+ Filter4_C<bitdepth>(dst, stride);
+ }
+ }
+ ++dst;
+ }
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter6(const Pixel* p, ptrdiff_t step, int outer_thresh,
+ int inner_thresh) {
+ const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+ return std::abs(p2 - p1) <= inner_thresh &&
+ std::abs(p1 - p0) <= inner_thresh &&
+ std::abs(q1 - q0) <= inner_thresh &&
+ std::abs(q2 - q1) <= inner_thresh &&
+ std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlat3(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+ const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+ return std::abs(p1 - p0) <= flat_thresh && std::abs(q1 - q0) <= flat_thresh &&
+ std::abs(p2 - p0) <= flat_thresh && std::abs(q2 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter6(int filter_value) {
+ return static_cast<Pixel>(RightShiftWithRounding(filter_value, 3));
+}
+
+// 7.14.6.4.
+// 6 pixels in, 4 pixels out.
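+// The filter taps sum to 8, matching the rounding shift of 3 in ApplyFilter6.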
+template <typename Pixel>
+inline void Filter6_C(Pixel* p, ptrdiff_t step) {
+ const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+ const int a1 = 2 * p1;
+ const int a0 = 2 * p0;
+ const int b0 = 2 * q0;
+ const int b1 = 2 * q1;
+ // The max is 8 * max_pixel + 4 for the rounder.
+ // 8bpp: 2044 (11 bits), 10bpp: 8188 (13 bits), 12bpp: 32764 (15 bits)
+ p[-2 * step] = ApplyFilter6<Pixel>(3 * p2 + a1 + a0 + q0);
+ p[-1 * step] = ApplyFilter6<Pixel>(p2 + a1 + a0 + b0 + q1);
+ p[0 * step] = ApplyFilter6<Pixel>(p1 + a0 + b0 + b1 + q2);
+ p[1 * step] = ApplyFilter6<Pixel>(p0 + b0 + b1 + 3 * q2);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical6(void* dest, ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter6(dst, 1, outer_thresh, inner_thresh)) {
+ if (IsFlat3(dst, 1, flat_thresh)) {
+ Filter6_C(dst, 1);
+ } else if (Hev(dst, 1, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, 1);
+ } else {
+ Filter4_C<bitdepth>(dst, 1);
+ }
+ }
+ dst += stride;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal6(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter6(dst, stride, outer_thresh, inner_thresh)) {
+ if (IsFlat3(dst, stride, flat_thresh)) {
+ Filter6_C(dst, stride);
+ } else if (Hev(dst, stride, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, stride);
+ } else {
+ Filter4_C<bitdepth>(dst, stride);
+ }
+ }
+ ++dst;
+ }
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter8(const Pixel* p, ptrdiff_t step, int outer_thresh,
+ int inner_thresh) {
+ const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+ return std::abs(p3 - p2) <= inner_thresh &&
+ std::abs(p2 - p1) <= inner_thresh &&
+ std::abs(p1 - p0) <= inner_thresh &&
+ std::abs(q1 - q0) <= inner_thresh &&
+ std::abs(q2 - q1) <= inner_thresh &&
+ std::abs(q3 - q2) <= inner_thresh &&
+ std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlat4(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+ const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+ return std::abs(p1 - p0) <= flat_thresh && std::abs(q1 - q0) <= flat_thresh &&
+ std::abs(p2 - p0) <= flat_thresh && std::abs(q2 - q0) <= flat_thresh &&
+ std::abs(p3 - p0) <= flat_thresh && std::abs(q3 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter8(int filter_value) {
+ return static_cast<Pixel>(RightShiftWithRounding(filter_value, 3));
+}
+
+// 7.14.6.4.
+// 8 pixels in, 6 pixels out.
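+// The filter taps sum to 8, matching the rounding shift of 3 in ApplyFilter8.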
+template <typename Pixel>
+inline void Filter8_C(Pixel* p, ptrdiff_t step) {
+ const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+ // The max is 8 * max_pixel + 4 for the rounder.
+ // 8bpp: 2044 (11 bits), 10bpp: 8188 (13 bits), 12bpp: 32764 (15 bits)
+ p[-3 * step] = ApplyFilter8<Pixel>(3 * p3 + 2 * p2 + p1 + p0 + q0);
+ p[-2 * step] = ApplyFilter8<Pixel>(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1);
+ p[-1 * step] = ApplyFilter8<Pixel>(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2);
+ p[0 * step] = ApplyFilter8<Pixel>(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3);
+ p[1 * step] = ApplyFilter8<Pixel>(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3);
+ p[2 * step] = ApplyFilter8<Pixel>(p0 + q0 + q1 + 2 * q2 + 3 * q3);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical8(void* dest, ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter8(dst, 1, outer_thresh, inner_thresh)) {
+ if (IsFlat4(dst, 1, flat_thresh)) {
+ Filter8_C(dst, 1);
+ } else if (Hev(dst, 1, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, 1);
+ } else {
+ Filter4_C<bitdepth>(dst, 1);
+ }
+ }
+ dst += stride;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal8(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter8(dst, stride, outer_thresh, inner_thresh)) {
+ if (IsFlat4(dst, stride, flat_thresh)) {
+ Filter8_C(dst, stride);
+ } else if (Hev(dst, stride, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, stride);
+ } else {
+ Filter4_C<bitdepth>(dst, stride);
+ }
+ }
+ ++dst;
+ }
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlatOuter4(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+ const int p6 = p[-7 * step], p5 = p[-6 * step], p4 = p[-5 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q4 = p[4 * step], q5 = p[5 * step], q6 = p[6 * step];
+ return std::abs(p4 - p0) <= flat_thresh && std::abs(q4 - q0) <= flat_thresh &&
+ std::abs(p5 - p0) <= flat_thresh && std::abs(q5 - q0) <= flat_thresh &&
+ std::abs(p6 - p0) <= flat_thresh && std::abs(q6 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter14(int filter_value) {
+ return static_cast<Pixel>(RightShiftWithRounding(filter_value, 4));
+}
+
+// 7.14.6.4.
+// 14 pixels in, 12 pixels out.
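+// The filter taps sum to 16, matching the rounding shift of 4 in
+// ApplyFilter14.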
+template <typename Pixel>
+inline void Filter14_C(Pixel* p, ptrdiff_t step) {
+ const int p6 = p[-7 * step], p5 = p[-6 * step], p4 = p[-5 * step],
+ p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+ p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step],
+ q4 = p[4 * step], q5 = p[5 * step], q6 = p[6 * step];
+ // The max is 16 * max_pixel + 8 for the rounder.
+ // 8bpp: 4088 (12 bits), 10bpp: 16376 (14 bits), 12bpp: 65528 (16 bits)
+ p[-6 * step] =
+ ApplyFilter14<Pixel>(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0);
+ p[-5 * step] = ApplyFilter14<Pixel>(p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 +
+ p1 + p0 + q0 + q1);
+ p[-4 * step] = ApplyFilter14<Pixel>(p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 +
+ p1 + p0 + q0 + q1 + q2);
+ p[-3 * step] = ApplyFilter14<Pixel>(p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 +
+ p1 * 2 + p0 + q0 + q1 + q2 + q3);
+ p[-2 * step] = ApplyFilter14<Pixel>(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
+ p0 * 2 + q0 + q1 + q2 + q3 + q4);
+ p[-1 * step] = ApplyFilter14<Pixel>(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+ q0 * 2 + q1 + q2 + q3 + q4 + q5);
+ p[0 * step] = ApplyFilter14<Pixel>(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+ q1 * 2 + q2 + q3 + q4 + q5 + q6);
+ p[1 * step] = ApplyFilter14<Pixel>(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+ q2 * 2 + q3 + q4 + q5 + q6 * 2);
+ p[2 * step] = ApplyFilter14<Pixel>(p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
+ q3 * 2 + q4 + q5 + q6 * 3);
+ p[3 * step] = ApplyFilter14<Pixel>(p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
+ q4 * 2 + q5 + q6 * 4);
+ p[4 * step] = ApplyFilter14<Pixel>(p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
+ q5 * 2 + q6 * 5);
+ p[5 * step] =
+ ApplyFilter14<Pixel>(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical14(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter8(dst, 1, outer_thresh, inner_thresh)) {
+ if (IsFlat4(dst, 1, flat_thresh)) {
+ if (IsFlatOuter4(dst, 1, flat_thresh)) {
+ Filter14_C(dst, 1);
+ } else {
+ Filter8_C(dst, 1);
+ }
+ } else if (Hev(dst, 1, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, 1);
+ } else {
+ Filter4_C<bitdepth>(dst, 1);
+ }
+ }
+ dst += stride;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal14(void* dest,
+ ptrdiff_t stride,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+ AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ for (int i = 0; i < 4; ++i) {
+ if (NeedsFilter8(dst, stride, outer_thresh, inner_thresh)) {
+ if (IsFlat4(dst, stride, flat_thresh)) {
+ if (IsFlatOuter4(dst, stride, flat_thresh)) {
+ Filter14_C(dst, stride);
+ } else {
+ Filter8_C(dst, stride);
+ }
+ } else if (Hev(dst, stride, hev_thresh)) {
+ Filter2_C<bitdepth>(dst, stride);
+ } else {
+ Filter4_C<bitdepth>(dst, stride);
+ }
+ }
+ ++dst;
+ }
+}
+
+using Defs8bpp = LoopFilterFuncs_C<8, uint8_t>;
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal4;
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical4;
+
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal6;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical6;
+
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal8;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical8;
+
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal14;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical14;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs8bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs8bpp::Vertical14;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using Defs10bpp = LoopFilterFuncs_C<10, uint16_t>;
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal4;
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical4;
+
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal6;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical6;
+
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal8;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical8;
+
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal14;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical14;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical14;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using Defs12bpp = LoopFilterFuncs_C<12, uint16_t>;
+
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal4;
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical4;
+
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal6;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical6;
+
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal8;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical8;
+
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal14;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical14;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize4_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize6_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize8_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize14_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical14;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+void LoopFilterInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+ // Reference local functions that may be unused depending on the
+ // optimizations available, to silence unused-function warnings.
+ static_cast<void>(AdjustThresholds);
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_LOOP_FILTER_H_
+#define LIBGAV1_SRC_DSP_LOOP_FILTER_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
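+//
+// For example (a sketch of the convention; the exact macro names come from the
+// corresponding SIMD headers), a NEON build may provide:
+//   #define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+//       LIBGAV1_CPU_NEON
+// in which case the C initializer skips installing its version of that entry.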
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/loop_filter_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc. The
+// order of includes is important as each tests for a superior version before
+// setting the base.
+// clang-format off
+#include "src/dsp/x86/loop_filter_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters. This function is not thread-safe.
+void LoopFilterInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_LOOP_FILTER_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Horizontal and Vertical need 32x32: 8 pixels preceding the filtered section,
+// 16 pixels within the filtered section, and 8 pixels following it.
+constexpr int kNumPixels = 1024;
+constexpr int kBlockStride = 32;
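+// The filters are thus invoked at dst + 8 + kBlockStride * 8, i.e. at the
+// start of the inner 16x16 section of the 32x32 block.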
+
+constexpr int kNumTests = 50000;
+constexpr int kNumSpeedTests = 500000;
+
+template <typename Pixel>
+void InitInput(Pixel* dst, const int stride, const int bitdepth,
+ libvpx_test::ACMRandom& rnd, const uint8_t inner_thresh,
+ const bool transpose) {
+ const int max_pixel = (1 << bitdepth) - 1;
+ const int pixel_range = max_pixel + 1;
+ Pixel tmp[kNumPixels];
+ auto clip_pixel = [max_pixel](int val) {
+ return static_cast<Pixel>(std::max(std::min(val, max_pixel), 0));
+ };
+
+ for (int i = 0; i < kNumPixels;) {
+ const uint8_t val = rnd.Rand8();
+ if (val & 0x80) { // 50% chance to choose a new value.
+ tmp[i++] = rnd(pixel_range);
+ } else { // 50% chance to repeat previous value in row X times.
+ int j = 0;
+ while (j++ < ((val & 0x1f) + 1) && i < kNumPixels) {
+ if (i < 1) {
+ tmp[i] = rnd(pixel_range);
+ } else if (val & 0x20) { // Increment by a value within the limit.
+ tmp[i] = clip_pixel(tmp[i - 1] + (inner_thresh - 1));
+ } else { // Decrement by a value within the limit.
+ tmp[i] = clip_pixel(tmp[i - 1] - (inner_thresh - 1));
+ }
+ ++i;
+ }
+ }
+ }
+
+ for (int i = 0; i < kNumPixels;) {
+ const uint8_t val = rnd.Rand8();
+ if (val & 0x80) {
+ ++i;
+ } else { // 50% chance to repeat previous value in column X times.
+ int j = 0;
+ while (j++ < ((val & 0x1f) + 1) && i < kNumPixels) {
+ if (i < 1) {
+ tmp[i] = rnd(pixel_range);
+ } else if (val & 0x20) { // Increment by a value within the limit.
+ tmp[(i % 32) * 32 + i / 32] = clip_pixel(
+ tmp[((i - 1) % 32) * 32 + (i - 1) / 32] + (inner_thresh - 1));
+ } else { // Decrement by a value within the inner_thresh.
+ tmp[(i % 32) * 32 + i / 32] = clip_pixel(
+ tmp[((i - 1) % 32) * 32 + (i - 1) / 32] - (inner_thresh - 1));
+ }
+ ++i;
+ }
+ }
+ }
+
+ for (int i = 0; i < kNumPixels; ++i) {
+ const int offset = transpose ? stride * (i % stride) + i / stride : i;
+ dst[i] = tmp[offset];
+ }
+}
+
+template <int bitdepth, typename Pixel>
+class LoopFilterTest : public testing::TestWithParam<LoopFilterSize> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ LoopFilterTest() = default;
+ LoopFilterTest(const LoopFilterTest&) = delete;
+ LoopFilterTest& operator=(const LoopFilterTest&) = delete;
+ ~LoopFilterTest() override = default;
+
+ protected:
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ LoopFilterInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ memcpy(base_loop_filters_, dsp->loop_filters[size_],
+ sizeof(base_loop_filters_));
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ memset(base_loop_filters_, 0, sizeof(base_loop_filters_));
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ LoopFilterInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ LoopFilterInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopFilterInit10bpp_NEON();
+#endif
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ memcpy(cur_loop_filters_, dsp->loop_filters[size_],
+ sizeof(cur_loop_filters_));
+
+ for (int i = 0; i < kNumLoopFilterTypes; ++i) {
+      // Skip functions that haven't been specialized for this particular
+      // architecture.
+ if (cur_loop_filters_[i] == base_loop_filters_[i]) {
+ cur_loop_filters_[i] = nullptr;
+ }
+ }
+ }
+
+  // Check |digests| if non-NULL, otherwise print the filter timing.
+ void TestRandomValues(const char* const digests[kNumLoopFilterTypes],
+ int num_runs) const;
+ void TestSaturatedValues() const;
+
+ const LoopFilterSize size_ = GetParam();
+ LoopFilterFunc base_loop_filters_[kNumLoopFilterTypes];
+ LoopFilterFunc cur_loop_filters_[kNumLoopFilterTypes];
+};
+
+template <int bitdepth, typename Pixel>
+void LoopFilterTest<bitdepth, Pixel>::TestRandomValues(
+ const char* const digests[kNumLoopFilterTypes], const int num_runs) const {
+ for (int i = 0; i < kNumLoopFilterTypes; ++i) {
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ if (cur_loop_filters_[i] == nullptr) continue;
+
+ libvpx_test::MD5 md5_digest;
+ absl::Duration elapsed_time;
+ for (int n = 0; n < num_runs; ++n) {
+ Pixel dst[kNumPixels];
+ const auto outer_thresh = static_cast<uint8_t>(
+ rnd(3 * kMaxLoopFilterValue - 2) + 7); // [7, 193].
+ const auto inner_thresh =
+ static_cast<uint8_t>(rnd(kMaxLoopFilterValue) + 1); // [1, 63].
+ const auto hev_thresh =
+ static_cast<uint8_t>(rnd(kMaxLoopFilterValue + 1) >> 4); // [0, 3].
+ InitInput(dst, kBlockStride, bitdepth, rnd, inner_thresh, (n & 1) == 0);
+
+ const absl::Time start = absl::Now();
+ cur_loop_filters_[i](dst + 8 + kBlockStride * 8, kBlockStride,
+ outer_thresh, inner_thresh, hev_thresh);
+ elapsed_time += absl::Now() - start;
+
+ md5_digest.Add(reinterpret_cast<const uint8_t*>(dst), sizeof(dst));
+ }
+ if (digests == nullptr) {
+ const auto elapsed_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time));
+ printf("Mode %s[%25s]: %5d us\n",
+ ToString(static_cast<LoopFilterSize>(size_)),
+ ToString(static_cast<LoopFilterType>(i)), elapsed_time_us);
+ } else {
+ const std::string digest = md5_digest.Get();
+ printf("Mode %s[%25s]: MD5: %s\n",
+ ToString(static_cast<LoopFilterSize>(size_)),
+ ToString(static_cast<LoopFilterType>(i)), digest.c_str());
+ EXPECT_STREQ(digests[i], digest.c_str());
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterTest<bitdepth, Pixel>::TestSaturatedValues() const {
+ Pixel dst[kNumPixels], ref[kNumPixels];
+ const auto value = static_cast<Pixel>((1 << bitdepth) - 1);
+ for (auto& r : dst) r = value;
+ memcpy(ref, dst, sizeof(dst));
+
+ for (int i = 0; i < kNumLoopFilterTypes; ++i) {
+    if (cur_loop_filters_[i] == nullptr) continue;
+ const int outer_thresh = 24;
+ const int inner_thresh = 8;
+ const int hev_thresh = 0;
+ cur_loop_filters_[i](dst + 8 + kBlockStride * 8, kBlockStride, outer_thresh,
+ inner_thresh, hev_thresh);
+ ASSERT_TRUE(test_utils::CompareBlocks(ref, dst, kBlockStride, kBlockStride,
+ kBlockStride, kBlockStride, true))
+ << ToString(static_cast<LoopFilterType>(i))
+ << " output doesn't match reference";
+ }
+}
+
+//------------------------------------------------------------------------------
+
+using LoopFilterTest8bpp = LoopFilterTest<8, uint8_t>;
+
+const char* const* GetDigests8bpp(LoopFilterSize size) {
+ static const char* const kDigestsSize4[kNumLoopFilterTypes] = {
+ "6ba725d697d6209cb36dd199b8ffb47a",
+ "7dbb20e456ed0501fb4e7954f49f5e18",
+ };
+ static const char* const kDigestsSize6[kNumLoopFilterTypes] = {
+ "89bb757faa44298b7f6e9c1a67f455a5",
+ "be75d5a2fcd83709ff0845f7d83f7006",
+ };
+ static const char* const kDigestsSize8[kNumLoopFilterTypes] = {
+ "b09137d68c7b4f8a8a15e33b4b69828f",
+ "ef8a7f1aa073805516d3518a82a5cfa4",
+ };
+ static const char* const kDigestsSize14[kNumLoopFilterTypes] = {
+ "6a7bc061ace0888275af88093f82ca08",
+ "a957ddae005839aa41ba7691788b01e4",
+ };
+
+ switch (size) {
+ case kLoopFilterSize4:
+ return kDigestsSize4;
+ case kLoopFilterSize6:
+ return kDigestsSize6;
+ case kLoopFilterSize8:
+ return kDigestsSize8;
+ case kLoopFilterSize14:
+ return kDigestsSize14;
+ default:
+      ADD_FAILURE() << "Unknown loop filter size: " << size;
+ return nullptr;
+ }
+}
+
+TEST_P(LoopFilterTest8bpp, DISABLED_Speed) {
+ TestRandomValues(nullptr, kNumSpeedTests);
+}
+
+TEST_P(LoopFilterTest8bpp, FixedInput) {
+ TestRandomValues(GetDigests8bpp(size_), kNumTests);
+}
+
+TEST_P(LoopFilterTest8bpp, SaturatedValues) { TestSaturatedValues(); }
+
+constexpr LoopFilterSize kLoopFilterSizes[] = {
+ kLoopFilterSize4, kLoopFilterSize6, kLoopFilterSize8, kLoopFilterSize14};
+
+INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest8bpp,
+ testing::ValuesIn(kLoopFilterSizes));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest8bpp,
+ testing::ValuesIn(kLoopFilterSizes));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, LoopFilterTest8bpp,
+ testing::ValuesIn(kLoopFilterSizes));
+#endif
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using LoopFilterTest10bpp = LoopFilterTest<10, uint16_t>;
+
+const char* const* GetDigests10bpp(LoopFilterSize size) {
+ static const char* const kDigestsSize4[kNumLoopFilterTypes] = {
+ "72e75c478bb130ff1ebfa75f3a70b1a2",
+ "f32d67b611080e0bf1a9d162ff47c133",
+ };
+ static const char* const kDigestsSize6[kNumLoopFilterTypes] = {
+ "8aec73c60c87ac7cc6bc9cc5157a2795",
+ "0e4385d3a0cbb2b1551e05ad2b0f07fb",
+ };
+ static const char* const kDigestsSize8[kNumLoopFilterTypes] = {
+ "85cb2928fae43e1a27b2fe1b78ba7534",
+ "d044fad9d7c64b93ecb60c88ac48e55f",
+ };
+ static const char* const kDigestsSize14[kNumLoopFilterTypes] = {
+ "ebca95ec0db6efbac7ff7cbeabc0e6d0",
+ "754ffaf0ac26a5953a029653bb5dd275",
+ };
+
+ switch (size) {
+ case kLoopFilterSize4:
+ return kDigestsSize4;
+ case kLoopFilterSize6:
+ return kDigestsSize6;
+ case kLoopFilterSize8:
+ return kDigestsSize8;
+ case kLoopFilterSize14:
+ return kDigestsSize14;
+ default:
+      ADD_FAILURE() << "Unknown loop filter size: " << size;
+ return nullptr;
+ }
+}
+
+TEST_P(LoopFilterTest10bpp, DISABLED_Speed) {
+ TestRandomValues(nullptr, kNumSpeedTests);
+}
+
+TEST_P(LoopFilterTest10bpp, FixedInput) {
+ TestRandomValues(GetDigests10bpp(size_), kNumTests);
+}
+
+TEST_P(LoopFilterTest10bpp, SaturatedValues) { TestSaturatedValues(); }
+
+INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest10bpp,
+ testing::ValuesIn(kLoopFilterSizes));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest10bpp,
+ testing::ValuesIn(kLoopFilterSizes));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, LoopFilterTest10bpp,
+ testing::ValuesIn(kLoopFilterSizes));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using LoopFilterTest12bpp = LoopFilterTest<12, uint16_t>;
+
+const char* const* GetDigests12bpp(LoopFilterSize size) {
+ static const char* const kDigestsSize4[kNumLoopFilterTypes] = {
+ "a14599cbfe2daee633d556a15c47b1f6",
+ "1f0a0794832de1012e2fed6b1cb02e69",
+ };
+ static const char* const kDigestsSize6[kNumLoopFilterTypes] = {
+ "c76b24a73139239db10f16f36e01a625",
+ "3f75d904e9dcb1886e84a0f03f60f31e",
+ };
+ static const char* const kDigestsSize8[kNumLoopFilterTypes] = {
+ "57c6f0efe2ab3957f5500ca2a9670f37",
+ "caa1f90c2eb2b65b280d678f8fcf6be8",
+ };
+ static const char* const kDigestsSize14[kNumLoopFilterTypes] = {
+ "0c58f7466c36c3f4a2c1b4aa1b80f0b3",
+ "63077978326e6dddb5b2c3bfe6d684f5",
+ };
+
+ switch (size) {
+ case kLoopFilterSize4:
+ return kDigestsSize4;
+ case kLoopFilterSize6:
+ return kDigestsSize6;
+ case kLoopFilterSize8:
+ return kDigestsSize8;
+ case kLoopFilterSize14:
+ return kDigestsSize14;
+ default:
+      ADD_FAILURE() << "Unknown loop filter size: " << size;
+ return nullptr;
+ }
+}
+
+TEST_P(LoopFilterTest12bpp, DISABLED_Speed) {
+ TestRandomValues(nullptr, kNumSpeedTests);
+}
+
+TEST_P(LoopFilterTest12bpp, FixedInput) {
+ TestRandomValues(GetDigests12bpp(size_), kNumTests);
+}
+
+TEST_P(LoopFilterTest12bpp, SaturatedValues) { TestSaturatedValues(); }
+
+INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest12bpp,
+ testing::ValuesIn(kLoopFilterSizes));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+static std::ostream& operator<<(std::ostream& os, const LoopFilterSize size) {
+ return os << ToString(size);
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Section 7.17.3.
+// a2: range [1, 256].
+// if (z >= 255)
+// a2 = 256;
+// else if (z == 0)
+// a2 = 1;
+// else
+// a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
+// ma = 256 - a2;
+alignas(16) const uint8_t kSgrMaLookup[256] = {
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14,
+ 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7,
+ 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 0};
+
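+// A minimal sketch (not part of the library) that rebuilds the table from the
+// formula above, useful when auditing entries; it assumes kSgrProjSgrBits == 8,
+// matching the shift in the comment:
+//   std::array<uint8_t, 256> BuildSgrMaLookup() {
+//     std::array<uint8_t, 256> table;
+//     for (uint32_t z = 0; z < 256; ++z) {
+//       uint32_t a2;
+//       if (z >= 255) {
+//         a2 = 256;
+//       } else if (z == 0) {
+//         a2 = 1;
+//       } else {
+//         a2 = ((z << 8) + (z >> 1)) / (z + 1);  // z << kSgrProjSgrBits.
+//       }
+//       table[z] = static_cast<uint8_t>(256 - a2);  // ma = 256 - a2.
+//     }
+//     return table;  // table[0] == 255, table[1] == 128, table[255] == 0.
+//   }
+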
+namespace {
+
+template <int bitdepth, typename Pixel>
+inline void WienerHorizontal(const Pixel* source, const ptrdiff_t source_stride,
+ const int width, const int height,
+ const int16_t* const filter,
+ const int number_zero_coefficients,
+ int16_t** wiener_buffer) {
+ constexpr int kCenterTap = kWienerFilterTaps / 2;
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int offset =
+ 1 << (bitdepth + kWienerFilterBits - kRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
+ for (int y = 0; y < height; ++y) {
+ int x = 0;
+ do {
+ // sum fits into 16 bits only when bitdepth = 8.
+ int sum = 0;
+ for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
+ sum +=
+ filter[k] * (source[x + k] + source[x + kWienerFilterTaps - 1 - k]);
+ }
+ sum += filter[kCenterTap] * source[x + kCenterTap];
+ const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal);
+ (*wiener_buffer)[x] = Clip3(rounded_sum, -offset, limit - offset);
+ } while (++x != width);
+ source += source_stride;
+ *wiener_buffer += width;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+inline void WienerVertical(const int16_t* wiener_buffer, const int width,
+ const int height, const int16_t* const filter,
+ const int number_zero_coefficients, void* const dest,
+ const ptrdiff_t dest_stride) {
+ constexpr int kCenterTap = kWienerFilterTaps / 2;
+ constexpr int kRoundBitsVertical =
+ (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+ auto* dst = static_cast<Pixel*>(dest);
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ // sum needs 32 bits.
+ int sum = 0;
+ for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
+ sum += filter[k] *
+ (wiener_buffer[k * width + x] +
+ wiener_buffer[(kWienerFilterTaps - 1 - k) * width + x]);
+ }
+ sum += filter[kCenterTap] * wiener_buffer[kCenterTap * width + x];
+ const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+ dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
+ } while (++x != width);
+ wiener_buffer += width;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+// Note: bit range for the Wiener filter.
+// The Wiener filter process first applies horizontal filtering to the input
+// pixels, followed by rounding with predefined bits (dependent on bitdepth).
+// Then vertical filtering is applied, followed by rounding (dependent on
+// bitdepth).
+// The process is the same as convolution:
+// <input> --> <horizontal filter> --> <rounding 0> --> <vertical filter>
+// --> <rounding 1>
+// By design:
+// (a) Horizontal/vertical filtering adds 7 bits to the input.
+// (b) The output of the first rounding fits into 16 bits.
+// (c) The output of the second rounding fits into 16 bits.
+// If the input bitdepth > 8, the accumulator of the horizontal filter is
+// larger than 16 bits and smaller than 32 bits.
+// The accumulator of the vertical filter is larger than 16 bits and smaller
+// than 32 bits.
+// Note: range of the Wiener filter coefficients.
+// Wiener filter coefficients are symmetric, and their sum is 1 (128 in 7-bit
+// fixed point).
+// The range of each coefficient:
+// filter[0] = filter[6], 4 bits, min = -5, max = 10.
+// filter[1] = filter[5], 5 bits, min = -23, max = 8.
+// filter[2] = filter[4], 6 bits, min = -17, max = 46.
+// filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]).
+// The difference from libaom is that in libaom:
+// filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]).
+// Thus, in libaom's computation, an offset of 128 is needed for filter[3].
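+//
+// Worked example of the ranges above (bitdepth 8, assuming
+// kInterRoundBitsHorizontal == 3 and kWienerFilterBits == 7): in
+// WienerHorizontal,
+//   offset = 1 << (8 + 7 - 3 - 1) = 2048,
+//   limit = (offset << 2) - 1 = 8191,
+// so the clipped intermediate values lie in [-2048, 6143], which fits in the
+// int16_t |wiener_buffer|.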
+template <int bitdepth, typename Pixel>
+void WienerFilter_C(
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
+ constexpr int kCenterTap = kWienerFilterTaps / 2;
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ int16_t* const wiener_buffer_org = restoration_buffer->wiener_buffer;
+
+ // horizontal filtering.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const int16_t* const filter_horizontal =
+ restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
+ const auto* src = static_cast<const Pixel*>(source) - kCenterTap;
+ const auto* top = static_cast<const Pixel*>(top_border) - kCenterTap;
+ const auto* bottom = static_cast<const Pixel*>(bottom_border) - kCenterTap;
+ auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width;
+
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontal<bitdepth, Pixel>(
+ top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+ height_extra, filter_horizontal, 0, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 0, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+ height_extra, filter_horizontal, 0,
+ &wiener_buffer);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontal<bitdepth, Pixel>(
+ top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+ height_extra, filter_horizontal, 1, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 1, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+ height_extra, filter_horizontal, 1,
+ &wiener_buffer);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ WienerHorizontal<bitdepth, Pixel>(
+ top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+ height_extra, filter_horizontal, 2, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 2, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+ height_extra, filter_horizontal, 2,
+ &wiener_buffer);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontal<bitdepth, Pixel>(
+ top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+ height_extra, filter_horizontal, 3, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 3, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+ height_extra, filter_horizontal, 3,
+ &wiener_buffer);
+ }
+
+ // vertical filtering.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+ // Because the top row of |source| is a duplicate of the second row, and the
+ // bottom row of |source| is a duplicate of its above row, we can duplicate
+ // the top and bottom row of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer, wiener_buffer - width,
+ sizeof(*wiener_buffer) * width);
+ memcpy(wiener_buffer_org, wiener_buffer_org + width,
+ sizeof(*wiener_buffer) * width);
+ WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+ filter_vertical, 0, dest, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+ filter_vertical, 1, dest, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+ filter_vertical, 2, dest, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+ filter_vertical, 3, dest, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// When |height| is 1, |src_stride| could be set to an arbitrary value.
+template <typename Pixel, int size>
+LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
+ const int height, const int width,
+ uint16_t* const* sums,
+ uint32_t* const* square_sums) {
+ int y = height;
+ do {
+ uint32_t sum = 0;
+ uint32_t square_sum = 0;
+ for (int dx = 0; dx < size; ++dx) {
+ const Pixel source = src[dx];
+ sum += source;
+ square_sum += source * source;
+ }
+ (*sums)[0] = sum;
+ (*square_sums)[0] = square_sum;
+ int x = 1;
+ do {
+ const Pixel source0 = src[x - 1];
+ const Pixel source1 = src[x - 1 + size];
+ sum -= source0;
+ sum += source1;
+ square_sum -= source0 * source0;
+ square_sum += source1 * source1;
+ (*sums)[x] = sum;
+ (*square_sums)[x] = square_sum;
+ } while (++x != width);
+ src += src_stride;
+ ++sums;
+ ++square_sums;
+ } while (--y != 0);
+}
+
+// When |height| is 1, |src_stride| could be set to an arbitrary value.
+template <typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
+ const int height, const int width,
+ uint16_t* const* sum3, uint16_t* const* sum5,
+ uint32_t* const* square_sum3,
+ uint32_t* const* square_sum5) {
+ int y = height;
+ do {
+ uint32_t sum = 0;
+ uint32_t square_sum = 0;
+ for (int dx = 0; dx < 4; ++dx) {
+ const Pixel source = src[dx];
+ sum += source;
+ square_sum += source * source;
+ }
+ int x = 0;
+ do {
+ const Pixel source0 = src[x];
+ const Pixel source1 = src[x + 4];
+ sum -= source0;
+ square_sum -= source0 * source0;
+ (*sum3)[x] = sum;
+ (*square_sum3)[x] = square_sum;
+ sum += source1;
+ square_sum += source1 * source1;
+ (*sum5)[x] = sum + source0;
+ (*square_sum5)[x] = square_sum + source0 * source0;
+ } while (++x != width);
+ src += src_stride;
+ ++sum3;
+ ++sum5;
+ ++square_sum3;
+ ++square_sum5;
+ } while (--y != 0);
+}
+
+template <int bitdepth, int n>
+inline void CalculateIntermediate(const uint32_t s, uint32_t a,
+ const uint32_t b, uint8_t* const ma_ptr,
+ uint32_t* const b_ptr) {
+  // a: before the shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1).
+  // Since max bitdepth = 12, max < 2^31.
+  // After the shift, a < 2^16 * n < 2^22 regardless of bitdepth.
+  a = RightShiftWithRounding(a, (bitdepth - 8) << 1);
+  // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19.
+  // d < 2^8 * n < 2^14 regardless of bitdepth.
+ const uint32_t d = RightShiftWithRounding(b, bitdepth - 8);
+ // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
+ // and p itself satisfies p < 2^14 * n^2 < 2^26.
+ // This bound on p is due to:
+ // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
+ // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b.
+ // This is an artifact of rounding, and can only happen if all pixels
+ // are (almost) identical, so in this case we saturate to p=0.
+ const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
+ // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale <
+ // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12
+ // (this holds even after accounting for the rounding in s)
+ const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
+ // ma: range [0, 255].
+ const uint32_t ma = kSgrMaLookup[std::min(z, 255u)];
+ const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ // ma < 2^8, b < 2^(bitdepth) * n,
+ // one_over_n = round(2^12 / n)
+ // => the product here is < 2^(20 + bitdepth) <= 2^32,
+ // and b is set to a value < 2^(8 + bitdepth).
+ // This holds even with the rounding in one_over_n and in the overall result,
+ // as long as ma is strictly less than 2^8.
+ const uint32_t b2 = ma * b * one_over_n;
+ *ma_ptr = ma;
+ *b_ptr = RightShiftWithRounding(b2, kSgrProjReciprocalBits);
+}
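+
+// Flat-window example of the saturation above: if every pixel in the window
+// equals some value v, then a * n == d * d up to rounding, so p == 0, z == 0,
+// and ma == kSgrMaLookup[0] == 255, i.e. maximal smoothing toward the box
+// mean, which for a flat window leaves the pixel unchanged.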
+
+template <typename T>
+inline uint32_t Sum343(const T* const src) {
+ return 3 * (src[0] + src[2]) + 4 * src[1];
+}
+
+template <typename T>
+inline uint32_t Sum444(const T* const src) {
+ return 4 * (src[0] + src[1] + src[2]);
+}
+
+template <typename T>
+inline uint32_t Sum565(const T* const src) {
+ return 5 * (src[0] + src[2]) + 6 * src[1];
+}
+
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ const int width, const uint32_t s, SgrBuffer* const sgr_buffer,
+ uint16_t* const ma565, uint32_t* const b565) {
+ int x = 0;
+ do {
+ uint32_t a = 0;
+ uint32_t b = 0;
+ for (int dy = 0; dy < 5; ++dy) {
+ a += square_sum5[dy][x];
+ b += sum5[dy][x];
+ }
+ CalculateIntermediate<bitdepth, 25>(s, a, b, sgr_buffer->ma + x,
+ sgr_buffer->b + x);
+ } while (++x != width + 2);
+ x = 0;
+ do {
+ ma565[x] = Sum565(sgr_buffer->ma + x);
+ b565[x] = Sum565(sgr_buffer->b + x);
+ } while (++x != width);
+}
+
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint16_t* const sum3[3], const uint32_t* const square_sum3[3],
+ const int width, const uint32_t s, const bool calculate444,
+ SgrBuffer* const sgr_buffer, uint16_t* const ma343, uint32_t* const b343,
+ uint16_t* const ma444, uint32_t* const b444) {
+ int x = 0;
+ do {
+ uint32_t a = 0;
+ uint32_t b = 0;
+ for (int dy = 0; dy < 3; ++dy) {
+ a += square_sum3[dy][x];
+ b += sum3[dy][x];
+ }
+ CalculateIntermediate<bitdepth, 9>(s, a, b, sgr_buffer->ma + x,
+ sgr_buffer->b + x);
+ } while (++x != width + 2);
+ x = 0;
+ do {
+ ma343[x] = Sum343(sgr_buffer->ma + x);
+ b343[x] = Sum343(sgr_buffer->b + x);
+ } while (++x != width);
+ if (calculate444) {
+ x = 0;
+ do {
+ ma444[x] = Sum444(sgr_buffer->ma + x);
+ b444[x] = Sum444(sgr_buffer->b + x);
+ } while (++x != width);
+ }
+}
+
+template <typename Pixel>
+inline int CalculateFilteredOutput(const Pixel src, const uint32_t ma,
+ const uint32_t b, const int shift) {
+ const int32_t v = b - ma * src;
+ return RightShiftWithRounding(v,
+ kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <typename Pixel>
+inline void BoxFilterPass1Kernel(const Pixel src0, const Pixel src1,
+ const uint16_t* const ma565[2],
+ const uint32_t* const b565[2],
+ const ptrdiff_t x, int p[2]) {
+ p[0] = CalculateFilteredOutput<Pixel>(src0, ma565[0][x] + ma565[1][x],
+ b565[0][x] + b565[1][x], 5);
+ p[1] = CalculateFilteredOutput<Pixel>(src1, ma565[1][x], b565[1][x], 4);
+}
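+
+// A note on the shifts above: the 565 window weights sum to 16, so the two-row
+// sum ma565[0][x] + ma565[1][x] carries a total weight of 32 = 2^5 (shift 5),
+// while the single-row estimate for the second row carries 16 = 2^4 (shift 4).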
+
+template <typename Pixel>
+inline int BoxFilterPass2Kernel(const Pixel src, const uint16_t* const ma343[3],
+ const uint16_t* const ma444,
+ const uint32_t* const b343[3],
+ const uint32_t* const b444, const ptrdiff_t x) {
+ const uint32_t ma = ma343[0][x] + ma444[x] + ma343[2][x];
+ const uint32_t b = b343[0][x] + b444[x] + b343[2][x];
+ return CalculateFilteredOutput<Pixel>(src, ma, b, 5);
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedFinal(const int src, const int v) {
+  // If radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is:
+  // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13.
+  // Then, the range of s is bitdepth + 2. This is a rough estimate, taking the
+  // maximum value of each element.
+ const int s = src + RightShiftWithRounding(
+ v, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1));
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedDoubleMultiplier(const int src, const int filter0,
+ const int filter1, const int16_t w0,
+ const int16_t w2) {
+ const int v = w0 * filter0 + w2 * filter1;
+ return SelfGuidedFinal<bitdepth, Pixel>(src, v);
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedSingleMultiplier(const int src, const int filter,
+ const int16_t w0) {
+ const int v = w0 * filter;
+ return SelfGuidedFinal<bitdepth, Pixel>(src, v);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterPass1(const Pixel* const src, const ptrdiff_t stride,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width,
+ const uint32_t scale, const int16_t w0,
+ SgrBuffer* const sgr_buffer,
+ uint16_t* const ma565[2], uint32_t* const b565[2],
+ Pixel* dst) {
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+ ma565[1], b565[1]);
+ int x = 0;
+ do {
+ int p[2];
+ BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p);
+ dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0);
+ dst[stride + x] =
+ SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[stride + x], p[1], w0);
+ } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterPass2(const Pixel* const src, const Pixel* const src0,
+ const int width, const uint16_t scale,
+ const int16_t w0, uint16_t* const sum3[4],
+ uint32_t* const square_sum3[4],
+ SgrBuffer* const sgr_buffer,
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint32_t* const b343[4], uint32_t* const b444[3],
+ Pixel* dst) {
+ BoxSum<Pixel, 3>(src0, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
+ sgr_buffer, ma343[2], b343[2], ma444[1],
+ b444[1]);
+ int x = 0;
+ do {
+ const int p =
+ BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
+ dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
+ } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilter(const Pixel* const src, const ptrdiff_t stride,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], const int width,
+ const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, SgrBuffer* const sgr_buffer,
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2],
+ Pixel* dst) {
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+ sgr_buffer, ma565[1], b565[1]);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], true,
+ sgr_buffer, ma343[2], b343[2], ma444[1],
+ b444[1]);
+ BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
+ true, sgr_buffer, ma343[3], b343[3], ma444[2],
+ b444[2]);
+ int x = 0;
+ do {
+ int p[2][2];
+ BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p[0]);
+ p[1][0] =
+ BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
+ p[1][1] = BoxFilterPass2Kernel<Pixel>(src[stride + x], ma343 + 1, ma444[1],
+ b343 + 1, b444[1], x);
+ dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0],
+ p[1][0], w0, w2);
+ dst[stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(
+ src[stride + x], p[0][1], p[1][1], w0, w2);
+ } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
+ const Pixel* src, const ptrdiff_t stride,
+ const Pixel* const top_border,
+ const ptrdiff_t top_border_stride,
+ const Pixel* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, Pixel* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum<Pixel>(top_border, top_border_stride, 2, width + 2, sum3, sum5 + 1,
+ square_sum3, square_sum5 + 1);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ BoxSum<Pixel>(src, stride, 1, width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
+ square_sum5 + 3);
+ const Pixel* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSum<Pixel>(s, 0, 1, width + 2, sum3 + 3, sum5 + 4, square_sum3 + 3,
+ square_sum5 + 4);
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+ sgr_buffer, ma565[0], b565[0]);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
+ sgr_buffer, ma343[0], b343[0], nullptr,
+ nullptr);
+ BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
+ true, sgr_buffer, ma343[1], b343[1], ma444[0],
+ b444[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxSum<Pixel>(src + 2 * stride, stride, 2, width + 2, sum3 + 2, sum5 + 3,
+ square_sum3 + 2, square_sum5 + 3);
+ BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
+ square_sum5, width, scales, w0, w2, sgr_buffer,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const Pixel* sr;
+ ptrdiff_t s_stride;
+ if ((height & 1) == 0) {
+ sr = bottom_border;
+ s_stride = bottom_border_stride;
+ } else {
+ sr = src + 2 * stride;
+ s_stride = bottom_border - (src + 2 * stride);
+ }
+ BoxSum<Pixel>(sr, s_stride, 2, width + 2, sum3 + 2, sum5 + 3,
+ square_sum3 + 2, square_sum5 + 3);
+ BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
+ square_sum5, width, scales, w0, w2, sgr_buffer,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxSum<Pixel>(bottom_border + bottom_border_stride, bottom_border_stride, 1,
+ width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
+ square_sum5 + 3);
+ sum5[4] = sum5[3];
+ square_sum5[4] = square_sum5[3];
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+ sgr_buffer, ma565[1], b565[1]);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
+ sgr_buffer, ma343[2], b343[2], nullptr,
+ nullptr);
+ int x = 0;
+ do {
+ const int p0 = CalculateFilteredOutput<Pixel>(
+ src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
+ const int p1 = BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343,
+ b444[0], x);
+ dst[x] =
+ SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p0, p1, w0, w2);
+ } while (++x != width);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const Pixel* src, const ptrdiff_t stride,
+ const Pixel* const top_border,
+ const ptrdiff_t top_border_stride,
+ const Pixel* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, Pixel* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<Pixel, 5>(top_border, top_border_stride, 2, width + 2, sum5 + 1,
+ square_sum5 + 1);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ BoxSum<Pixel, 5>(src, stride, 1, width + 2, sum5 + 3, square_sum5 + 3);
+ const Pixel* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSum<Pixel, 5>(s, 0, 1, width + 2, sum5 + 4, square_sum5 + 4);
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxSum<Pixel, 5>(src + 2 * stride, stride, 2, width + 2, sum5 + 3,
+ square_sum5 + 3);
+ BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
+ scale, w0, sgr_buffer, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const Pixel* sr;
+ ptrdiff_t s_stride;
+ if ((height & 1) == 0) {
+ sr = bottom_border;
+ s_stride = bottom_border_stride;
+ } else {
+ sr = src + 2 * stride;
+ s_stride = bottom_border - (src + 2 * stride);
+ }
+ BoxSum<Pixel, 5>(sr, s_stride, 2, width + 2, sum5 + 3, square_sum5 + 3);
+ BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
+ scale, w0, sgr_buffer, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxSum<Pixel, 5>(bottom_border + bottom_border_stride, bottom_border_stride,
+ 1, width + 2, sum5 + 3, square_sum5 + 3);
+ sum5[4] = sum5[3];
+ square_sum5[4] = square_sum5[3];
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+ ma565[1], b565[1]);
+ int x = 0;
+ do {
+ const int p = CalculateFilteredOutput<Pixel>(
+ src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
+ dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
+ } while (++x != width);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const Pixel* src, const ptrdiff_t stride,
+ const Pixel* const top_border,
+ const ptrdiff_t top_border_stride,
+ const Pixel* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, Pixel* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t sum_stride = temp_stride + 8;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<Pixel, 3>(top_border, top_border_stride, 2, width + 2, sum3,
+ square_sum3);
+ BoxSum<Pixel, 3>(src, stride, 1, width + 2, sum3 + 2, square_sum3 + 2);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, false,
+ sgr_buffer, ma343[0], b343[0], nullptr,
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const Pixel* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSum<Pixel, 3>(s, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
+ sgr_buffer, ma343[1], b343[1], ma444[0],
+ b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2<bitdepth, Pixel>(src + 2, src + 2 * stride, width, scale, w0,
+ sum3, square_sum3, sgr_buffer, ma343, ma444,
+ b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ src += 2;
+ int y = std::min(height, 2);
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2<bitdepth, Pixel>(src, bottom_border, width, scale, w0, sum3,
+ square_sum3, sgr_buffer, ma343, ma444, b343,
+ b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+template <int bitdepth, typename Pixel>
+void SelfGuidedFilter_C(
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* src = static_cast<const Pixel*>(source);
+ const auto* top = static_cast<const Pixel*>(top_border);
+ const auto* bottom = static_cast<const Pixel*>(bottom_border);
+ auto* dst = static_cast<Pixel*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1<bitdepth, Pixel>(
+ restoration_info, src - 3, stride, top - 3, top_border_stride,
+ bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2<bitdepth, Pixel>(
+ restoration_info, src - 2, stride, top - 2, top_border_stride,
+ bottom - 2, bottom_border_stride, width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess<bitdepth, Pixel>(
+ restoration_info, src - 3, stride, top - 3, top_border_stride,
+ bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+ dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+ dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>;
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_WienerFilter
+ dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_SelfGuidedFilter
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+void LoopRestorationInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
+#define LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/loop_restoration_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc. The
+// order of includes is important as each tests for a superior version before
+// setting the base.
+// clang-format off
+#include "src/dsp/x86/loop_restoration_avx2.h"
+#include "src/dsp/x86/loop_restoration_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+extern const uint8_t kSgrMaLookup[256];
+
+// Initializes Dsp::loop_restorations. This function is not thread-safe.
+void LoopRestorationInit_C();
+
+template <typename T>
+void Circulate3PointersBy1(T* p[3]) {
+ T* const p0 = p[0];
+ p[0] = p[1];
+ p[1] = p[2];
+ p[2] = p0;
+}
+
+template <typename T>
+void Circulate4PointersBy2(T* p[4]) {
+ std::swap(p[0], p[2]);
+ std::swap(p[1], p[3]);
+}
+
+template <typename T>
+void Circulate5PointersBy2(T* p[5]) {
+ T* const p0 = p[0];
+ T* const p1 = p[1];
+ p[0] = p[2];
+ p[1] = p[3];
+ p[2] = p[4];
+ p[3] = p0;
+ p[4] = p1;
+}
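+
+// Usage sketch: with T* p[5] = {r0, r1, r2, r3, r4} viewed as a sliding window
+// of row pointers, Circulate5PointersBy2(p) yields {r2, r3, r4, r0, r1}; the
+// two oldest rows rotate to the back so their storage can be reused for the
+// next two rows of sums.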
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// In units of Pixel.
+constexpr int kBorder = 16;
+constexpr int kWidth = 256;
+constexpr int kHeight = 255;
+constexpr int kStride = kWidth + 2 * kBorder;
+constexpr int kOffset = kBorder * kStride + kBorder;
+constexpr int kMaxBlockSize = 288 * kStride;
+constexpr int kUnitWidths[] = {32, 64, 128, 256};
+
+constexpr int kNumRadiusTypes = 3;
+constexpr int kNumWienerOrders = 4;
+constexpr int kWienerOrders[] = {7, 5, 3, 1};
+constexpr int kWienerOrderIdLookup[] = {0, 3, 0, 2, 0, 1, 0, 0};
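+// kWienerOrderIdLookup maps a Wiener filter order (1, 3, 5 or 7) to its index
+// in kWienerOrders, e.g. kWienerOrderIdLookup[5] == 1.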
+
+template <int bitdepth, typename Pixel>
+class SelfGuidedFilterTest : public testing::TestWithParam<int>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ SelfGuidedFilterTest() = default;
+ SelfGuidedFilterTest(const SelfGuidedFilterTest&) = delete;
+ SelfGuidedFilterTest& operator=(const SelfGuidedFilterTest&) = delete;
+ ~SelfGuidedFilterTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ LoopRestorationInit_C();
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "AVX2/")) {
+ if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+ LoopRestorationInit_AVX2();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_AVX2();
+#endif
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ LoopRestorationInit_SSE4_1();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_SSE4_1();
+#endif
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ LoopRestorationInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_NEON();
+#endif
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ target_self_guided_filter_func_ = dsp->loop_restorations[1];
+ restoration_info_.type = kLoopRestorationTypeSgrProj;
+ memset(dst_, 0, sizeof(dst_));
+ }
+
+ void SetInputData(int type, Pixel value, int radius_index,
+ libvpx_test::ACMRandom* rnd);
+ void TestFixedValues(int test_index, Pixel value);
+ void TestRandomValues(bool speed);
+
+ protected:
+ const int unit_width_ = GetParam();
+ const int unit_height_ = kRestorationUnitHeight;
+
+ private:
+ alignas(kMaxAlignment) Pixel src_[kMaxBlockSize];
+ alignas(kMaxAlignment) Pixel dst_[kMaxBlockSize];
+ RestorationUnitInfo restoration_info_;
+ RestorationBuffer restoration_buffer_;
+ LoopRestorationFunc target_self_guided_filter_func_;
+};
+
+template <int bitdepth, typename Pixel>
+void SelfGuidedFilterTest<bitdepth, Pixel>::SetInputData(
+ int type, Pixel value, int radius_index,
+ libvpx_test::ACMRandom* const rnd) {
+ const int mask = (1 << bitdepth) - 1;
+ if (type == 0) { // Set fixed values
+ for (auto& s : src_) s = value;
+ } else { // Set random values
+ for (auto& s : src_) s = rnd->Rand16() & mask;
+ }
+ for (auto& d : dst_) d = rnd->Rand16() & mask;
+ restoration_info_.sgr_proj_info.multiplier[0] =
+ kSgrProjMultiplierMin[0] +
+ rnd->PseudoUniform(kSgrProjMultiplierMax[0] - kSgrProjMultiplierMin[0] +
+ 1);
+ restoration_info_.sgr_proj_info.multiplier[1] =
+ kSgrProjMultiplierMin[1] +
+ rnd->PseudoUniform(kSgrProjMultiplierMax[1] - kSgrProjMultiplierMin[1] +
+ 1);
+  // Regulate the multipliers so that they match libaom. A valid self-guided
+  // filter does not allow r0 and r1 to be 0 at the same time. When r0 or r1
+  // is zero, the corresponding multiplier is forced to a fixed value (0 for
+  // multiplier[0], 95 for multiplier[1]), mirroring libaom.
+ int index;
+ if (radius_index == 0) {
+ index = 0; // r0 = 2, r1 = 1
+ } else if (radius_index == 1) {
+ index = 10; // r0 = 0, r1 = 1
+ } else /* if (radius_index == 2) */ {
+ index = 14; // r0 = 2, r1 = 0
+ }
+ const uint8_t r0 = kSgrProjParams[index][0];
+ const uint8_t r1 = kSgrProjParams[index][2];
+ static constexpr int kMultiplier[2] = {0, 95};
+ restoration_info_.sgr_proj_info.index = index;
+ if (r0 == 0) {
+ restoration_info_.sgr_proj_info.multiplier[0] = kMultiplier[0];
+ } else if (r1 == 0) {
+ restoration_info_.sgr_proj_info.multiplier[1] = kMultiplier[1];
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void SelfGuidedFilterTest<bitdepth, Pixel>::TestFixedValues(int test_index,
+ Pixel value) {
+ static const char* const kDigest[][3][kNumRadiusTypes] = {
+ {{"7b78783ff4f03625a50c2ebfd574adca", "4faa0810639016f11a9f761ce28c38b0",
+ "a03314fc210bee68c7adbb44d2bbdac7"},
+ {"fce031d1339cfef5016e76a643538a71", "d439e1060de3f07b5b29c9b0b7c08e54",
+ "a6583fe9359877f4a259c81d900fc4fb"},
+ {"8f9b6944c8965f34d444a667da3b0ebe", "84fa62c491c67c3a435fd5140e7a4f82",
+ "d04b62d97228789e5c6928d40d5d900e"}},
+ {{"948ea16a90c4cefef87ce5b0ee105fc6", "76740629877b721432b84dbbdb4e352a",
+ "27100f37b3e42a5f2a051e1566edb6f8"},
+ {"dd320de3bc82f4ba69738b2190ea9f85", "bf82f271e30a1aca91e53b086e133fb3",
+ "69c274ac59c99999e1bfbf2fc4586ebd"},
+ {"86ff2318bf8a584b8d5edd710681d621", "f6e1c104a764d6766cc278d5b216855a",
+ "6d928703526ab114efba865ff5b11886"}},
+ {{"9fbf1b246011250f38532a543cc6dd74", "d5c1e0142390ebb51b075c49f8ee9ff4",
+ "92f31086ba2f9e1508983b22d93a4e5c"},
+ {"2198321e6b95e7199738e60f5ddc6966", "34f74626027ffca010c824ddf0942b13",
+ "43dd7df2c2a601262c68cd8af1c61b82"},
+ {"1ab6138c3a82ac8ccd840f0553fdfb58", "be3bf92633f7165d3ad9c327d2dd53fe",
+ "41115efff3adeb541e04db23faa22f23"}},
+ {{"42364ff8dbdbd6706fa3b8855a4258be", "a7843fdfd4d3c0d80ba812b353b4d6b4",
+ "f8a6a025827f29f857bed3e28ba3ea33"},
+ {"b83c1f8d7712e37f9b21b033822e37ed", "589daf2e3e6f8715873920515cfc1b42",
+ "20dcbe8e317a4373bebf11d56adc5f02"},
+ {"7971a60337fcdb662c92db051bd0bb41", "75f89f346c2a37bf0c6695c0482531e6",
+ "1595eeacd62cdce4d2fb094534c22c1e"}}};
+ if (target_self_guided_filter_func_ == nullptr) return;
+ ASSERT_LT(value, 1 << bitdepth);
+ constexpr int bd_index = (bitdepth - 8) / 2;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ const Pixel* const src = src_ + kOffset;
+ Pixel* const dst = dst_ + kOffset;
+ for (int radius_index = 0; radius_index < kNumRadiusTypes; ++radius_index) {
+ SetInputData(0, value, radius_index, &rnd);
+ const absl::Time start = absl::Now();
+ for (int y = 0; y < kHeight; y += unit_height_) {
+ const int height = std::min(unit_height_, kHeight - y);
+ for (int x = 0; x < kWidth; x += unit_width_) {
+ const int width = std::min(unit_width_, kWidth - x);
+ const Pixel* const source = src + y * kStride + x;
+ target_self_guided_filter_func_(
+ restoration_info_, source, kStride,
+ source - kRestorationVerticalBorder * kStride, kStride,
+ source + height * kStride, kStride, width, height,
+ &restoration_buffer_, dst + y * kStride + x);
+ }
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(
+ "kLoopRestorationTypeSgrProj", std::to_string(GetParam()).c_str(),
+ kDigest[test_index][bd_index][radius_index], dst_ + kBorder * kStride,
+ kHeight * kStride * sizeof(*dst_), elapsed_time);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void SelfGuidedFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) {
+ static const char* const kDigest[][3][kNumRadiusTypes] = {
+ {{"9f8358ed820943fa0abe3a8ebb5887db", "fb5d48870165522341843bcbfa8674fb",
+ "ca67159cd29475ac5d52ca4a0df3ea10"},
+ {"a78641886ea0cf8757057d1d91e01434", "1b95172a5f2f9c514c78afa4cf8e5678",
+ "a8ba988283d9e1ad1f0dcdbf6bbdaade"},
+ {"d95e98d031f9ba290e5183777d1e4905", "f806853cfadb50e6dbd4898412b92934",
+ "741fbfdb79cda695afedda3d51dbb27f"}},
+ {{"f219b445e5c80ffb5dd0359cc2cb4dd4", "699b2c9ddca1cbb0d4fc24cbcbe951e9",
+ "a4005899fa8d3c3c4669910f93ff1290"},
+ {"10a75cab3c78b891c8c6d92d55f685d1", "d46f158f57c628136f6f298ee8ca6e0e",
+ "07203ad761775d5d317f2b7884afd9fe"},
+ {"76b9ef906090fa81af64cce3bba0a54a", "8eecc59acdef8953aa9a96648c0ccd2c",
+ "6e45a0ef60e0475f470dc93552047f07"}},
+ {{"000d4e382be4003b514c9135893d0a37", "8fb082dca975be363bfc9c2d317ae084",
+ "475bcb6a58f87da7723f6227bc2aca0e"},
+ {"4d589683f69ccc5b416149dcc5c835d5", "986b6832df1f6020d50be61ae121e42f",
+ "7cb5c5dbdb3d1c54cfa00def450842dc"},
+ {"0e3dc23150d18c9d366d15e174727311", "8495122917770d822f1842ceff987b03",
+ "4aeb9db902072cefd6af0aff8aaabd24"}},
+ {{"fd43bfe34d63614554dd29fb24b12173", "5c1ba74ba3062c769d5c3c86a85ac9b9",
+ "f1eda6d15b37172199d9949c2315832f"},
+ {"a11be3117fb77e8fe113581b06f98bd1", "df94d12b774ad5cf744c871e707c36c8",
+ "b23dc0b54c3500248d53377030428a61"},
+ {"9c331f2b9410354685fe904f6c022dfa", "b540b0045b7723fbe962fd675db4b077",
+ "3cecd1158126c9c9cc2873ecc8c1a135"}},
+ {{"f3079b3b21d8dc6fce7bb1fd104be359", "c6fcbc686cfb97ab3a64f445d73aad36",
+ "23966cba3e0e7803eeb951905861e0dd"},
+ {"7210391a6fe26e5ca5ea205bc38aa035", "4c3e6eccad3ea152d320ecd1077169de",
+ "dcee48f94126a2132963e86e93dd4903"},
+ {"beb3dd8a2dbc5f83ef171b0ffcead3ab", "c373bd9c46bdb89a3d1e41759c315025",
+ "cd407b212ab46fd4a451d5dc93a0ce4a"}}};
+ if (target_self_guided_filter_func_ == nullptr) return;
+ constexpr int bd_index = (bitdepth - 8) / 2;
+ const int num_inputs = speed ? 1 : 5;
+#if LIBGAV1_ENABLE_NEON
+ const int num_tests = speed ? 4000 : 1;
+#else
+ const int num_tests = speed ? 10000 : 1;
+#endif
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ const Pixel* const src = src_ + kOffset;
+ Pixel* const dst = dst_ + kOffset;
+ for (int i = 0; i < num_inputs; ++i) {
+ for (int radius_index = 0; radius_index < kNumRadiusTypes; ++radius_index) {
+ SetInputData(1, 0, radius_index, &rnd);
+ const absl::Time start = absl::Now();
+ for (int k = 0; k < num_tests; ++k) {
+ for (int y = 0; y < kHeight; y += unit_height_) {
+ const int height = std::min(unit_height_, kHeight - y);
+ for (int x = 0; x < kWidth; x += unit_width_) {
+ const int width = std::min(unit_width_, kWidth - x);
+ const Pixel* const source = src + y * kStride + x;
+ target_self_guided_filter_func_(
+ restoration_info_, source, kStride,
+ source - kRestorationVerticalBorder * kStride, kStride,
+ source + height * kStride, kStride, width, height,
+ &restoration_buffer_, dst + y * kStride + x);
+ }
+ }
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(
+ "kLoopRestorationTypeSgrProj", std::to_string(GetParam()).c_str(),
+ kDigest[i][bd_index][radius_index], dst_ + kBorder * kStride,
+ kHeight * kStride * sizeof(*dst_), elapsed_time);
+ }
+ }
+}
+
+using SelfGuidedFilterTest8bpp = SelfGuidedFilterTest<8, uint8_t>;
+
+TEST_P(SelfGuidedFilterTest8bpp, Correctness) {
+ TestFixedValues(0, 0);
+ TestFixedValues(1, 1);
+ TestFixedValues(2, 128);
+ TestFixedValues(3, 255);
+ TestRandomValues(false);
+}
+
+TEST_P(SelfGuidedFilterTest8bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, SelfGuidedFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, SelfGuidedFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SelfGuidedFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using SelfGuidedFilterTest10bpp = SelfGuidedFilterTest<10, uint16_t>;
+
+TEST_P(SelfGuidedFilterTest10bpp, Correctness) {
+ TestFixedValues(0, 0);
+ TestFixedValues(1, 1);
+ TestFixedValues(2, 512);
+ TestFixedValues(3, 1023);
+ TestRandomValues(false);
+}
+
+TEST_P(SelfGuidedFilterTest10bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest10bpp,
+ testing::ValuesIn(kUnitWidths));
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, SelfGuidedFilterTest10bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, SelfGuidedFilterTest10bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SelfGuidedFilterTest10bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using SelfGuidedFilterTest12bpp = SelfGuidedFilterTest<12, uint16_t>;
+
+TEST_P(SelfGuidedFilterTest12bpp, Correctness) {
+ TestFixedValues(0, 0);
+ TestFixedValues(1, 1);
+ TestFixedValues(2, 2048);
+ TestFixedValues(3, 4095);
+ TestRandomValues(false);
+}
+
+TEST_P(SelfGuidedFilterTest12bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest12bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+template <int bitdepth, typename Pixel>
+class WienerFilterTest : public testing::TestWithParam<int>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ WienerFilterTest() = default;
+ WienerFilterTest(const WienerFilterTest&) = delete;
+ WienerFilterTest& operator=(const WienerFilterTest&) = delete;
+ ~WienerFilterTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ LoopRestorationInit_C();
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_wiener_filter_func_ = dsp->loop_restorations[0];
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "AVX2/")) {
+ if ((GetCpuInfo() & kAVX2) == 0) GTEST_SKIP() << "No AVX2 support!";
+ LoopRestorationInit_AVX2();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_AVX2();
+#endif
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ LoopRestorationInit_SSE4_1();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_SSE4_1();
+#endif
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ LoopRestorationInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_NEON();
+#endif
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ target_wiener_filter_func_ = dsp->loop_restorations[0];
+ restoration_info_.type = kLoopRestorationTypeWiener;
+ memset(dst_, 0, sizeof(dst_));
+ memset(tmp_, 0, sizeof(tmp_));
+ memset(buffer_, 0, sizeof(buffer_));
+ }
+
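+  // A Wiener filter of order n uses only the innermost n taps of the 7-tap
+  // symmetric kernel, so the outer taps are zeroed out for lower orders.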
+ static void CleanFilterByOrder(const int order,
+ int16_t filter[kWienerFilterTaps]) {
+ if (order <= 5) filter[0] = 0;
+ if (order <= 3) filter[1] = 0;
+ if (order <= 1) filter[2] = 0;
+ }
+
+ void SetInputData(int type, Pixel value, int vertical_order,
+ int horizontal_order);
+ void TestFixedValues(int digest_id, Pixel value);
+ void TestRandomValues(bool speed);
+ void TestCompare2C();
+
+ protected:
+ const int unit_width_ = GetParam();
+ const int unit_height_ = kRestorationUnitHeight;
+
+ private:
+ alignas(kMaxAlignment)
+ uint16_t buffer_[(kRestorationUnitWidth + kWienerFilterTaps - 1) *
+ kRestorationUnitHeight];
+ alignas(kMaxAlignment) Pixel src_[kMaxBlockSize];
+ alignas(kMaxAlignment) Pixel dst_[kMaxBlockSize];
+ alignas(kMaxAlignment) Pixel tmp_[kMaxBlockSize];
+ RestorationUnitInfo restoration_info_;
+ RestorationBuffer restoration_buffer_;
+ LoopRestorationFunc base_wiener_filter_func_;
+ LoopRestorationFunc target_wiener_filter_func_;
+};
+
+template <int bitdepth, typename Pixel>
+void WienerFilterTest<bitdepth, Pixel>::SetInputData(
+ int type, Pixel value, const int vertical_order,
+ const int horizontal_order) {
+ const int mask = (1 << bitdepth) - 1;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ if (type == 0) {
+ for (auto& s : src_) s = value;
+ } else {
+ for (auto& s : src_) s = rnd.Rand16() & mask;
+ }
+ int order = vertical_order;
+ for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+ auto& filter = restoration_info_.wiener_info.filter[i];
+ filter[3] = 128;
+ for (int j = 0; j < 3; ++j) {
+ filter[j] = kWienerTapsMin[j] +
+ rnd.PseudoUniform(kWienerTapsMax[j] - kWienerTapsMin[j] + 1);
+ }
+ CleanFilterByOrder(order, filter);
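+    // The 7-tap kernel is symmetric (f0 f1 f2 f3 f2 f1 f0); set the center
+    // tap so that all seven taps sum to 128, i.e. unit DC gain in Q7.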
+ filter[3] -= 2 * (filter[0] + filter[1] + filter[2]);
+ restoration_info_.wiener_info.number_leading_zero_coefficients[i] =
+ (kWienerFilterTaps - order) / 2;
+ order = horizontal_order;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void WienerFilterTest<bitdepth, Pixel>::TestFixedValues(int digest_id,
+ Pixel value) {
+ static const char* const kDigest[3][4] = {
+ {"74fc90760a14b13340cb718f200ba350", "5bacaca0128cd36f4805330b3787771d",
+ "1109e17545cc4fbd5810b8b77e19fc36", "e7f914ec9d065aba92338016e17a526c"},
+ {"c8cc38790ceb0bea1eb989686755e1e5", "70f573b7e8875262c638a68d2f317916",
+ "193b19065899c835cb513149eb36d135", "f1dff65e3e53558b303ef0a2e3f3ba98"},
+ {"c8cc38790ceb0bea1eb989686755e1e5", "70f573b7e8875262c638a68d2f317916",
+ "961eeb92bd9d85eb47e3961ee93d279a", "039a279232bc90eebc0ec2fe3e18a7e1"},
+ };
+ if (target_wiener_filter_func_ == nullptr) return;
+ ASSERT_LT(value, 1 << bitdepth);
+ constexpr int bd_index = (bitdepth - 8) / 2;
+ const Pixel* const src = src_ + kOffset;
+ Pixel* const dst = dst_ + kOffset;
+ for (const auto vertical_order : kWienerOrders) {
+ for (const auto horizontal_order : kWienerOrders) {
+ SetInputData(0, value, vertical_order, horizontal_order);
+ memset(dst_, 0, sizeof(dst_));
+ const absl::Time start = absl::Now();
+ for (int y = 0; y < kHeight; y += unit_height_) {
+ const int height = std::min(unit_height_, kHeight - y);
+ for (int x = 0; x < kWidth; x += unit_width_) {
+ const int width = std::min(unit_width_, kWidth - x);
+ const Pixel* const source = src + y * kStride + x;
+ target_wiener_filter_func_(
+ restoration_info_, source, kStride,
+ source - kRestorationVerticalBorder * kStride, kStride,
+ source + height * kStride, kStride, width, height,
+ &restoration_buffer_, dst + y * kStride + x);
+ }
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(
+ "kLoopRestorationTypeWiener", std::to_string(GetParam()).c_str(),
+ kDigest[bd_index][digest_id], dst_, sizeof(dst_), elapsed_time);
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void WienerFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) {
+ static const char* const kDigest[3][kNumWienerOrders][kNumWienerOrders] = {
+ {{"40d0cf56d2ffb4f581e68b0fc97f547f", "5c04745209b684ba98004ebb0f64e70b",
+ "545ed7d3f7e7ca3b86b4ada31f7aaee7", "0d6b2967f1bd1d99b720e563fe0cf03f"},
+ {"44b37076f0cf27f6eb506aca50c1d3e4", "e927d64dc9249e05a65e10ee75baa7d9",
+ "6136ecb4e29b17c9566504148943fd47", "c5ee2da81d44dc8cb2ac8021f724eb7a"},
+ {"125cbb227313ec91a2683f26e6f049d1", "77671b6529c806d23b749f304b548f59",
+ "28d53a1b486881895b8f73fa64486df1", "f5e32165bafe575d7ee7a6fbae75f36d"},
+ {"e832c41f2566ab542b32abba9d4f27bd", "ab1336ee6b85cba651f35ee5d3b3cc5c",
+ "52a673b6d14fbdca5ebdb1a34ee3326f",
+ "ebb42c7c9111f2e39f21e2158e801d9e"}},
+ {{"8cd9c6bd9983bd49564a58ed4af9098a", "f71f333c9d71237ed4e46f0ef2283196",
+ "375b43abc1d6682d62f91c1841b8b0fc", "71e2444822ae9c697ddfc96e07c6e8a1"},
+ {"d9ed3a66ceef405c08c87f6e91b71059", "c171fcff5fb7bb919f13ead7a4917a4c",
+ "8fbd1edb82fcd78d4d286886f65a700a", "fe14a143e6b261c5bb07b179d40be5a2"},
+ {"1c995f4e7f117857de73211b81093bd0", "5ab1ee3bb14adcd66d66802d58bee068",
+ "d77430783e173ebd1b30e5d9336c8b69", "e159a3620747458dff7ed3d20da1a4b7"},
+ {"5346fa07d195c257548a332753b057a3", "c77674bc0a638abc4d38d58e494fc7cf",
+ "7cbc1562a9dd08e1973b3b9ac1afc765",
+ "3c91bf1a34672cd40bf261c5820d3ec3"}},
+ {{"501b57370c781372b514accd03d161af", "a4569b5eff7f7e8b696934d192619be5",
+ "24eb2aa43118a8822f7a6a7384ab9ea7", "edd7ac227733b5a4496bfdbdf4eb34d7"},
+ {"77624cf73299a1bd928eae3eb8945dbe", "b3f311cacbf45fa892761462d31b2598",
+ "977c063d93a4b95cb365363763faa4da", "02313c9d360a1e0180ed05d3e4444c3d"},
+ {"f499655ecdcbe0ac48553f1eee758589", "a009c83c03e47cbd05c1243e28579bd9",
+ "d5f0b4fd761ff51efce949e6c5ec4833", "e3a9a57aacd2e6cfe0f792a885b3e0e3"},
+ {"b4cf906e9bb02ffca15c1e9575962ca2", "d0ca9f933978c0c31175ba1b28a44ae8",
+ "81ac1475530ffbd1c8d3ce7da87ffe6b",
+ "b96412949c2e31b29388222ac8914fa2"}},
+ };
+ if (target_wiener_filter_func_ == nullptr) return;
+ constexpr int bd_index = (bitdepth - 8) / 2;
+#if LIBGAV1_ENABLE_NEON
+ const int num_tests = speed ? 5000 : 1;
+#else
+ const int num_tests = speed ? 10000 : 1;
+#endif
+ const Pixel* const src = src_ + kOffset;
+ Pixel* const dst = dst_ + kOffset;
+ for (const auto vertical_order : kWienerOrders) {
+ for (const auto horizontal_order : kWienerOrders) {
+ SetInputData(1, (1 << bitdepth) - 1, vertical_order, horizontal_order);
+ memset(dst_, 0, sizeof(dst_));
+ const absl::Time start = absl::Now();
+ for (int i = 0; i < num_tests; ++i) {
+ for (int y = 0; y < kHeight; y += unit_height_) {
+ const int height = std::min(unit_height_, kHeight - y);
+ for (int x = 0; x < kWidth; x += unit_width_) {
+ const int width = std::min(unit_width_, kWidth - x);
+ const Pixel* const source = src + y * kStride + x;
+ target_wiener_filter_func_(
+ restoration_info_, source, kStride,
+ source - kRestorationVerticalBorder * kStride, kStride,
+ source + height * kStride, kStride, width, height,
+ &restoration_buffer_, dst + y * kStride + x);
+ }
+ }
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(
+ "kLoopRestorationTypeWiener", std::to_string(GetParam()).c_str(),
+ kDigest[bd_index][kWienerOrderIdLookup[vertical_order]]
+ [kWienerOrderIdLookup[horizontal_order]],
+ dst_, sizeof(dst_), elapsed_time);
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void WienerFilterTest<bitdepth, Pixel>::TestCompare2C() {
+ if (base_wiener_filter_func_ == nullptr) return;
+ if (target_wiener_filter_func_ == nullptr) return;
+ if (base_wiener_filter_func_ == target_wiener_filter_func_) return;
+ const Pixel* const src = src_ + kOffset;
+ Pixel* const dst = dst_ + kOffset;
+ Pixel* const tmp = tmp_ + kOffset;
+ for (const auto vertical_order : kWienerOrders) {
+ for (const auto horizontal_order : kWienerOrders) {
+ SetInputData(1, (1 << bitdepth) - 1, vertical_order, horizontal_order);
+ for (int x = 0; x < 2; ++x) {
+ // Prepare min/max filter coefficients.
+ int order = vertical_order;
+ for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+ auto& filter = restoration_info_.wiener_info.filter[i];
+ for (int j = 0; j < 3; ++j) {
+ filter[j] = (x == 0) ? kWienerTapsMin[j] : kWienerTapsMax[j];
+ }
+ CleanFilterByOrder(order, filter);
+ filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]);
+ restoration_info_.wiener_info.number_leading_zero_coefficients[i] =
+ (kWienerFilterTaps - order) / 2;
+ order = horizontal_order;
+ }
+ base_wiener_filter_func_(restoration_info_, src, kStride,
+ src - kRestorationVerticalBorder * kStride,
+ kStride, src + unit_height_ * kStride, kStride,
+ unit_width_, unit_height_,
+ &restoration_buffer_, dst);
+ target_wiener_filter_func_(restoration_info_, src, kStride,
+ src - kRestorationVerticalBorder * kStride,
+ kStride, src + unit_height_ * kStride,
+ kStride, unit_width_, unit_height_,
+ &restoration_buffer_, tmp);
+ if (!test_utils::CompareBlocks(dst, tmp, unit_width_, unit_height_,
+ kStride, kStride, false, true)) {
+ ADD_FAILURE() << "Mismatch -- wiener taps min/max";
+ }
+ }
+ }
+ }
+}
+
+using WienerFilterTest8bpp = WienerFilterTest<8, uint8_t>;
+
+TEST_P(WienerFilterTest8bpp, Correctness) {
+ TestFixedValues(0, 0);
+ TestFixedValues(1, 1);
+ TestFixedValues(2, 128);
+ TestFixedValues(3, 255);
+ TestRandomValues(false);
+}
+
+TEST_P(WienerFilterTest8bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+TEST_P(WienerFilterTest8bpp, TestCompare2C) { TestCompare2C(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, WienerFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WienerFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using WienerFilterTest10bpp = WienerFilterTest<10, uint16_t>;
+
+TEST_P(WienerFilterTest10bpp, Correctness) {
+ TestFixedValues(0, 0);
+ TestFixedValues(1, 1);
+ TestFixedValues(2, 512);
+ TestFixedValues(3, 1023);
+ TestRandomValues(false);
+}
+
+TEST_P(WienerFilterTest10bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+TEST_P(WienerFilterTest10bpp, TestCompare2C) { TestCompare2C(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest10bpp,
+ testing::ValuesIn(kUnitWidths));
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, WienerFilterTest10bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest10bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WienerFilterTest10bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using WienerFilterTest12bpp = WienerFilterTest<12, uint16_t>;
+
+TEST_P(WienerFilterTest12bpp, Correctness) {
+ TestFixedValues(0, 0);
+ TestFixedValues(1, 1);
+ TestFixedValues(2, 2048);
+ TestFixedValues(3, 4095);
+ TestRandomValues(false);
+}
+
+TEST_P(WienerFilterTest12bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+TEST_P(WienerFilterTest12bpp, TestCompare2C) { TestCompare2C(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest12bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
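+// Returns the mask value to use at subsampled position x. The mask is stored
+// at full resolution: with no subsampling the sample is used directly, with
+// horizontal subsampling two adjacent samples are averaged, and with 2D
+// subsampling the 2x2 neighborhood is averaged (both with rounding).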
+uint8_t GetMaskValue(const uint8_t* LIBGAV1_RESTRICT mask,
+ const uint8_t* LIBGAV1_RESTRICT mask_next_row, int x,
+ int subsampling_x, int subsampling_y) {
+ if ((subsampling_x | subsampling_y) == 0) {
+ return mask[x];
+ }
+ if (subsampling_x == 1 && subsampling_y == 0) {
+ return static_cast<uint8_t>(RightShiftWithRounding(
+ mask[MultiplyBy2(x)] + mask[MultiplyBy2(x) + 1], 1));
+ }
+ assert(subsampling_x == 1 && subsampling_y == 1);
+ return static_cast<uint8_t>(RightShiftWithRounding(
+ mask[MultiplyBy2(x)] + mask[MultiplyBy2(x) + 1] +
+ mask_next_row[MultiplyBy2(x)] + mask_next_row[MultiplyBy2(x) + 1],
+ 2));
+}
+
+template <int bitdepth, typename Pixel, bool is_inter_intra, int subsampling_x,
+ int subsampling_y>
+void MaskBlend_C(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT mask,
+ const ptrdiff_t mask_stride, const int width, const int height,
+ void* LIBGAV1_RESTRICT dest, const ptrdiff_t dest_stride) {
+ static_assert(!(bitdepth == 8 && is_inter_intra), "");
+ assert(mask != nullptr);
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+ const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+ auto* dst = static_cast<Pixel*>(dest);
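+  // dest_stride is given in bytes; convert it to Pixel units for indexing.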
+ const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+ constexpr int step_y = subsampling_y ? 2 : 1;
+ const uint8_t* mask_next_row = mask + mask_stride;
+ // 7.11.3.2 Rounding variables derivation process
+ // 2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
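+  // i.e. 14 - (3 + 7) = 4 for 8bpp/10bpp, and 14 - (5 + 7) = 2 for 12bpp.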
+ constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const uint8_t mask_value =
+ GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y);
+ if (is_inter_intra) {
+ dst[x] = static_cast<Pixel>(RightShiftWithRounding(
+ mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6));
+ } else {
+ assert(prediction_stride_1 == width);
+ int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6;
+ res -= (bitdepth == 8) ? 0 : kCompoundOffset;
+ dst[x] = static_cast<Pixel>(
+ Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ (1 << bitdepth) - 1));
+ }
+ }
+ dst += dst_stride;
+ mask += mask_stride * step_y;
+ mask_next_row += mask_stride * step_y;
+ pred_0 += width;
+ pred_1 += prediction_stride_1;
+ }
+}
+
+template <int subsampling_x, int subsampling_y>
+void InterIntraMaskBlend8bpp_C(const uint8_t* LIBGAV1_RESTRICT prediction_0,
+ uint8_t* LIBGAV1_RESTRICT prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT mask,
+ const ptrdiff_t mask_stride, const int width,
+ const int height) {
+ assert(mask != nullptr);
+ constexpr int step_y = subsampling_y ? 2 : 1;
+ const uint8_t* mask_next_row = mask + mask_stride;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const uint8_t mask_value =
+ GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y);
+ prediction_1[x] = static_cast<uint8_t>(RightShiftWithRounding(
+ mask_value * prediction_1[x] + (64 - mask_value) * prediction_0[x],
+ 6));
+ }
+ mask += mask_stride * step_y;
+ mask_next_row += mask_stride * step_y;
+ prediction_0 += width;
+ prediction_1 += prediction_stride_1;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->mask_blend[0][0] = MaskBlend_C<8, uint8_t, false, 0, 0>;
+ dsp->mask_blend[1][0] = MaskBlend_C<8, uint8_t, false, 1, 0>;
+ dsp->mask_blend[2][0] = MaskBlend_C<8, uint8_t, false, 1, 1>;
+ // The is_inter_intra index of mask_blend[][] is replaced by
+ // inter_intra_mask_blend_8bpp[] in 8-bit.
+ dsp->mask_blend[0][1] = nullptr;
+ dsp->mask_blend[1][1] = nullptr;
+ dsp->mask_blend[2][1] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_C<0, 0>;
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_C<1, 0>;
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend444
+ dsp->mask_blend[0][0] = MaskBlend_C<8, uint8_t, false, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend422
+ dsp->mask_blend[1][0] = MaskBlend_C<8, uint8_t, false, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend420
+ dsp->mask_blend[2][0] = MaskBlend_C<8, uint8_t, false, 1, 1>;
+#endif
+ // The is_inter_intra index of mask_blend[][] is replaced by
+ // inter_intra_mask_blend_8bpp[] in 8-bit.
+ dsp->mask_blend[0][1] = nullptr;
+ dsp->mask_blend[1][1] = nullptr;
+ dsp->mask_blend[2][1] = nullptr;
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_C<0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_C<1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>;
+#endif
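+  // Quiet the unused-function warning when all variants above are overridden.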
+ static_cast<void>(GetMaskValue);
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->mask_blend[0][0] = MaskBlend_C<10, uint16_t, false, 0, 0>;
+ dsp->mask_blend[1][0] = MaskBlend_C<10, uint16_t, false, 1, 0>;
+ dsp->mask_blend[2][0] = MaskBlend_C<10, uint16_t, false, 1, 1>;
+ dsp->mask_blend[0][1] = MaskBlend_C<10, uint16_t, true, 0, 0>;
+ dsp->mask_blend[1][1] = MaskBlend_C<10, uint16_t, true, 1, 0>;
+ dsp->mask_blend[2][1] = MaskBlend_C<10, uint16_t, true, 1, 1>;
+ // These are only used with 8-bit.
+ dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend444
+ dsp->mask_blend[0][0] = MaskBlend_C<10, uint16_t, false, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend422
+ dsp->mask_blend[1][0] = MaskBlend_C<10, uint16_t, false, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend420
+ dsp->mask_blend[2][0] = MaskBlend_C<10, uint16_t, false, 1, 1>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra444
+ dsp->mask_blend[0][1] = MaskBlend_C<10, uint16_t, true, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra422
+ dsp->mask_blend[1][1] = MaskBlend_C<10, uint16_t, true, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra420
+ dsp->mask_blend[2][1] = MaskBlend_C<10, uint16_t, true, 1, 1>;
+#endif
+ // These are only used with 8-bit.
+ dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->mask_blend[0][0] = MaskBlend_C<12, uint16_t, false, 0, 0>;
+ dsp->mask_blend[1][0] = MaskBlend_C<12, uint16_t, false, 1, 0>;
+ dsp->mask_blend[2][0] = MaskBlend_C<12, uint16_t, false, 1, 1>;
+ dsp->mask_blend[0][1] = MaskBlend_C<12, uint16_t, true, 0, 0>;
+ dsp->mask_blend[1][1] = MaskBlend_C<12, uint16_t, true, 1, 0>;
+ dsp->mask_blend[2][1] = MaskBlend_C<12, uint16_t, true, 1, 1>;
+ // These are only used with 8-bit.
+ dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_MaskBlend444
+ dsp->mask_blend[0][0] = MaskBlend_C<12, uint16_t, false, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlend422
+ dsp->mask_blend[1][0] = MaskBlend_C<12, uint16_t, false, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlend420
+ dsp->mask_blend[2][0] = MaskBlend_C<12, uint16_t, false, 1, 1>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra444
+ dsp->mask_blend[0][1] = MaskBlend_C<12, uint16_t, true, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra422
+ dsp->mask_blend[1][1] = MaskBlend_C<12, uint16_t, true, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra420
+ dsp->mask_blend[2][1] = MaskBlend_C<12, uint16_t, true, 1, 1>;
+#endif
+ // These are only used with 8-bit.
+ dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+void MaskBlendInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MASK_BLEND_H_
+#define LIBGAV1_SRC_DSP_MASK_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
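+//
+// A sketch of the pattern (the SSE4.1 define shown here is an assumption for
+// illustration): an optimized header announces its implementation with
+//   #define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+// and mask_blend.cc only installs the C version when the symbol is absent:
+//   #ifndef LIBGAV1_Dsp8bpp_MaskBlend444
+//   dsp->mask_blend[0][0] = MaskBlend_C<8, uint8_t, false, 0, 0>;
+//   #endif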
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/mask_blend_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order matters because each header tests for a more capable version
+// before setting the base define.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/mask_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend and Dsp::inter_intra_mask_blend_8bpp. This
+// function is not thread-safe.
+void MaskBlendInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_MASK_BLEND_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 50000;
+// mask_blend is applied to compound prediction values when is_inter_intra is
+// false, which implies a value range far exceeding that of pixel values. The
+// ranges include kCompoundOffset in 10bpp and 12bpp.
+// See src/dsp/convolve.cc and src/dsp/warp.cc.
+constexpr int kCompoundPredictionRange[3][2] = {
+ // 8bpp
+ {-5132, 9212},
+ // 10bpp
+ {3988, 61532},
+ // 12bpp
+ {3974, 61559},
+};
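+// These bounds drive the random compound prediction inputs generated in
+// MaskBlendTest::Test() below.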
+
+const char* GetDigest8bpp(int id) {
+ static const char* const kDigest[] = {
+ "4b70d5ef5ac7554b4b2660a4abe14a41", "64adb36f07e4a2c4ea4f05cfd715ff58",
+ "2cd162cebf99724a3fc22d501bd8c8e4", "c490478208374a43765900ef7115c264",
+ "b98f222eb70ef8589da2d6c839ca22b8", "54752ca05f67b5af571bc311aa4e3de3",
+ "5ae48814dd285bfca4f5ee8e339dca99", "383f3f4f47563f065d1b6068e5931a24",
+ "344b2dab7accd8bd0a255bee16207336", "0b2f6f755d1547eea7e0172f8133ea01",
+ "310dc6364fdacba186c01f0e8ac4fcb7", "c2ee4673078d34971319c77ca77b23d1",
+ "b0c9f08b73d9e5c16eaf5abdbca1fdc0", "eaad805999d949fa1e1bbbb63b4b7827",
+ "6eb2a80d212df89403efb50db7a81b08", "c30730aa799dba78a2ebd3f729af82c7",
+ "4346c2860b23f0072b6b288f14c1df36", "1cdace53543063e129a125c4084ca5d7",
+ "1ae5328e0c0f4f2bec640d1af03b2978", "3860e040fbee0c5f68f0b4af769209b3",
+ "e9480ded15d9c38ee19bf5fa816dd296", "4e17c222b64f428df29938a8120ca256",
+ "2a943bc6de9b29c8bcae189ad3bec276", "b5a6bc02c76fa61040678fb2c6c112d2",
+ "2c11bb9bd29c5577194edb77cfd1c614", "31ed1832810ae385f4ad8f57795dde1e",
+ "eb87d647839c33984dfb25bac0e7cdb3", "f652ec2b1478e35acb19cf28042ee849",
+ "0cfb18ac0cb94af1447bcac32ac20c36", "e152bbbf5ee4b40b7b41ec1f2e901aaa",
+ "f17f78fd485f7beafa8126c1cda801d7", "9f9fbee0cc9d99435efd3dff644be273",
+ "9b498843d66440c1e68dc7ab04f57d42", "2f2b0beceb31b79ccb9179991629e4b8",
+ "e06a6ebb6791529bb23fe5b0a9914220", "2b3d1ff19812a17c17b1be1f1727815e",
+ "d0bbdecec414950ed63a8a35c2bae397", "8e53906c6513058d7f17013fe0d32bf1",
+ "be0690efd31f0bf3c2adcd27ca011ed5", "c2b26243c5f147fdeadf52735aa68fb5",
+ "94bb83e774d9189c5ee04fb361855e19", "dad6441e723791a91f31a56b2136fd33",
+ "10ccac76a2debb842a0685a527b6a659", "346fb0a4914b64dda3ca0f521412b999",
+ "d7e400b855502bbb4f2b8294e207bb96", "3487503f2d73ec52f25b5e8d06c81da4",
+ "3f49c096acfcf46d44ce18b48debca7c", "8ed6a745a2b5457ac7f3ac145ce57e72",
+ "21f9dda5ef934a5ee6274b22cc22f93b", "507b60611afeb373384d9b7606f7ea46",
+ "ac766fadcdb85a47ad14a6846b9e5c36", "fde149bc2162e02bbc5fa85cc41641a5",
+ "f5f094b5742d0a920ba734b017452d24", "c90d06b0c76a0983bd1428df2a1b64b3",
+ "3649e6a6ed9f69e3f78e0b75160fb82a", "1d44b7649497e651216db50d325e3073",
+ "948fa112e90e3ca4d15f3d2f2acfab9a", "9bb54c0f7d07c0b44c44ba09379a04ff",
+ "228261ab6f098f489a8968cff1e1f7ae", "5e128db7462164f7327d1d8feeb2e4c7",
+ "9e8b97f6d9d482d5770b138bd1077747", "81563d505a4e8dd779a089abf2a28b77",
+ "b7157451de7cfa161dff1afd7f9b8622", "6a25cc0a4aaf8a315d1158dbb0ec2966",
+ "303867ee010ba51da485ee10149c6f9b", "63b64b7527d2476e9ae5139b8166e8c9",
+ "cfa93c2aeeb27a1190a445a6fee61e15", "804bcff8709665eed6830e24346101be",
+ "829947ed3e90776cda4ae82918461497", "1df10a1cb80c1a81f521e7e0f80b4f99",
+ "3c9593e42ac574f3555bb8511d438a54", "eecef71492c0626685815e646f728f79",
+ "0c43d59f456ddca2449e016ae4e34be7", "207d4ac2579f1271fc9eca8d743917b3",
+ "3c472bb0b1c891ffda19077ebb659e48", "a4ae7a0d25113bc0238fa27409f9c0dd",
+ "e8ad037ca81f46774bb01d20f46671ce", "b22741e4fe0e4062e40a2decec102ffd",
+ "c72f9e7bc0170163cb94da0faa0d3ffb", "accaf5d475d155cbd3a8c113f90718bc",
+ "2fd31e72444ea258380c16881580de81", "8a6a2a253f6f5b0ff75ba39488e6b082",
+ "c5e8159c0f3ebb7536e84ab3dadac1b3", "ef7ec20b46c7dcf16591835642bd68ef",
+ "0c3425399dc64870d726c2837666a55e", "0365029ffbfc4cedf3bf2d757ea5b9df",
+ "836aa403254af2e04d4b7a7c4db8bfc5", "7f2f3f9c91677b233795169f9a88b2b2",
+ "9fc8bbe787244dac638c367b9c611d13", "f66ef45fae8e163ab0f0f393531dad26",
+ "beb984e88b6f9b96ae6efe5da23ad16b", "1083b829ea766b1d4eb0bb96e9fb3bff",
+ "be8abad1da69e4d238a45fc02a0061cf",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+ static const char* const kDigest[] = {
+ "1af3cbd1616941b59e6a3f6a417b6312", "1d8b3f4b9d5d2f4ff5be8e81b7243121",
+ "e767350f150a84ac5a06dc348e815d62", "53a3a76bf2bcd5761cd15fc739a4f4e1",
+ "7597f69dc19a584280be0d67911db6a6", "e1221c172843dc6c1b345bcd370771cc",
+ "1a640c71ff9bb45505d89761f19efa8f", "e192f64322e0edb250b52f63aaa4de97",
+ "2ccbe012ca167114b14c3ba70befa960", "0f68632d7e5faddb4554ca430d1df822",
+ "8caa0061a26e142b783951d5abd7bf5d", "b01eeed3ec549e4a593100d9c5ba587a",
+ "1cce6acdbd8ca8d2546ba937584730bf", "022913e87a3c1a86aaefe2c2d4f89882",
+ "48f8ab636ba15a06731d869b603cbe58", "ba1616c990d224c20de123c3ccf19952",
+ "346a797b7cb4de10759e329f8b49e077", "d4929154275255f2d786d6fc42c7c5d3",
+ "18a6af6f36ca1ea4ab6f5a76505de040", "0c43e68414bfc02f9b20e796506f643b",
+ "9f483f543f6b1d58e23abf9337ed6fe6", "e114860c2538b63f1be4a23560420cdc",
+ "da8680798f96572c46155c7838b452c3", "20b47a27617297231843c0f2ed7b559b",
+ "16fa4a4f33a32e28c79da83dca63fd41", "76e2c1d3c323777a3c478e11e1ba6bf2",
+ "dccdfd52a71855cc4da18af52bda4c03", "121befbd6c246e85a34225241b8bcaf1",
+ "5780757555fd87ca1ff3f1b498a1d6e9", "6b0be2256285694b1edc0201608e1326",
+ "b7ef338c58d17f69426b5a99170c7295", "b92b84b5b3d01afac02fb9c092e84b06",
+ "e6ef7fea8b183f871c4306c4f49370c5", "c1bf95c05774d8471504e57a3efa66e4",
+ "bbacdbdafc625a139361ec22fe2cf003", "5fbbb2d6ca8fc6d07ca8d4105fda4a01",
+ "c1cbb295d9f00aa865d91a95e96f99b2", "1490e4f2c874a76ecc2bbf35dce446c3",
+ "c3bd73daaeec39895a8b64812773c93c", "6d385068ef3afbd821183d36851f709b",
+ "a34c52ef7f2fd04d1cd420238641ef48", "45d10029358c6835cf968a30605659ea",
+ "a72c1bb18cf9312c5713ce0de370743d", "df7368db2a7515a1c06a4c9dd9e32ebf",
+ "52782632271caccfa9a35ed7533e2052", "6f0ef9b62d2b9956a6464694b7a86b79",
+ "814dbc176f7201725a1cfd1cf668b4b9", "065ffbee984f4b9343c8acb0eb04fcbe",
+ "0915d76ce458d5164e3c90c1ce150795", "bf2b431d9bfa7a9925ea6f6509267ae9",
+ "d3df8c0c940a01b7bf3c3afb80b6dcd4", "15ab86216c9856a8427a51fe599258a3",
+ "2cb078484472c88e26b7401c9f11cf51", "7c5f68cc098c8adabc9e26f9cd549151",
+ "a8e47da1fcc91c2bc74d030892621576", "71af422ba2d86a401f8278591c0ef540",
+ "964c902bb4698ce82f4aa0a1edc80cd6", "78271c37d62af86576dab72ed59746b3",
+ "7247c3a7534a41137027e7d3f255f5ef", "8e529ab964f5f9d0f7c3ced98239cfc8",
+ "2481ed50bff6b36a3cac6dca2aca5ae5", "78a1ff18bf217d45f5170675dee26948",
+ "00fc534119c13aa7af4b818cad9218a2", "67501a83c93f2f9debfa86955bdffde5",
+ "2a512ef738e33a4d8476f72654deffb4", "f4eef28078bbc12de9cfb5bc2fef6238",
+ "b7ac3a35205a978bed587356155bae0e", "51ea101f09c4de2f754b61ab5aff1526",
+ "2bd689d7ec964ee8c8f6f0682f93f5ca", "eecac8dbdaa73b8b3c2234892c444147",
+ "cb7086f44ef70ef919086a3d200d8c13", "0abe35e3c796c2de1e550426b2b19441",
+ "0eb140561e1ea3843464a5247d8ecb18", "d908f7317f00daacbe3dd43495db64ad",
+ "d4d677c4b347de0a13ccab7bc16b8e6e", "26523c2c2df7f31896a3ae5aa24d5ada",
+ "0ebb9f816684769816b2ae0b1f94e3a4", "fd938d0577e3687b0a810e199f69f0bb",
+ "eb8fb832e72030e2aa214936ae0effe4", "56631887763f7daf6e1e73783e5ff656",
+ "590a25cc722c2aa4d885eede5ef09f20", "80944a218ed9b9b0374cde72914449eb",
+ "d9cbc2f1e0e56cdd6722310932db1981", "a88eb213b7a6767bbe639cda120a4ab6",
+ "9972ecbadfdf3ed0b3fedf435c5a804f", "01fdf7e22405a1b17a8d275b7451094f",
+ "6a7824e10406fade0d032e886bbc76b6", "76fefadd793ec3928e915d92782bc7e1",
+ "0fbd6b076752c9f5c926ca5c1df892ac", "aac9457239f07ad633fcd45c1465af2a",
+ "56823ef9a8e21c9c7441cc9ed870d648", "52f4c7a0b7177175302652cbc482f442",
+ "f4a4f4d7c8b93c0486cf3cbaa26fbc19",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+ static const char* const kDigest[] = {
+ "79a505b3877177197c94f0faeb0c9ec6", "cd22657d242f30c88bb83eae9efbbcce",
+ "c4c60a60976d119df3832ff6956e0181", "796bd78bf2346e8dfd61cecbf508ea0e",
+ "79e06cc6f880daf6cdb59b9b3a8efe1c", "f0643108e6b57bd566bc0d47b2dc64a1",
+ "8272a471e538ca469eaf5c997309589c", "3094741b63a29925da83dc1dc187654a",
+ "d0141df80f2335ed6051397cb2a5bc61", "33d9fd317b74f4572afbe004f991ca83",
+ "ea2413cd11bf1da93de9285381b471df", "c4f78ae2b994a3a999cb3f5dac2bb498",
+ "44804ec226453bc5f688506b56ad2a8a", "9de9c12a5f3bb8d4af13da8807dfe53f",
+ "c190dac15c08f2e591b222e1d75b60c2", "c46889b58b44d242e24b91ef531e9176",
+ "b6697e1256b60b3426a8980c7c6f9a80", "1e0eb156152fbb74b0cff41bdbdf98b5",
+ "98ab6c0abc45fd44565f84e66dc71133", "f2f2126fac1b7c0c7b7ff511c6f3c91e",
+ "0cc720e878cfa35f9b72762d08adb1bf", "6efee9ce87e098122dd05525f4c74a2f",
+ "187270514a93bd7065d2cfdb02146959", "947be7f2921b5a192d4296b2060a215c",
+ "42f02b046eda2a94133032184fdaa26d", "487e94b20867e7021dd1f10d477c3acf",
+ "9f9eac4394d8821f5c14857a28c5549b", "75d781b60c1f4aa44ceb6bc65f597a52",
+ "779f9ac3c01a86812964ccc38da2711a", "16dc8824efbd7a47808ccdbf8e37df56",
+ "e72899a8ddf6cc816e1917c25739a512", "96a4bcaedae79b55399d931fecd64312",
+ "5c5e8f4a4f0153315133e4e86a02c3a6", "d1c339b6f6cc0eabdd6674028e1f4260",
+ "4ef5868adaf6712d033dce9e51837c0b", "ed90a4ddfc463dddfe71314bc3415b4e",
+ "2312299492a47246269d6d37e67c8c0c", "56baf1c4453c5cf5ce3d6857cff4aa8f",
+ "d534ce3430377b355c3f59695cfb188b", "f40248f1a6fac4299c9645350138f598",
+ "f2e3cbbd066d9d28304667d82312d950", "e8a7784eb367b72b96486bec856b873c",
+ "02941ae2cf8272b353268a30cf9c2ee0", "8f6273a5fa62b9a4225ebdbf2ce44e27",
+ "85bb0aaba73fe8c89dcee6b5c55d5cfc", "c28c63a4e46ee2a98dd2b58379971c8c",
+ "4af35738c29d27ca9930a488bacdffe6", "34a419cc3e6ab21cf099d244169d253e",
+ "7c5b8d19ac8a81b37011fabac10143d0", "e582811e05def83270d8f65060fe8966",
+ "24662536326615a3c325409e780f65bf", "717a7f7e99d329a74391477ef3c6d738",
+ "e0f38a3dba4c6e060b6ca12a18d75fc2", "fbd0cba6a27eb06e74c5ed376187e05c",
+ "14dfb487c4a7e989629a195810b814ee", "3cf6d595317ec46e08f6eaa0f0e99b43",
+ "b3cb98c418ea854e433b612fc532bac5", "262206cee670c082361497e51cbd0f43",
+ "84c11b103a9b0a61f07493dcd269e6fd", "bd9bd9994057371252398bf52c7586f0",
+ "72e5537ba5f04fe17b7a371bd12ca0e2", "5986a20b406ceed273f9e41bc0c4c775",
+ "d5eb9ea00ce19079b49562ba4a8cb574", "3205e6f3c532a63f8d5d939fa46bc444",
+ "cfb21ac467f21954903948d4e6c9a2a1", "bd9fd6aab18bbba8096746f9ed35a640",
+ "d42ec4f13f042014c5b4af5f03d19034", "8a7fdee2b57ac641e03365625850f5d6",
+ "d18638521275b3aa9dd463d067d6a390", "a7a71c433d85576198b52608c99cab47",
+ "96e2a2443bf8cfe32d7590c5011c7523", "6fbe7cd83208937229c11a8e3be5e1e9",
+ "ecf66dac310e332a108be639171b5cf3", "327b1656c61d795c30a914f52e3d7629",
+ "157d26190bde1a6f34680708bff5d02e", "d927bba0073263a7914a4076a5edfe29",
+ "b88930ec68e5e49da8204ef21635cea2", "58e174ed0036b1ac1f5a9bdd44860222",
+ "415055dfa80c6fe7c12e4d16cac22168", "9058939bfb5998d6ecd71d87a52be893",
+ "847894efa35f1528732ec3584f62f86f", "8aa9b33c0d9695690cb4088c32f31214",
+ "11e28ab9a3192a2bc9ffd3fd0a466a13", "f246009c5efafd9310fa8e365d23cab4",
+ "2381fcd9ee0ffceba5509879d9f5709d", "1cf1dc7c7c6ecf1f3381455c99e2239e",
+ "e74601883b53791045f50bbcbbbcc803", "22926eecefa94f9f39b9bb9dbb183e5b",
+ "128c24f5a5342aebb21bdaa87907daf7", "11c39f844a2e51cc4c80ffe1afa58e70",
+ "2c0548cff2145031e304d8f97abfd751", "66e1a3daf84029341b999b18bf86e5b3",
+ "0f790f210d5366bbad7eb352b4909dd9",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+struct MaskBlendTestParam {
+ MaskBlendTestParam(BlockSize block_size, int subsampling_x, int subsampling_y,
+ bool is_inter_intra, bool is_wedge_inter_intra)
+ : block_size(block_size),
+ width(kBlockWidthPixels[block_size]),
+ height(kBlockHeightPixels[block_size]),
+ subsampling_x(subsampling_x),
+ subsampling_y(subsampling_y),
+ is_inter_intra(is_inter_intra),
+ is_wedge_inter_intra(is_wedge_inter_intra) {}
+ BlockSize block_size;
+ int width;
+ int height;
+ int subsampling_x;
+ int subsampling_y;
+ bool is_inter_intra;
+ bool is_wedge_inter_intra;
+};
+
+std::ostream& operator<<(std::ostream& os, const MaskBlendTestParam& param) {
+ return os << ToString(param.block_size)
+ << ", subsampling(x/y): " << param.subsampling_x << "/"
+ << param.subsampling_y
+ << ", is_inter_intra: " << param.is_inter_intra
+ << ", is_wedge_inter_intra: " << param.is_wedge_inter_intra;
+}
+
+template <int bitdepth, typename Pixel>
+class MaskBlendTest : public testing::TestWithParam<MaskBlendTestParam>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ MaskBlendTest() = default;
+ ~MaskBlendTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ MaskBlendInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ MaskBlendInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ MaskBlendInit_SSE4_1();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ func_ = (param_.is_inter_intra && !param_.is_wedge_inter_intra)
+ ? dsp->mask_blend[0][param_.is_inter_intra]
+ : dsp->mask_blend[param_.subsampling_x + param_.subsampling_y]
+ [param_.is_inter_intra];
+ func_8bpp_ = dsp->inter_intra_mask_blend_8bpp[param_.is_wedge_inter_intra
+ ? param_.subsampling_x +
+ param_.subsampling_y
+ : 0];
+ }
+
+ protected:
+ int GetDigestIdOffset() const {
+    // The id retrieves the corresponding digest from the lookup table for a
+    // given set of input parameters; it is derived from the block size plus
+    // a mode-dependent offset (id_offset).
+    // For example, in kMaskBlendTestParam the parameter set
+    // (kBlock8x8, 0, 0, false, false) corresponds to the first entry in the
+    // digest lookup table, where id == 0.
+    // (kBlock8x8, 1, 0, false, false) corresponds to id == 17.
+    // (kBlock8x8, 1, 1, false, false) corresponds to id == 34.
+    // (kBlock8x8, 0, 0, true, false) corresponds to id == 51.
+    // id_offset is the offset for each mode combination (is_inter_intra,
+    // is_wedge_inter_intra).
+    // ...
+ if (!param_.is_inter_intra && !param_.is_wedge_inter_intra) {
+ return param_.subsampling_x * 17 + param_.subsampling_y * 17;
+ }
+ if (param_.is_inter_intra && !param_.is_wedge_inter_intra) {
+ return 51 + param_.subsampling_x * 7 + param_.subsampling_y * 7;
+ }
+ if (param_.is_inter_intra && param_.is_wedge_inter_intra) {
+ return 72 + param_.subsampling_x * 7 + param_.subsampling_y * 7;
+ }
+ return 0;
+ }
+
+ int GetDigestId() const {
+ // Only 8x8 and larger blocks are tested.
+ int block_size_adjustment =
+ static_cast<int>(param_.block_size > kBlock16x4);
+ if (param_.is_inter_intra || param_.is_wedge_inter_intra) {
+ // 4:1/1:4 blocks are invalid for these modes.
+ block_size_adjustment += static_cast<int>(param_.block_size > kBlock8x32);
+ block_size_adjustment +=
+ static_cast<int>(param_.block_size > kBlock16x64);
+ block_size_adjustment += static_cast<int>(param_.block_size > kBlock32x8);
+ block_size_adjustment +=
+ static_cast<int>(param_.block_size > kBlock64x16);
+ }
+ return GetDigestIdOffset() + param_.block_size - kBlock8x8 -
+ block_size_adjustment;
+ }
+
+ void Test(const char* digest, int num_runs);
+
+ private:
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ static constexpr int kStride = kMaxSuperBlockSizeInPixels;
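+  // dest_ is a byte buffer, so kDestStride is expressed in bytes, not pixels.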
+ static constexpr int kDestStride = kMaxSuperBlockSizeInPixels * sizeof(Pixel);
+ const MaskBlendTestParam param_ = GetParam();
+ alignas(kMaxAlignment) PredType
+ source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {};
+ uint8_t source1_8bpp_[kMaxSuperBlockSizeInPixels *
+ kMaxSuperBlockSizeInPixels] = {};
+ alignas(kMaxAlignment) PredType
+ source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {};
+ uint8_t source2_8bpp_[kMaxSuperBlockSizeInPixels *
+ kMaxSuperBlockSizeInPixels] = {};
+ uint8_t source2_8bpp_cache_[kMaxSuperBlockSizeInPixels *
+ kMaxSuperBlockSizeInPixels] = {};
+ uint8_t mask_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+ uint8_t dest_[sizeof(Pixel) * kMaxSuperBlockSizeInPixels *
+ kMaxSuperBlockSizeInPixels] = {};
+ dsp::MaskBlendFunc func_;
+ dsp::InterIntraMaskBlendFunc8bpp func_8bpp_;
+};
+
+template <int bitdepth, typename Pixel>
+void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest,
+ const int num_runs) {
+ if (func_ == nullptr && func_8bpp_ == nullptr) return;
+ const int width = param_.width >> param_.subsampling_x;
+ const int height = param_.height >> param_.subsampling_y;
+
+  // Add the id offset to the seed so that each parameter set gets distinct
+  // input blocks. If the same seed were used for every block size, the
+  // generated inputs would repeat: an 8x8 input, for example, would be built
+  // from exactly the same random sequence as the 16x16 block.
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() +
+ GetDigestIdOffset());
+ PredType* src_1 = source1_;
+ uint8_t* src_1_8bpp = source1_8bpp_;
+ PredType* src_2 = source2_;
+ uint8_t* src_2_8bpp = source2_8bpp_;
+ const ptrdiff_t src_2_stride = param_.is_inter_intra ? kStride : width;
+ const ptrdiff_t mask_stride = param_.width;
+ uint8_t* mask_row = mask_;
+ const int range_mask = (1 << (bitdepth)) - 1;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ src_1[x] = static_cast<PredType>(rnd.Rand16() & range_mask);
+ src_2[x] = static_cast<PredType>(rnd.Rand16() & range_mask);
+ if (param_.is_inter_intra && bitdepth == 8) {
+ src_1_8bpp[x] = src_1[x];
+ src_2_8bpp[x] = src_2[x];
+ }
+ if (!param_.is_inter_intra) {
+ // Implies isCompound == true.
+ constexpr int bitdepth_index = (bitdepth - 8) >> 1;
+ const int min_val = kCompoundPredictionRange[bitdepth_index][0];
+ const int max_val = kCompoundPredictionRange[bitdepth_index][1];
+ src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ }
+ }
+ src_1 += width;
+ src_1_8bpp += width;
+ src_2 += src_2_stride;
+ src_2_8bpp += src_2_stride;
+ }
+  // The mask is set up at full resolution regardless of subsampling.
+ for (int y = 0; y < param_.height; ++y) {
+ for (int x = 0; x < param_.width; ++x) {
+ mask_row[x] = rnd.Rand8() & 63;
+ mask_row[x] += rnd.Rand8() & 1; // Range of mask is [0, 64].
+ }
+ mask_row += mask_stride;
+ }
+
+ absl::Duration elapsed_time;
+ for (int i = 0; i < num_runs; ++i) {
+ const absl::Time start = absl::Now();
+ if (param_.is_inter_intra && bitdepth == 8) {
+ ASSERT_EQ(func_, nullptr);
+ static_assert(sizeof(source2_8bpp_cache_) == sizeof(source2_8bpp_), "");
+ // source2_8bpp_ is modified in the call.
+ memcpy(source2_8bpp_cache_, source2_8bpp_, sizeof(source2_8bpp_));
+ func_8bpp_(source1_8bpp_, source2_8bpp_, src_2_stride, mask_, mask_stride,
+ width, height);
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ dest_[y * kDestStride + x] = source2_8bpp_[y * src_2_stride + x];
+ }
+ }
+ memcpy(source2_8bpp_, source2_8bpp_cache_, sizeof(source2_8bpp_));
+ } else {
+ if (bitdepth != 8) {
+ ASSERT_EQ(func_8bpp_, nullptr);
+ }
+ ASSERT_NE(func_, nullptr);
+ func_(source1_, source2_, src_2_stride, mask_, mask_stride, width, height,
+ dest_, kDestStride);
+ }
+ elapsed_time += absl::Now() - start;
+ }
+
+ test_utils::CheckMd5Digest("MaskBlend", ToString(param_.block_size), digest,
+ dest_, sizeof(dest_), elapsed_time);
+}
+
+const MaskBlendTestParam kMaskBlendTestParam[] = {
+ // is_inter_intra = false, is_wedge_inter_intra = false.
+ // block size range is from 8x8 to 128x128.
+ MaskBlendTestParam(kBlock8x8, 0, 0, false, false),
+ MaskBlendTestParam(kBlock8x16, 0, 0, false, false),
+ MaskBlendTestParam(kBlock8x32, 0, 0, false, false),
+ MaskBlendTestParam(kBlock16x8, 0, 0, false, false),
+ MaskBlendTestParam(kBlock16x16, 0, 0, false, false),
+ MaskBlendTestParam(kBlock16x32, 0, 0, false, false),
+ MaskBlendTestParam(kBlock16x64, 0, 0, false, false),
+ MaskBlendTestParam(kBlock32x8, 0, 0, false, false),
+ MaskBlendTestParam(kBlock32x16, 0, 0, false, false),
+ MaskBlendTestParam(kBlock32x32, 0, 0, false, false),
+ MaskBlendTestParam(kBlock32x64, 0, 0, false, false),
+ MaskBlendTestParam(kBlock64x16, 0, 0, false, false),
+ MaskBlendTestParam(kBlock64x32, 0, 0, false, false),
+ MaskBlendTestParam(kBlock64x64, 0, 0, false, false),
+ MaskBlendTestParam(kBlock64x128, 0, 0, false, false),
+ MaskBlendTestParam(kBlock128x64, 0, 0, false, false),
+ MaskBlendTestParam(kBlock128x128, 0, 0, false, false),
+ MaskBlendTestParam(kBlock8x8, 1, 0, false, false),
+ MaskBlendTestParam(kBlock8x16, 1, 0, false, false),
+ MaskBlendTestParam(kBlock8x32, 1, 0, false, false),
+ MaskBlendTestParam(kBlock16x8, 1, 0, false, false),
+ MaskBlendTestParam(kBlock16x16, 1, 0, false, false),
+ MaskBlendTestParam(kBlock16x32, 1, 0, false, false),
+ MaskBlendTestParam(kBlock16x64, 1, 0, false, false),
+ MaskBlendTestParam(kBlock32x8, 1, 0, false, false),
+ MaskBlendTestParam(kBlock32x16, 1, 0, false, false),
+ MaskBlendTestParam(kBlock32x32, 1, 0, false, false),
+ MaskBlendTestParam(kBlock32x64, 1, 0, false, false),
+ MaskBlendTestParam(kBlock64x16, 1, 0, false, false),
+ MaskBlendTestParam(kBlock64x32, 1, 0, false, false),
+ MaskBlendTestParam(kBlock64x64, 1, 0, false, false),
+ MaskBlendTestParam(kBlock64x128, 1, 0, false, false),
+ MaskBlendTestParam(kBlock128x64, 1, 0, false, false),
+ MaskBlendTestParam(kBlock128x128, 1, 0, false, false),
+ MaskBlendTestParam(kBlock8x8, 1, 1, false, false),
+ MaskBlendTestParam(kBlock8x16, 1, 1, false, false),
+ MaskBlendTestParam(kBlock8x32, 1, 1, false, false),
+ MaskBlendTestParam(kBlock16x8, 1, 1, false, false),
+ MaskBlendTestParam(kBlock16x16, 1, 1, false, false),
+ MaskBlendTestParam(kBlock16x32, 1, 1, false, false),
+ MaskBlendTestParam(kBlock16x64, 1, 1, false, false),
+ MaskBlendTestParam(kBlock32x8, 1, 1, false, false),
+ MaskBlendTestParam(kBlock32x16, 1, 1, false, false),
+ MaskBlendTestParam(kBlock32x32, 1, 1, false, false),
+ MaskBlendTestParam(kBlock32x64, 1, 1, false, false),
+ MaskBlendTestParam(kBlock64x16, 1, 1, false, false),
+ MaskBlendTestParam(kBlock64x32, 1, 1, false, false),
+ MaskBlendTestParam(kBlock64x64, 1, 1, false, false),
+ MaskBlendTestParam(kBlock64x128, 1, 1, false, false),
+ MaskBlendTestParam(kBlock128x64, 1, 1, false, false),
+ MaskBlendTestParam(kBlock128x128, 1, 1, false, false),
+ // is_inter_intra = true, is_wedge_inter_intra = false.
+ // The block size range is from 8x8 to 32x32 (no 4:1/1:4 blocks, Section
+ // 5.11.28 Read inter intra syntax).
+ MaskBlendTestParam(kBlock8x8, 0, 0, true, false),
+ MaskBlendTestParam(kBlock8x16, 0, 0, true, false),
+ MaskBlendTestParam(kBlock16x8, 0, 0, true, false),
+ MaskBlendTestParam(kBlock16x16, 0, 0, true, false),
+ MaskBlendTestParam(kBlock16x32, 0, 0, true, false),
+ MaskBlendTestParam(kBlock32x16, 0, 0, true, false),
+ MaskBlendTestParam(kBlock32x32, 0, 0, true, false),
+ MaskBlendTestParam(kBlock8x8, 1, 0, true, false),
+ MaskBlendTestParam(kBlock8x16, 1, 0, true, false),
+ MaskBlendTestParam(kBlock16x8, 1, 0, true, false),
+ MaskBlendTestParam(kBlock16x16, 1, 0, true, false),
+ MaskBlendTestParam(kBlock16x32, 1, 0, true, false),
+ MaskBlendTestParam(kBlock32x16, 1, 0, true, false),
+ MaskBlendTestParam(kBlock32x32, 1, 0, true, false),
+ MaskBlendTestParam(kBlock8x8, 1, 1, true, false),
+ MaskBlendTestParam(kBlock8x16, 1, 1, true, false),
+ MaskBlendTestParam(kBlock16x8, 1, 1, true, false),
+ MaskBlendTestParam(kBlock16x16, 1, 1, true, false),
+ MaskBlendTestParam(kBlock16x32, 1, 1, true, false),
+ MaskBlendTestParam(kBlock32x16, 1, 1, true, false),
+ MaskBlendTestParam(kBlock32x32, 1, 1, true, false),
+ // is_inter_intra = true, is_wedge_inter_intra = true.
+ // The block size range is from 8x8 to 32x32 (no 4:1/1:4 blocks, Section
+ // 5.11.28 Read inter intra syntax).
+ MaskBlendTestParam(kBlock8x8, 0, 0, true, true),
+ MaskBlendTestParam(kBlock8x16, 0, 0, true, true),
+ MaskBlendTestParam(kBlock16x8, 0, 0, true, true),
+ MaskBlendTestParam(kBlock16x16, 0, 0, true, true),
+ MaskBlendTestParam(kBlock16x32, 0, 0, true, true),
+ MaskBlendTestParam(kBlock32x16, 0, 0, true, true),
+ MaskBlendTestParam(kBlock32x32, 0, 0, true, true),
+ MaskBlendTestParam(kBlock8x8, 1, 0, true, true),
+ MaskBlendTestParam(kBlock8x16, 1, 0, true, true),
+ MaskBlendTestParam(kBlock16x8, 1, 0, true, true),
+ MaskBlendTestParam(kBlock16x16, 1, 0, true, true),
+ MaskBlendTestParam(kBlock16x32, 1, 0, true, true),
+ MaskBlendTestParam(kBlock32x16, 1, 0, true, true),
+ MaskBlendTestParam(kBlock32x32, 1, 0, true, true),
+ MaskBlendTestParam(kBlock8x8, 1, 1, true, true),
+ MaskBlendTestParam(kBlock8x16, 1, 1, true, true),
+ MaskBlendTestParam(kBlock16x8, 1, 1, true, true),
+ MaskBlendTestParam(kBlock16x16, 1, 1, true, true),
+ MaskBlendTestParam(kBlock16x32, 1, 1, true, true),
+ MaskBlendTestParam(kBlock32x16, 1, 1, true, true),
+ MaskBlendTestParam(kBlock32x32, 1, 1, true, true),
+};
+
+using MaskBlendTest8bpp = MaskBlendTest<8, uint8_t>;
+
+TEST_P(MaskBlendTest8bpp, Blending) { Test(GetDigest8bpp(GetDigestId()), 1); }
+
+TEST_P(MaskBlendTest8bpp, DISABLED_Speed) {
+ Test(GetDigest8bpp(GetDigestId()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest8bpp,
+ testing::ValuesIn(kMaskBlendTestParam));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, MaskBlendTest8bpp,
+ testing::ValuesIn(kMaskBlendTestParam));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, MaskBlendTest8bpp,
+ testing::ValuesIn(kMaskBlendTestParam));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using MaskBlendTest10bpp = MaskBlendTest<10, uint16_t>;
+
+TEST_P(MaskBlendTest10bpp, Blending) { Test(GetDigest10bpp(GetDigestId()), 1); }
+
+TEST_P(MaskBlendTest10bpp, DISABLED_Speed) {
+ Test(GetDigest10bpp(GetDigestId()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest10bpp,
+ testing::ValuesIn(kMaskBlendTestParam));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, MaskBlendTest10bpp,
+ testing::ValuesIn(kMaskBlendTestParam));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, MaskBlendTest10bpp,
+ testing::ValuesIn(kMaskBlendTestParam));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using MaskBlendTest12bpp = MaskBlendTest<12, uint16_t>;
+
+TEST_P(MaskBlendTest12bpp, Blending) { Test(GetDigest12bpp(GetDigestId()), 1); }
+
+TEST_P(MaskBlendTest12bpp, DISABLED_Speed) {
+ Test(GetDigest12bpp(GetDigestId()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest12bpp,
+ testing::ValuesIn(kMaskBlendTestParam));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Silence unused function warnings when MotionFieldProjectionKernel_C is
+// not used.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel)
+
+// 7.9.2.
+void MotionFieldProjectionKernel_C(const ReferenceInfo& reference_info,
+ int reference_to_current_with_sign,
+ int dst_sign, int y8_start, int y8_end,
+ int x8_start, int x8_end,
+ TemporalMotionField* motion_field) {
+ const ptrdiff_t stride = motion_field->mv.columns();
+ // The column range has to be widened by kProjectionMvMaxHorizontalOffset on
+ // each side, since coordinates in the widened range can project into
+ // [x8_start, x8_end) as position_x8.
+ const int adjusted_x8_start =
+ std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+ const int adjusted_x8_end = std::min(
+ x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+ const int8_t* const reference_offsets =
+ reference_info.relative_distance_to.data();
+ const bool* const skip_references = reference_info.skip_references.data();
+ const int16_t* const projection_divisions =
+ reference_info.projection_divisions.data();
+ const ReferenceFrameType* source_reference_types =
+ &reference_info.motion_field_reference_frame[y8_start][0];
+ const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+ int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+ MotionVector* dst_mv = motion_field->mv[y8_start];
+ assert(stride == motion_field->reference_offset.columns());
+ assert((y8_start & 7) == 0);
+
+ int y8 = y8_start;
+ do {
+ const int y8_floor = (y8 & ~7) - y8;
+ const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8);
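+ // These bound the projected row offset (relative to y8) to the current
+ // 8-row group; y8_floor is in [-7, 0].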
+ int x8 = adjusted_x8_start;
+ do {
+ const int source_reference_type = source_reference_types[x8];
+ if (skip_references[source_reference_type]) continue;
+ MotionVector projection_mv;
+ // reference_to_current_with_sign could be 0.
+ GetMvProjection(mv[x8], reference_to_current_with_sign,
+ projection_divisions[source_reference_type],
+ &projection_mv);
+ // Do not update the motion vector if the block position is not valid or
+ // if position_x8 is outside the current range of x8_start and x8_end.
+ // Note that position_y8 will always be within the range of y8_start and
+ // y8_end.
+ const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+ if (position_y8 < y8_floor || position_y8 >= y8_ceiling) continue;
+ const int x8_base = x8 & ~7;
+ const int x8_floor =
+ std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+ const int x8_ceiling =
+ std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+ const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+ if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+ dst_mv[position_y8 * stride + position_x8] = mv[x8];
+ dst_reference_offset[position_y8 * stride + position_x8] =
+ reference_offsets[source_reference_type];
+ } while (++x8 < adjusted_x8_end);
+ source_reference_types += stride;
+ mv += stride;
+ dst_reference_offset += stride;
+ dst_mv += stride;
+ } while (++y8 < y8_end);
+}
+
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+ // !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel)
+
+} // namespace
+
+void MotionFieldProjectionInit_C() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel)
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_C;
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
+#define LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/motion_field_projection_neon.h"
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each tests for a superior version
+// before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/motion_field_projection_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/types.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMotionFieldWidth = 160;
+constexpr int kMotionFieldHeight = 120;
+
+// The 'int' parameter is unused but required to allow for instantiations of C,
+// NEON, etc.
+class MotionFieldProjectionTest : public testing::TestWithParam<int> {
+ public:
+ MotionFieldProjectionTest() = default;
+ MotionFieldProjectionTest(const MotionFieldProjectionTest&) = delete;
+ MotionFieldProjectionTest& operator=(const MotionFieldProjectionTest&) =
+ delete;
+ ~MotionFieldProjectionTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(8);
+ MotionFieldProjectionInit_C();
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ MotionFieldProjectionInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ MotionFieldProjectionInit_SSE4_1();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ const Dsp* const dsp = GetDspTable(8);
+ ASSERT_NE(dsp, nullptr);
+ target_motion_field_projection_kernel_func_ =
+ dsp->motion_field_projection_kernel;
+ }
+
+ void SetInputData(int motion_field_width, libvpx_test::ACMRandom* rnd);
+ void TestRandomValues(bool speed);
+
+ private:
+ MotionFieldProjectionKernelFunc target_motion_field_projection_kernel_func_;
+ ReferenceInfo reference_info_;
+ TemporalMotionField motion_field_;
+};
+
+void MotionFieldProjectionTest::SetInputData(
+ const int motion_field_width, libvpx_test::ACMRandom* const rnd) {
+ ASSERT_TRUE(reference_info_.Reset(kMotionFieldHeight, motion_field_width));
+ ASSERT_TRUE(motion_field_.mv.Reset(kMotionFieldHeight, motion_field_width,
+ /*zero_initialize=*/false));
+ ASSERT_TRUE(motion_field_.reference_offset.Reset(kMotionFieldHeight,
+ motion_field_width,
+ /*zero_initialize=*/false));
+ constexpr int order_hint_bits = 6;
+ unsigned int order_hint_shift_bits = Mod32(32 - order_hint_bits);
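+ // With 6 order hint bits the shift is 26, so GetRelativeDistance()
+ // interprets hint differences modulo 64 as signed values in [-32, 31].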
+ const unsigned int current_frame_order_hint =
+ rnd->Rand8() & ((1 << order_hint_bits) - 1); // [0, 63]
+ uint8_t reference_frame_order_hint = 0;
+ reference_info_.relative_distance_to[0] = 0;
+ reference_info_.skip_references[kReferenceFrameIntra] = true;
+ reference_info_.projection_divisions[kReferenceFrameIntra] = 0;
+ for (int i = kReferenceFrameLast; i < kNumReferenceFrameTypes; ++i) {
+ reference_frame_order_hint =
+ rnd->Rand8() & ((1 << order_hint_bits) - 1); // [0, 63]
+ const int relative_distance_to =
+ GetRelativeDistance(current_frame_order_hint,
+ reference_frame_order_hint, order_hint_shift_bits);
+ reference_info_.relative_distance_to[i] = relative_distance_to;
+ reference_info_.skip_references[i] =
+ relative_distance_to > kMaxFrameDistance || relative_distance_to <= 0;
+ reference_info_.projection_divisions[i] =
+ reference_info_.skip_references[i]
+ ? 0
+ : kProjectionMvDivisionLookup[relative_distance_to];
+ }
+ for (int y = 0; y < kMotionFieldHeight; ++y) {
+ for (int x = 0; x < motion_field_width; ++x) {
+ reference_info_.motion_field_reference_frame[y][x] =
+ static_cast<ReferenceFrameType>(rnd->Rand16() &
+ kReferenceFrameAlternate);
+ reference_info_.motion_field_mv[y][x].mv[0] = rnd->Rand16Signed() / 512;
+ reference_info_.motion_field_mv[y][x].mv[1] = rnd->Rand16Signed() / 512;
+ }
+ }
+ MotionVector invalid_mv;
+ invalid_mv.mv[0] = kInvalidMvValue;
+ invalid_mv.mv[1] = kInvalidMvValue;
+ MotionVector* const motion_field_mv = &motion_field_.mv[0][0];
+ int8_t* const motion_field_reference_offset =
+ &motion_field_.reference_offset[0][0];
+ std::fill(motion_field_mv, motion_field_mv + motion_field_.mv.size(),
+ invalid_mv);
+ std::fill(
+ motion_field_reference_offset,
+ motion_field_reference_offset + motion_field_.reference_offset.size(),
+ -128);
+}
+
+void MotionFieldProjectionTest::TestRandomValues(bool speed) {
+ static const char* const kDigestMv[8] = {
+ "87c2a74538f5c015809492ac2e521075", "ba7b4a5d82c6083b13a5b02eb7655ab7",
+ "8c37d96bf1744d5553860bf44a4f60a3", "720aa644f85e48995db9785e87cd02e3",
+ "9289c0c66524bb77a605870d78285f35", "f0326509885c2b2c89feeac53698cd47",
+ "6b9ad1d672dec825cb1803063d35badc", "dfe06c57cc9c70d27246df7fd0afa0b2"};
+ static const char* const kDigestReferenceOffset[8] = {
+ "d8d1384268d7cf5c4514b39c329f94fb", "7f30e79ceb064befbad64a20d206a540",
+ "61e2eb5644edbd3a91b939403edc891e", "7a018f1bf88193e86934241af445dc36",
+ "2d6166bf8bbe1db77baf687ecf71d028", "95fee61f0219e06076d6f0e1073b1a4e",
+ "64d0a63751267bdc573cab761f1fe685", "906a99e0e791dbcb9183c9b68ecc4ea3"};
+ const int num_tests = speed ? 2000 : 1;
+ if (target_motion_field_projection_kernel_func_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ for (int width_idx = 0; width_idx < 8; ++width_idx) {
+ const int motion_field_width = kMotionFieldWidth + width_idx;
+ SetInputData(motion_field_width, &rnd);
+ const int dst_sign = ((rnd.Rand16() & 1) != 0) ? 0 : -1;
+ const int reference_to_current_with_sign =
+ rnd.PseudoUniform(2 * kMaxFrameDistance + 1) - kMaxFrameDistance;
+ assert(std::abs(reference_to_current_with_sign) <= kMaxFrameDistance);
+ // The y8 and x8 step is at least 16; the last block may be smaller.
+ for (int step = 16; step <= 80; step += 16) {
+ const absl::Time start = absl::Now();
+ for (int k = 0; k < num_tests; ++k) {
+ for (int y8 = 0; y8 < kMotionFieldHeight; y8 += step) {
+ const int y8_end = std::min(y8 + step, kMotionFieldHeight);
+ for (int x8 = 0; x8 < motion_field_width; x8 += step) {
+ const int x8_end = std::min(x8 + step, motion_field_width);
+ target_motion_field_projection_kernel_func_(
+ reference_info_, reference_to_current_with_sign, dst_sign, y8,
+ y8_end, x8, x8_end, &motion_field_);
+ }
+ }
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(
+ "MotionFieldProjectionKernel",
+ absl::StrFormat("(mv) width %d step %d", motion_field_width, step)
+ .c_str(),
+ kDigestMv[width_idx], motion_field_.mv[0],
+ sizeof(motion_field_.mv[0][0]) * motion_field_.mv.size(),
+ elapsed_time);
+ test_utils::CheckMd5Digest(
+ "MotionFieldProjectionKernel",
+ absl::StrFormat("(ref offset) width %d step %d", motion_field_width,
+ step)
+ .c_str(),
+ kDigestReferenceOffset[width_idx], motion_field_.reference_offset[0],
+ sizeof(motion_field_.reference_offset[0][0]) *
+ motion_field_.reference_offset.size(),
+ elapsed_time);
+ }
+ }
+}
+
+TEST_P(MotionFieldProjectionTest, Correctness) { TestRandomValues(false); }
+
+TEST_P(MotionFieldProjectionTest, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, MotionFieldProjectionTest, testing::Values(0));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, MotionFieldProjectionTest, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, MotionFieldProjectionTest, testing::Values(0));
+#endif
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Silence unused function warnings when the C functions are not used.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch)
+
+void MvProjectionCompoundLowPrecision_C(
+ const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
+ // To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ int index = 0;
+ do {
+ candidate_mvs[index].mv64 = 0;
+ for (int i = 0; i < 2; ++i) {
+ // The |offsets| non-zero check is usually true and could be skipped.
+ if (offsets[i] != 0) {
+ GetMvProjection(
+ temporal_mvs[index], offsets[i],
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index].mv[i]);
+ for (auto& mv : candidate_mvs[index].mv[i].mv) {
+ // The next line is equivalent to:
+ // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
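+ // (mv >> 15) is 0 for non-negative and -1 for negative values; the +1
+ // bias for negatives makes clearing the LSB round odd values toward zero
+ // for both signs.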
+ mv = (mv - (mv >> 15)) & ~1;
+ }
+ }
+ }
+ } while (++index < count);
+}
+
+void MvProjectionCompoundForceInteger_C(
+ const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
+ // To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ int index = 0;
+ do {
+ candidate_mvs[index].mv64 = 0;
+ for (int i = 0; i < 2; ++i) {
+ // The |offsets| non-zero check is usually true and could be skipped.
+ if (offsets[i] != 0) {
+ GetMvProjection(
+ temporal_mvs[index], offsets[i],
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index].mv[i]);
+ for (auto& mv : candidate_mvs[index].mv[i].mv) {
+ // The next line is equivalent to:
+ // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+ // const int sign = mv >> 15;
+ // mv = ApplySign(value, sign);
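+ // Negative values get an extra +1 bias from -(mv >> 15), mirroring the
+ // rounding of the absolute value, so the result is a multiple of 8 with
+ // the original sign.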
+ mv = (mv + 3 - (mv >> 15)) & ~7;
+ }
+ }
+ }
+ } while (++index < count);
+}
+
+void MvProjectionCompoundHighPrecision_C(
+ const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
+ // To help the compiler, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ int index = 0;
+ do {
+ candidate_mvs[index].mv64 = 0;
+ for (int i = 0; i < 2; ++i) {
+ // The |offsets| non-zero check is usually true and could be skipped.
+ if (offsets[i] != 0) {
+ GetMvProjection(
+ temporal_mvs[index], offsets[i],
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index].mv[i]);
+ }
+ }
+ } while (++index < count);
+}
+
+void MvProjectionSingleLowPrecision_C(
+ const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+ const int reference_offset, const int count,
+ MotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
+ int index = 0;
+ do {
+ GetMvProjection(
+ temporal_mvs[index], reference_offset,
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index]);
+ for (auto& mv : candidate_mvs[index].mv) {
+ // The next line is equivalent to:
+ // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
+ mv = (mv - (mv >> 15)) & ~1;
+ }
+ } while (++index < count);
+}
+
+void MvProjectionSingleForceInteger_C(
+ const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+ const int reference_offset, const int count,
+ MotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
+ int index = 0;
+ do {
+ GetMvProjection(
+ temporal_mvs[index], reference_offset,
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index]);
+ for (auto& mv : candidate_mvs[index].mv) {
+ // The next line is equivalent to:
+ // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+ // const int sign = mv >> 15;
+ // mv = ApplySign(value, sign);
+ mv = (mv + 3 - (mv >> 15)) & ~7;
+ }
+ } while (++index < count);
+}
+
+void MvProjectionSingleHighPrecision_C(
+ const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+ const int reference_offset, const int count,
+ MotionVector* LIBGAV1_RESTRICT const candidate_mvs) {
+ int index = 0;
+ do {
+ GetMvProjection(
+ temporal_mvs[index], reference_offset,
+ kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+ &candidate_mvs[index]);
+ } while (++index < count);
+}
+
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+ // !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch)
+
+} // namespace
+
+void MotionVectorSearchInit_C() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch)
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_C;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_C;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_C;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_C;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_C;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_C;
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
+#define LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/motion_vector_search_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each tests for a superior version
+// before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/motion_vector_search_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+
+#include <cstdint>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// The 'int' parameter is unused but required to allow for instantiations of C,
+// NEON, etc.
+class MotionVectorSearchTest : public testing::TestWithParam<int>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ MotionVectorSearchTest() = default;
+ MotionVectorSearchTest(const MotionVectorSearchTest&) = delete;
+ MotionVectorSearchTest& operator=(const MotionVectorSearchTest&) = delete;
+ ~MotionVectorSearchTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(8);
+ MotionVectorSearchInit_C();
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ MotionVectorSearchInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ MotionVectorSearchInit_SSE4_1();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ const Dsp* const dsp = GetDspTable(8);
+ ASSERT_NE(dsp, nullptr);
+ mv_projection_compound_[0] = dsp->mv_projection_compound[0];
+ mv_projection_compound_[1] = dsp->mv_projection_compound[1];
+ mv_projection_compound_[2] = dsp->mv_projection_compound[2];
+ mv_projection_single_[0] = dsp->mv_projection_single[0];
+ mv_projection_single_[1] = dsp->mv_projection_single[1];
+ mv_projection_single_[2] = dsp->mv_projection_single[2];
+ }
+
+ void SetInputData(libvpx_test::ACMRandom* rnd);
+ void TestRandomValues(bool speed);
+
+ private:
+ MvProjectionCompoundFunc mv_projection_compound_[3];
+ MvProjectionSingleFunc mv_projection_single_[3];
+ int reference_offsets_[2];
+ alignas(kMaxAlignment)
+ MotionVector temporal_mvs_[kMaxTemporalMvCandidatesWithPadding];
+ int8_t temporal_reference_offsets_[kMaxTemporalMvCandidatesWithPadding];
+ CompoundMotionVector compound_mv_org_[kMaxTemporalMvCandidates + 1]
+ [kMaxTemporalMvCandidatesWithPadding];
+ alignas(kMaxAlignment)
+ CompoundMotionVector compound_mv_[kMaxTemporalMvCandidates + 1]
+ [kMaxTemporalMvCandidatesWithPadding];
+ MotionVector single_mv_org_[kMaxTemporalMvCandidates + 1]
+ [kMaxTemporalMvCandidatesWithPadding];
+ alignas(kMaxAlignment)
+ MotionVector single_mv_[kMaxTemporalMvCandidates + 1]
+ [kMaxTemporalMvCandidatesWithPadding];
+};
+
+void MotionVectorSearchTest::SetInputData(libvpx_test::ACMRandom* const rnd) {
+ reference_offsets_[0] =
+ Clip3(rnd->Rand16(), -kMaxFrameDistance, kMaxFrameDistance);
+ reference_offsets_[1] =
+ Clip3(rnd->Rand16(), -kMaxFrameDistance, kMaxFrameDistance);
+ for (int i = 0; i < kMaxTemporalMvCandidatesWithPadding; ++i) {
+ temporal_reference_offsets_[i] = rnd->RandRange(kMaxFrameDistance);
+ for (auto& mv : temporal_mvs_[i].mv) {
+ mv = rnd->Rand16Signed() / 8;
+ }
+ }
+ for (int i = 0; i <= kMaxTemporalMvCandidates; ++i) {
+ for (int j = 0; j < kMaxTemporalMvCandidatesWithPadding; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ single_mv_[i][j].mv[k] = rnd->Rand16Signed();
+ for (auto& mv : compound_mv_[i][j].mv[k].mv) {
+ mv = rnd->Rand16Signed();
+ }
+ }
+ compound_mv_org_[i][j] = compound_mv_[i][j];
+ single_mv_org_[i][j] = single_mv_[i][j];
+ }
+ }
+}
+
+void MotionVectorSearchTest::TestRandomValues(bool speed) {
+ static const char* const kDigestCompound[3] = {
+ "74c055b06c3701b2e50f2c964a6130b9", "cab21dd54f0a1bf6e80b58cdcf1fe0a9",
+ "e42de30cd84fa4e7b8581a330ed08a8b"};
+ static const char* const kDigestSingle[3] = {
+ "265ffbb59d0895183f8e2d90b6652c71", "5068d980c4ce42ed3f11963b8aece6cc",
+ "7e699d58df3954a38ff11c8e34151e66"};
+ const int num_tests = speed ? 1000000 : 1;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ for (int function_index = 0; function_index < 3; ++function_index) {
+ SetInputData(&rnd);
+ if (mv_projection_compound_[function_index] == nullptr) continue;
+ const absl::Time start = absl::Now();
+ for (int count = 1; count <= kMaxTemporalMvCandidates; ++count) {
+ const int total_count = count + (count & 1);
+ for (int i = 0; i < num_tests; ++i) {
+ mv_projection_compound_[function_index](
+ temporal_mvs_, temporal_reference_offsets_, reference_offsets_,
+ count, compound_mv_[count]);
+ }
+ // SIMD implementations may compute one extra element (count rounded up
+ // to a multiple of 2). Restore any value that may have been overwritten.
+ for (int i = count; i < total_count; ++i) {
+ compound_mv_[count][i] = compound_mv_org_[count][i];
+ }
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(
+ "MvProjectionCompound",
+ absl::StrFormat("function_index %d", function_index).c_str(),
+ kDigestCompound[function_index], compound_mv_, sizeof(compound_mv_),
+ elapsed_time);
+ }
+ for (int function_index = 0; function_index < 3; ++function_index) {
+ SetInputData(&rnd);
+ if (mv_projection_single_[function_index] == nullptr) continue;
+ const absl::Time start = absl::Now();
+ for (int count = 1; count <= kMaxTemporalMvCandidates; ++count) {
+ const int total_count = (count + 3) & ~3;
+ for (int i = 0; i < num_tests; ++i) {
+ mv_projection_single_[function_index](
+ temporal_mvs_, temporal_reference_offsets_, reference_offsets_[0],
+ count, single_mv_[count]);
+ }
+ // SIMD implementations may compute up to three extra elements (count
+ // rounded up to a multiple of 4). Restore any values that may have been
+ // overwritten.
+ for (int i = count; i < total_count; ++i) {
+ single_mv_[count][i] = single_mv_org_[count][i];
+ }
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(
+ "MvProjectionSingle",
+ absl::StrFormat("function_index %d", function_index).c_str(),
+ kDigestSingle[function_index], single_mv_, sizeof(single_mv_),
+ elapsed_time);
+ }
+}
+
+TEST_P(MotionVectorSearchTest, Correctness) { TestRandomValues(false); }
+
+TEST_P(MotionVectorSearchTest, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, MotionVectorSearchTest, testing::Values(0));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, MotionVectorSearchTest, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, MotionVectorSearchTest, testing::Values(0));
+#endif
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+// 7.11.3.10 (from top samples).
+template <typename Pixel>
+void OverlapBlendVertical_C(void* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t prediction_stride, const int width,
+ const int height,
+ const void* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<Pixel*>(prediction);
+ const ptrdiff_t pred_stride = prediction_stride / sizeof(Pixel);
+ const auto* obmc_pred = static_cast<const Pixel*>(obmc_prediction);
+ const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(Pixel);
+ const uint8_t* const mask = kObmcMask + height - 2;
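+ // The mask values grow toward 64 with distance from the top edge, so rows
+ // farther from the neighboring block keep more of the base prediction.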
+ assert(width >= 4);
+ assert(height >= 2);
+
+ for (int y = 0; y < height; ++y) {
+ const uint8_t mask_value = mask[y];
+ for (int x = 0; x < width; ++x) {
+ pred[x] = static_cast<Pixel>(RightShiftWithRounding(
+ mask_value * pred[x] + (64 - mask_value) * obmc_pred[x], 6));
+ }
+ pred += pred_stride;
+ obmc_pred += obmc_pred_stride;
+ }
+}
+
+// 7.11.3.10 (from left samples).
+template <typename Pixel>
+void OverlapBlendHorizontal_C(
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<Pixel*>(prediction);
+ const ptrdiff_t pred_stride = prediction_stride / sizeof(Pixel);
+ const auto* obmc_pred = static_cast<const Pixel*>(obmc_prediction);
+ const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(Pixel);
+ const uint8_t* const mask = kObmcMask + width - 2;
+ assert(width >= 2);
+ assert(height >= 4);
+
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const uint8_t mask_value = mask[x];
+ pred[x] = static_cast<Pixel>(RightShiftWithRounding(
+ mask_value * pred[x] + (64 - mask_value) * obmc_pred[x], 6));
+ }
+ pred += pred_stride;
+ obmc_pred += obmc_pred_stride;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint8_t>;
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendHorizontal_C<uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
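+ // |dsp| is unused when both LIBGAV1_Dsp8bpp_ObmcVertical and
+ // LIBGAV1_Dsp8bpp_ObmcHorizontal are defined; the cast quiets the
+ // unused-variable warning.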
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_ObmcVertical
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendHorizontal_C<uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+ dsp->obmc_blend[kObmcDirectionHorizontal] =
+ OverlapBlendHorizontal_C<uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_ObmcVertical
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcHorizontal
+ dsp->obmc_blend[kObmcDirectionHorizontal] =
+ OverlapBlendHorizontal_C<uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+ dsp->obmc_blend[kObmcDirectionHorizontal] =
+ OverlapBlendHorizontal_C<uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_ObmcVertical
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ObmcHorizontal
+ dsp->obmc_blend[kObmcDirectionHorizontal] =
+ OverlapBlendHorizontal_C<uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+void ObmcInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_OBMC_H_
+#define LIBGAV1_SRC_DSP_OBMC_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/obmc_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/obmc_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend. This function is not thread-safe.
+void ObmcInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_OBMC_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for overlap blend implementations.
+// This file is included inside an anonymous namespace in the files where
+// these constants are needed.
+
+// This is a flat array of masks for each block dimension from 2 to 32. The
+// starting index for each length is length-2.
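+// For example, a vertical blend of height 4 reads kObmcMask + 4 - 2, i.e.
+// {39, 50, 59, 64}: the row nearest the edge keeps only 39/64 of the base
+// prediction while the final rows keep it fully (64/64).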
+constexpr uint8_t kObmcMask[62] = {
+ // Obmc Mask 2
+ 45, 64,
+ // Obmc Mask 4
+ 39, 50, 59, 64,
+ // Obmc Mask 8
+ 36, 42, 48, 53, 57, 61, 64, 64,
+ // Obmc Mask 16
+ 34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64,
+ // Obmc Mask 32
+ 33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58,
+ 59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64};
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+constexpr int kMaxBlendingBlockSize = 64;
+constexpr int kNumSpeedTests = 2e8;
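+// kNumSpeedTests is divided by the block area in the DISABLED_Speed tests.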
+
+const char* GetDigest8bpp(int id) {
+ static const char* const kDigest[] = {
+ "c8659acd1e8ecdab06be73f0954fa1ae", "e785f31f2723a193fefd534bd6f6c18f",
+ "751fcd8a345fef1c38a25293c9b528c0", "69af412dfa5e96ad43b79c178cb1c58b",
+ "2766a64622e183bb4614f2018f14fa85", "8d98589a5cef6e68ee8fadf19d420e3c",
+ "19eccf31dd8cf1abcee9414128fe4141", "35019f98e30bcbc6ab624682a0628519",
+ "199c551164e73c100045d7ab033ffdcc", "ad5a5eb2906265690c22741b0715f37b",
+ "e2152dea159249149ff4151111b73ed6", "1edd570bec7e63780d83588f6aacda25",
+ "b24ad192e151b1e0f74d1493004cb1b6", "6c1ce7ed3463cc60870e336f990d4f14",
+ "2e6b7a06da21512dfdd9a517d2988655", "971ba1c41ab13bb341c04f936760f546",
+ "55b803239d9f12888c666c5320450937", "3d0838963f8c95dafbfb8e5e25c865d2",
+ "98a9be6245720d4e0da18115c1a1dbd7", "7e7afe3136ad681b5ea05664fe916548",
+ "33971753243f09106173199b7bae1ef5", "65413f33c19a42c112d395121aa4b3b4",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+
+const char* GetDigestSpeed8bpp(int id) {
+ static const char* const kDigest[] = {
+ "5ea519b616cd2998fbb9b25b4c2660cb", "f23d18197a96de48901738d130a147d9",
+ "07b4140c693947a63865f835089766c4", "62547d29bc4dfb2e201e9d907c09e345",
+ "c3988da521be50aeb9944564001b282b", "d5a8ff9ca1bd49f4260bb497c489b06c",
+ "b3e94f1e33c316759ebf47620327168c", "c5e64a34ca7e55f4daed19cbe4c27049",
+ "3b234eb729e8e79db8692c4cbe1b6667", "f9f3060a44c3a575470f9700b3c3a75b",
+ "e3a1960b0a7238db1184a3f9d8e9a4b2", "ba9938553703d520bc0ade427c397140",
+ "31bf64a6ed1e8002d488c0b9dcffb80a", "9ab1f3ae2e7f70cd27452f30cecfd18e",
+ "eaf25ac79ad70fc17ca96d8fcdf0f939", "9aaa88cb5e6b8757e37c3430bd664e70",
+ "8293874b2794df8fd22f5a35c3de7bee", "e9d6ee9106227c2c67ea9e6a4652e4ad",
+ "29f8a6fc2a650f3945a4ea6d3b975b6d", "8f300a257e913a42666b4921b2b0b5c5",
+ "a526265c4b3c8593736a82ddc1fd1603", "76e248f6756ac96343204b0e48d72a9e",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+ static const char* const kDigest[] = {
+ "6f922e4142b644ca3f1eb0f363a1c34e", "84e7c098a9335b36082fec0bc7203075",
+ "40f00ea6884fea23a3b7fae59e3b02c3", "70cb92d08b4fdb6dd9c7d418cb1455d3",
+ "ed550798b56e70439a93cb48c359e873", "55e0d927b984e78cd51a1961e58a431d",
+ "482a6856b87265a82e4ea3fdadb2d95b", "0be46226ff87d74ff2ce68a83eaf9cca",
+ "bb4461f0131a1693a0a76f21d92a480b", "ea24f78d74c7864fb247c9a98c9b97b6",
+ "d2e70b81882aeb3d9fccef89e7552a9d", "f5d882ee6d9ae6f7dfa467ca99301424",
+ "824ddb98eb4129b3d254c0bc7a64cd73", "5eaaafa8ef9b7ba5e2856a947e5b33df",
+ "071de1494e0f1b2f99266b90bdc43ddd", "c33227a96dad506adc32dacfb371ab78",
+ "e8a632f9fff240c439d4ae6e86795046", "26b90d74f18f9df4427b6180d48db1fc",
+ "e4a01e492ddc0398b5c5b60c81468242", "f1b4f7ab5c8b949e51db104f2e33565a",
+ "b1fb9ecc6a552e2b23ee92e2f3e4122a", "a683d20129a91bb20b904aa20c0499b1",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+
+const char* GetDigestSpeed10bpp(int id) {
+ static const char* const kDigest[] = {
+ "80557576299708005111029cef04da53", "24f84f07f53f61cd46bdcfe1e05ff9b5",
+ "4dd6bc62145baa5357a4cbf6d7a6ef15", "0b7aa27cee43b8ae0c02d07887eaa225",
+ "9e28cdae73ca97433499c31ca79e1d07", "1cacd6466a143f88e736fffaf21e2246",
+ "9c7699626660d8965e06a54282a408f3", "eef893efef62b2eb4aaad06fc462819c",
+ "4965d0a3ff750813df85c0082b21bd4b", "ec10fd79fbf552abc595def392e9a863",
+ "a148bbafdc4466fbb700b31acccca8ac", "5da9d960988549f53b817003b93e4d01",
+ "b4c4f88d1fb54869ce7ff452ca7786a6", "d607f785fce62bad85102054539e7089",
+ "b441761ea2817e4618c594aaa11d670a", "1cc5e08e6d5f9315dbc0369b97af941d",
+ "568cc1a3a67ba4e6e77f54602d0ed3e3", "522f14c068f788bc284a7d1e47d623ed",
+ "b543855cbe384b88861c881853c28192", "5faaafc124e94eedc69dc0f5d33dacac",
+ "13ca4d01bd20085459e6126555e1f7b5", "46d46fae3c8a7d9e4725154d8d2b76d8",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+ static const char* const kDigest[] = {
+ "eb18c776d7b56280f01cca40b04a9c44", "058d4a6ed025eac5dcf7aec3203c0882",
+ "8355884d7470e9c6af9309ab23bee859", "2ba330551ac58d1d034b947d7ab9b59f",
+ "0d25cd773c81e4c57f82513e3b031f01", "b9075f7c3b9a240dbb015a24454eeb71",
+ "563ed8683723d1e4f2746280bca3db0a", "d7125306bd8c952d0f85fe1515ca16a7",
+ "5bf99c7e4a918c9b6a7e251484ea6527", "38ac9c685e8d2bd2771b6f2b38268301",
+ "abc39dbde7470e08b15417ee97c704b2", "37e12753d23b7a8df92b1d32f3170d9f",
+ "9a609776cfa31f64826225d0a6b7afdd", "ccdd89e70e94f751fd891b124c1c3210",
+ "2bbf7b095e26ed4f27e7d05e20117084", "9a1b403c3a7c00da5686bcb87f1270e8",
+ "701d651e391043ab8ebbd0023a430980", "0047f10bdd8321494e8e82597fe2f969",
+ "f97e662d139b2811e3d3227de95135a2", "852933b90d4a70f9254157381ed641e0",
+ "cfcda707ec8e4361ef741dc716888348", "95e34eab83b3159f61685db248c6a881",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+
+const char* GetDigestSpeed12bpp(int id) {
+ static const char* const kDigest[] = {
+ "6c0f37c41d72ce40d95545ac0f08d88a", "8a8efeb7d8b2f852d76d0176b6c6878f",
+ "5757c88d1cdc0cd29c47c346474161f0", "fef8cf06d16ba7357bfc061e43080cd3",
+ "6bd11582448532bce8b91cc8807ab6a0", "1e6dd42eada2d636e210f4e20a771102",
+ "377a0472f45fcb42f1712243ea845530", "e3760f2b6e69c1b40e71ecde711d227c",
+ "6721638d1a5dadb96ddd0ca067c737ca", "3d3a23210a8496a76991bcec5045808b",
+ "2cbd26ecf7d4e927ab569083d3ddb4ca", "7d61af2d7841d1a39a2e930bac166804",
+ "dd929506442fb1f2e67130fe8cdf487b", "c0e57f8d2546d5bcb646a24d09d83d7c",
+ "2989c6487456c92eb003c8e17e904f45", "5cfb60a3be6ee5c41e0f655a3020f687",
+ "28f37d47cb07aa382659ff556a55a4c6", "b6478ab317b11f592deb60d02ce62f2f",
+ "bc78e7250c101f82e794d4fa0ee55025", "24304ed23d336a46f205206d3c5d48ef",
+ "dc1e71d95d06c1086bb7f9e05e38bf39", "32606ef72985e7de608df2e8760784b7",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+struct ObmcTestParam {
+ ObmcTestParam(int width, int height, ObmcDirection blending_direction)
+ : width(width), height(height), blending_direction(blending_direction) {}
+ int width;
+ int height;
+ ObmcDirection blending_direction;
+};
+
+std::ostream& operator<<(std::ostream& os, const ObmcTestParam& param) {
+ return os << "BlockSize" << param.width << "x" << param.height
+ << ", blending_direction: " << ToString(param.blending_direction);
+}
+
+template <int bitdepth, typename Pixel>
+class ObmcBlendTest : public testing::TestWithParam<ObmcTestParam> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ ObmcBlendTest() = default;
+ ~ObmcBlendTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ ObmcInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ ObmcInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ ObmcInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ func_ = dsp->obmc_blend[blending_direction_];
+ }
+
+ protected:
+ int GetDigestId() const {
+ // blending_direction_ == kObmcDirectionVertical:
+ // (width, height):
+ // (4, 2), id = 0. (4, 4), id = 1. (4, 8), id = 2. (8, 4), id = 3.
+ // ...
+ // blending_direction_ == kObmcDirectionHorizontal: id starts from 11.
+ // Vertical skips (2, 4) while horizontal skips (4, 2), creating a gap
+ // after (2, 4).
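+ // For example, vertical 8x8 yields 0 + 3 * (FloorLog2(8) - 1) - 2 = 4.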
+ const int id = (blending_direction_ == kObmcDirectionVertical) ? 0
+ : (width_ == 2) ? 12
+ : 11;
+ if (width_ == height_) return id + 3 * (FloorLog2(width_) - 1) - 2;
+ if (width_ < height_) return id + 3 * (FloorLog2(width_) - 1) - 1;
+ return id + 3 * (FloorLog2(height_) - 1);
+ }
+
+ // Note |digest| is only used when |use_fixed_values| is false.
+ void Test(const char* digest, bool use_fixed_values, int value);
+ void TestSpeed(const char* digest, int num_runs);
+
+ private:
+ const int width_ = GetParam().width;
+ const int height_ = GetParam().height;
+ const ObmcDirection blending_direction_ = GetParam().blending_direction;
+ Pixel source1_[kMaxBlendingBlockSize * kMaxBlendingBlockSize] = {};
+ Pixel source2_[kMaxBlendingBlockSize * kMaxBlendingBlockSize] = {};
+ dsp::ObmcBlendFunc func_;
+};
+
+template <int bitdepth, typename Pixel>
+void ObmcBlendTest<bitdepth, Pixel>::Test(const char* const digest,
+ const bool use_fixed_values,
+ const int value) {
+ if (func_ == nullptr) return;
+ if (use_fixed_values) {
+ std::fill(source1_,
+ source1_ + kMaxBlendingBlockSize * kMaxBlendingBlockSize, value);
+ std::fill(source2_,
+ source2_ + kMaxBlendingBlockSize * kMaxBlendingBlockSize, value);
+ } else {
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ Pixel* src_1 = source1_;
+ Pixel* src_2 = source2_;
+ const int mask = (1 << bitdepth) - 1;
+ for (int y = 0; y < height_; ++y) {
+ for (int x = 0; x < width_; ++x) {
+ src_1[x] = rnd.Rand16() & mask;
+ src_2[x] = rnd.Rand16() & mask;
+ }
+ src_1 += kMaxBlendingBlockSize;
+ src_2 += width_;
+ }
+ }
+ const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel);
+ func_(source1_, stride, width_, height_, source2_,
+ width_ * sizeof(source2_[0]));
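+ // |source1_| is laid out with the maximum block stride while |source2_| is
+ // packed at |width_| pixels per row, matching how the inputs were written
+ // above.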
+ if (use_fixed_values) {
+ const bool success = test_utils::CompareBlocks(
+ source1_, source2_, width_, height_, kMaxBlendingBlockSize,
+ kMaxBlendingBlockSize, false);
+ EXPECT_TRUE(success);
+ } else {
+ test_utils::CheckMd5Digest(
+ ToString(blending_direction_),
+ absl::StrFormat("%dx%d", width_, height_).c_str(), digest, source1_,
+ sizeof(source1_), absl::Duration());
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void ObmcBlendTest<bitdepth, Pixel>::TestSpeed(const char* const digest,
+ const int num_runs) {
+ if (func_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ Pixel* src_1 = source1_;
+ Pixel* src_2 = source2_;
+ const int mask = (1 << bitdepth) - 1;
+ for (int y = 0; y < height_; ++y) {
+ for (int x = 0; x < width_; ++x) {
+ src_1[x] = rnd.Rand16() & mask;
+ src_2[x] = rnd.Rand16() & mask;
+ }
+ src_1 += kMaxBlendingBlockSize;
+ src_2 += width_;
+ }
+ const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel);
+ uint8_t dest[sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize];
+ absl::Duration elapsed_time;
+ for (int i = 0; i < num_runs; ++i) {
+ memcpy(dest, source1_,
+ sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize);
+ const absl::Time start = absl::Now();
+ func_(dest, stride, width_, height_, source2_,
+ width_ * sizeof(source2_[0]));
+ elapsed_time += absl::Now() - start;
+ }
+ memcpy(source1_, dest,
+ sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize);
+ test_utils::CheckMd5Digest(ToString(blending_direction_),
+ absl::StrFormat("%dx%d", width_, height_).c_str(),
+ digest, source1_, sizeof(source1_), elapsed_time);
+}
+
+const ObmcTestParam kObmcTestParam[] = {
+ ObmcTestParam(4, 2, kObmcDirectionVertical),
+ ObmcTestParam(4, 4, kObmcDirectionVertical),
+ ObmcTestParam(4, 8, kObmcDirectionVertical),
+ ObmcTestParam(8, 4, kObmcDirectionVertical),
+ ObmcTestParam(8, 8, kObmcDirectionVertical),
+ ObmcTestParam(8, 16, kObmcDirectionVertical),
+ ObmcTestParam(16, 8, kObmcDirectionVertical),
+ ObmcTestParam(16, 16, kObmcDirectionVertical),
+ ObmcTestParam(16, 32, kObmcDirectionVertical),
+ ObmcTestParam(32, 16, kObmcDirectionVertical),
+ ObmcTestParam(32, 32, kObmcDirectionVertical),
+ ObmcTestParam(2, 4, kObmcDirectionHorizontal),
+ ObmcTestParam(4, 4, kObmcDirectionHorizontal),
+ ObmcTestParam(4, 8, kObmcDirectionHorizontal),
+ ObmcTestParam(8, 4, kObmcDirectionHorizontal),
+ ObmcTestParam(8, 8, kObmcDirectionHorizontal),
+ ObmcTestParam(8, 16, kObmcDirectionHorizontal),
+ ObmcTestParam(16, 8, kObmcDirectionHorizontal),
+ ObmcTestParam(16, 16, kObmcDirectionHorizontal),
+ ObmcTestParam(16, 32, kObmcDirectionHorizontal),
+ ObmcTestParam(32, 16, kObmcDirectionHorizontal),
+ ObmcTestParam(32, 32, kObmcDirectionHorizontal),
+};
+
+using ObmcBlendTest8bpp = ObmcBlendTest<8, uint8_t>;
+
+TEST_P(ObmcBlendTest8bpp, Blending) {
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 255);
+ Test(GetDigest8bpp(GetDigestId()), /*use_fixed_values=*/false, -1);
+}
+
+TEST_P(ObmcBlendTest8bpp, DISABLED_Speed) {
+ TestSpeed(GetDigestSpeed8bpp(GetDigestId()),
+ kNumSpeedTests / (GetParam().height * GetParam().width));
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest8bpp,
+ testing::ValuesIn(kObmcTestParam));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, ObmcBlendTest8bpp,
+ testing::ValuesIn(kObmcTestParam));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ObmcBlendTest8bpp,
+ testing::ValuesIn(kObmcTestParam));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using ObmcBlendTest10bpp = ObmcBlendTest<10, uint16_t>;
+
+TEST_P(ObmcBlendTest10bpp, Blending) {
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, (1 << 10) - 1);
+ Test(GetDigest10bpp(GetDigestId()), /*use_fixed_values=*/false, -1);
+}
+
+TEST_P(ObmcBlendTest10bpp, DISABLED_Speed) {
+ TestSpeed(GetDigestSpeed10bpp(GetDigestId()),
+ kNumSpeedTests / (GetParam().height * GetParam().width));
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest10bpp,
+ testing::ValuesIn(kObmcTestParam));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, ObmcBlendTest10bpp,
+ testing::ValuesIn(kObmcTestParam));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ObmcBlendTest10bpp,
+ testing::ValuesIn(kObmcTestParam));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ObmcBlendTest12bpp = ObmcBlendTest<12, uint16_t>;
+
+TEST_P(ObmcBlendTest12bpp, Blending) {
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, (1 << 12) - 1);
+ Test(GetDigest12bpp(GetDigestId()), /*use_fixed_values=*/false, -1);
+}
+
+TEST_P(ObmcBlendTest12bpp, DISABLED_Speed) {
+ TestSpeed(GetDigestSpeed12bpp(GetDigestId()),
+ kNumSpeedTests / (GetParam().height * GetParam().width));
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest12bpp,
+ testing::ValuesIn(kObmcTestParam));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Each row below contains weights used for a corresponding block size. Because
+// they are adjacent powers of 2, the index of each row is the sum of the sizes
+// of preceding rows, i.e. block_dimension - 4.
+// The weights need to be declared as uint8_t or uint16_t, depending on the
+// bitdepth, so the values are held in a single canonical place.
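+// For example, the block dimension = 16 row starts at index 4 + 8 = 12,
+// i.e. 16 - 4.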
+// clang-format off
+ // block dimension = 4
+ 255, 149, 85, 64,
+ // block dimension = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // block dimension = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // block dimension = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // block dimension = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+ 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+ 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4
+ // clang-format on
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+
+#include <cassert>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void SuperRes_C(const void* /*coefficients*/,
+ void* LIBGAV1_RESTRICT const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t dest_stride) {
+ assert(step <= 1 << kSuperResScaleBits);
+ auto* src = static_cast<Pixel*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<Pixel*>(dest);
+ int y = height;
+ do {
+ ExtendLine<Pixel>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ // If the (original) upscaled_width is <= 9, downscaled_width may be
+ // upscaled_width - 1 (e.g. 7 vs 8), and the two may become equal (e.g. both
+ // 4) when subsampled via RightShiftWithRounding. This leads to an edge case
+ // where |step| == 1 << 14.
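+ // As a sketch of the walk below, with kSuperResScaleBits == 14 (per the
+ // assert above): an exact 2x upscale has step == 1 << 13, so the integer
+ // source position subpixel_x >> kSuperResScaleBits advances by one pixel
+ // for every two output pixels.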
+ int subpixel_x = initial_subpixel_x;
+ int x = 0;
+ do {
+ int sum = 0;
+ const Pixel* const src_x = &src[subpixel_x >> kSuperResScaleBits];
+ const int src_x_subpixel =
+ (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
+ // The sign of each tap is: - + - + + - + -
+ sum -= src_x[0] * kUpscaleFilterUnsigned[src_x_subpixel][0];
+ sum += src_x[1] * kUpscaleFilterUnsigned[src_x_subpixel][1];
+ sum -= src_x[2] * kUpscaleFilterUnsigned[src_x_subpixel][2];
+ sum += src_x[3] * kUpscaleFilterUnsigned[src_x_subpixel][3];
+ sum += src_x[4] * kUpscaleFilterUnsigned[src_x_subpixel][4];
+ sum -= src_x[5] * kUpscaleFilterUnsigned[src_x_subpixel][5];
+ sum += src_x[6] * kUpscaleFilterUnsigned[src_x_subpixel][6];
+ sum -= src_x[7] * kUpscaleFilterUnsigned[src_x_subpixel][7];
+ dst[x] = Clip3(RightShiftWithRounding(sum, kFilterBits), 0,
+ (1 << bitdepth) - 1);
+ subpixel_x += step;
+ } while (++x < upscaled_width);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+void Init8bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->super_res = SuperRes_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_SuperRes
+ dsp->super_res = SuperRes_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+ dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->super_res = SuperRes_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_SuperRes
+ dsp->super_res = SuperRes_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+ dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->super_res = SuperRes_C<12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_SuperRes
+ dsp->super_res = SuperRes_C<12, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+void SuperResInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_SUPER_RES_H_
+#define LIBGAV1_SRC_DSP_SUPER_RES_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/super_res_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/super_res_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res. This function is not thread-safe.
+void SuperResInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_SUPER_RES_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "absl/strings/match.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 5e5;
+
+const char* GetDigest8bpp(int id) {
+ static const char* const kDigestSuperRes[] = {
+ "52eb4eac1df0c51599d57696405b69d0", "ccb07cc8295fd1440ff2e3b9199ec4f9",
+ "baef34cca795b95f3d1fd81d609da679", "03f1579c2773c8ba9c867316a22b94a3"};
+ return kDigestSuperRes[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+ static const char* const kDigestSuperRes[] = {
+ "8fd78e05d944aeb11fac278b47ee60ba", "948eaecb70fa5614ce1c1c95e9942dc3",
+ "126cd7727e787e0625ec3f5ce97f8fa0", "85c806c41d40b841764bcb54f6d3a712"};
+ return kDigestSuperRes[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+ static const char* const kDigestSuperRes[] = {
+ "9a08983d82df4983700976f18919201b", "6e5edbafcb6c38db37258bf79c00ea32",
+ "f5c57e6d3b518f9585f768ed19b91568", "b5de9b93c8a1a50580e7c7c9456fb615"};
+ return kDigestSuperRes[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+struct SuperResTestParam {
+ SuperResTestParam(int downscaled_width, int upscaled_width)
+ : downscaled_width(downscaled_width), upscaled_width(upscaled_width) {}
+ int downscaled_width;
+ int upscaled_width;
+};
+
+template <int bitdepth, typename Pixel, typename Coefficient>
+class SuperResTest : public testing::TestWithParam<SuperResTestParam>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ SuperResTest() = default;
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ SuperResInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const std::vector<std::string> split_test_name =
+ absl::StrSplit(test_info->name(), '/');
+ ASSERT_TRUE(absl::SimpleAtoi(split_test_name[1], &test_id_));
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ SuperResInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ SuperResInit_SSE4_1();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ super_res_coefficients_ = dsp->super_res_coefficients;
+ func_ = dsp->super_res;
+ }
+
+ void TestComputeSuperRes(int fixed_value, int num_runs);
+
+ private:
+ static constexpr int kHeight = 127;
+ // The maximum width that must be allocated.
+ static constexpr int kUpscaledBufferWidth = 192;
+ // Allow room for the filter taps.
+ static constexpr int kStride =
+ ((kUpscaledBufferWidth + 2 * kSuperResHorizontalBorder + 15) & ~15);
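+ // Assuming kSuperResHorizontalBorder == 4, this is (192 + 8 + 15) & ~15 ==
+ // 208, i.e. each row is padded up to a multiple of 16 pixels.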
+ const int kDownscaledWidth = GetParam().downscaled_width;
+ const int kUpscaledWidth = GetParam().upscaled_width;
+ int test_id_;
+ SuperResCoefficientsFunc super_res_coefficients_;
+ SuperResFunc func_;
+ Pixel source_buffer_[kHeight][kStride];
+ alignas(kMaxAlignment) Pixel dest_buffer_[kHeight][kStride];
+ alignas(kMaxAlignment) Coefficient
+ superres_coefficients_[kSuperResFilterTaps * kUpscaledBufferWidth];
+};
+
+template <int bitdepth, typename Pixel, typename Coefficient>
+void SuperResTest<bitdepth, Pixel, Coefficient>::TestComputeSuperRes(
+ int fixed_value, int num_runs) {
+ if (func_ == nullptr) return;
+ const int superres_width = kDownscaledWidth << kSuperResScaleBits;
+ const int step = (superres_width + kUpscaledWidth / 2) / kUpscaledWidth;
+ const int error = step * kUpscaledWidth - superres_width;
+ const int initial_subpixel_x =
+ ((-((kUpscaledWidth - kDownscaledWidth) << (kSuperResScaleBits - 1)) +
+ DivideBy2(kUpscaledWidth)) /
+ kUpscaledWidth +
+ (1 << (kSuperResExtraBits - 1)) - error / 2) &
+ kSuperResScaleMask;
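+ // A sketch of the arithmetic, with kSuperResScaleBits == 14: the (96, 192)
+ // parameter pair below gives superres_width == 96 << 14 and
+ // step == ((96 << 14) + 96) / 192 == 1 << 13, i.e. the source position
+ // advances half a pixel per output pixel.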
+ if (super_res_coefficients_ != nullptr) {
+ super_res_coefficients_(kUpscaledWidth, initial_subpixel_x, step,
+ superres_coefficients_);
+ }
+ memset(dest_buffer_, 0, sizeof(dest_buffer_));
+ if (fixed_value != 0) {
+ SetBlock<Pixel>(kHeight, kStride, fixed_value, source_buffer_[0], kStride);
+ } else {
+ // Random values.
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ const int bitdepth_mask = (1 << bitdepth) - 1;
+ for (int y = 0; y < kHeight; ++y) {
+ for (int x = 0; x < kStride; ++x) {
+ source_buffer_[y][x] = rnd.Rand16() & bitdepth_mask;
+ }
+ }
+ }
+ // Offset starting point in the buffer to accommodate line extension.
+ Pixel* src_ptr = source_buffer_[0] + kSuperResHorizontalBorder;
+
+ const absl::Time start = absl::Now();
+ for (int i = 0; i < num_runs; ++i) {
+ func_(superres_coefficients_, src_ptr, kStride, kHeight, kDownscaledWidth,
+ kUpscaledWidth, initial_subpixel_x, step, dest_buffer_, kStride);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+
+ if (fixed_value != 0) {
+ for (int y = 0; y < kHeight; ++y) {
+ for (int x = 0; x < kUpscaledWidth; ++x) {
+ EXPECT_TRUE(dest_buffer_[y][x] == fixed_value)
+ << "At location [" << y << ", " << x
+ << "]\nexpected: " << fixed_value
+ << "\nactual: " << dest_buffer_[y][x];
+ }
+ }
+ } else if (num_runs == 1) {
+ // Random values.
+ if ((kUpscaledWidth & 15) != 0) {
+ // The SIMD functions overwrite up to 15 pixels in each row. Reset them.
+ for (int y = 0; y < kHeight; ++y) {
+ for (int x = kUpscaledWidth; x < Align(kUpscaledWidth, 16); ++x) {
+ dest_buffer_[y][x] = 0;
+ }
+ }
+ }
+ const char* expected_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ expected_digest = GetDigest8bpp(test_id_);
+ break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+ expected_digest = GetDigest10bpp(test_id_);
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ expected_digest = GetDigest12bpp(test_id_);
+ break;
+#endif
+ }
+ ASSERT_NE(expected_digest, nullptr);
+ test_utils::CheckMd5Digest(
+ "SuperRes",
+ absl::StrFormat("width %d, step %d, start %d", kUpscaledWidth, step,
+ initial_subpixel_x)
+ .c_str(),
+ expected_digest, dest_buffer_, sizeof(dest_buffer_), elapsed_time);
+ } else {
+ // Speed test.
+ printf("Mode SuperRes [width %d, step %d, start %d]: %d us\n",
+ kUpscaledWidth, step, initial_subpixel_x,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+}
+
+using SuperResTest8bpp = SuperResTest<8, uint8_t, int8_t>;
+
+TEST_P(SuperResTest8bpp, FixedValues) {
+ TestComputeSuperRes(100, 1);
+ TestComputeSuperRes(255, 1);
+ TestComputeSuperRes(1, 1);
+}
+
+TEST_P(SuperResTest8bpp, RandomValues) { TestComputeSuperRes(0, 1); }
+
+TEST_P(SuperResTest8bpp, DISABLED_Speed) {
+ TestComputeSuperRes(0, kNumSpeedTests);
+}
+
+const SuperResTestParam kSuperResTestParams[] = {
+ SuperResTestParam(96, 192),
+ SuperResTestParam(171, 192),
+ SuperResTestParam(102, 128),
+ SuperResTestParam(61, 121),
+};
+
+INSTANTIATE_TEST_SUITE_P(C, SuperResTest8bpp,
+ testing::ValuesIn(kSuperResTestParams));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SuperResTest8bpp,
+ testing::ValuesIn(kSuperResTestParams));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, SuperResTest8bpp,
+ testing::ValuesIn(kSuperResTestParams));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using SuperResTest10bpp = SuperResTest<10, uint16_t, int16_t>;
+
+TEST_P(SuperResTest10bpp, FixedValues) {
+ TestComputeSuperRes(100, 1);
+ TestComputeSuperRes(511, 1);
+ TestComputeSuperRes(1, 1);
+}
+
+TEST_P(SuperResTest10bpp, RandomValues) { TestComputeSuperRes(0, 1); }
+
+TEST_P(SuperResTest10bpp, DISABLED_Speed) {
+ TestComputeSuperRes(0, kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, SuperResTest10bpp,
+ testing::ValuesIn(kSuperResTestParams));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, SuperResTest10bpp,
+ testing::ValuesIn(kSuperResTestParams));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SuperResTest10bpp,
+ testing::ValuesIn(kSuperResTestParams));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using SuperResTest12bpp = SuperResTest<12, uint16_t, int16_t>;
+
+TEST_P(SuperResTest12bpp, FixedValues) {
+ TestComputeSuperRes(100, 1);
+ TestComputeSuperRes(2047, 1);
+ TestComputeSuperRes(1, 1);
+}
+
+TEST_P(SuperResTest12bpp, RandomValues) { TestComputeSuperRes(0, 1); }
+
+TEST_P(SuperResTest12bpp, DISABLED_Speed) {
+ TestComputeSuperRes(0, kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, SuperResTest12bpp,
+ testing::ValuesIn(kSuperResTestParams));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+
+// Warp prediction output ranges from WarpTest.ShowRange.
+// Bitdepth: 8 Input range: [ 0, 255]
+// 8bpp intermediate offset: 16384.
+// intermediate range: [ 4399, 61009]
+// first pass output range: [ 550, 7626]
+// 8bpp intermediate offset removal: 262144.
+// intermediate range: [ -620566, 1072406]
+// second pass output range: [ 0, 255]
+// compound second pass output range: [ -4848, 8378]
+//
+// Bitdepth: 10 Input range: [ 0, 1023]
+// intermediate range: [ -48081, 179025]
+// first pass output range: [ -6010, 22378]
+// intermediate range: [-2103516, 4198620]
+// second pass output range: [ 0, 1023]
+// compound second pass output range: [ 8142, 57378]
+//
+// Bitdepth: 12 Input range: [ 0, 4095]
+// intermediate range: [ -192465, 716625]
+// first pass output range: [ -6015, 22395]
+// intermediate range: [-2105190, 4201830]
+// second pass output range: [ 0, 4095]
+// compound second pass output range: [ 8129, 57403]
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
+ const int source_width, const int source_height,
+ const int* LIBGAV1_RESTRICT const warp_params,
+ const int subsampling_x, const int subsampling_y,
+ const int block_start_x, const int block_start_y,
+ const int block_width, const int block_height, const int16_t alpha,
+ const int16_t beta, const int16_t gamma, const int16_t delta,
+ void* LIBGAV1_RESTRICT dest, ptrdiff_t dest_stride) {
+ assert(block_width >= 8 && block_height >= 8);
+ if (is_compound) {
+ assert(dest_stride == block_width);
+ }
+ constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical
+ : (bitdepth == 12) ? kInterRoundBitsVertical12bpp
+ : kInterRoundBitsVertical;
+
+ // Only used for 8bpp. Allows keeping the first-pass intermediates within
+ // uint16_t. With 10/12bpp the intermediate value always requires int32_t.
+ constexpr int first_pass_offset = (bitdepth == 8) ? 1 << 14 : 0;
+ constexpr int offset_removal =
+ (first_pass_offset >> kRoundBitsHorizontal) * 128;
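+ // The horizontal pass leaves a bias of first_pass_offset >>
+ // kRoundBitsHorizontal in each intermediate; the vertical taps sum to
+ // 1 << kFilterBits (128), so |offset_removal| is the accumulated bias to
+ // subtract after the vertical accumulation.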
+
+ constexpr int kMaxPixel = (1 << bitdepth) - 1;
+ union {
+ // |intermediate_result| is the output of the horizontal filtering and
+ // rounding. The range is within int16_t.
+ int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
+ // In the simple special cases where the samples in each row are all the
+ // same, store one sample per row in a column vector.
+ int16_t intermediate_result_column[15];
+ };
+ const auto* const src = static_cast<const Pixel*>(source);
+ source_stride /= sizeof(Pixel);
+ using DestType =
+ typename std::conditional<is_compound, uint16_t, Pixel>::type;
+ auto* dst = static_cast<DestType*>(dest);
+ if (!is_compound) dest_stride /= sizeof(dst[0]);
+
+ assert(block_width >= 8);
+ assert(block_height >= 8);
+
+ // Warp process applies for each 8x8 block (or smaller).
+ for (int start_y = block_start_y; start_y < block_start_y + block_height;
+ start_y += 8) {
+ for (int start_x = block_start_x; start_x < block_start_x + block_width;
+ start_x += 8) {
+ const int src_x = (start_x + 4) << subsampling_x;
+ const int src_y = (start_y + 4) << subsampling_y;
+ const WarpFilterParams filter_params = GetWarpFilterParams(
+ src_x, src_y, subsampling_x, subsampling_y, warp_params);
+
+ // A prediction block may fall outside the frame's boundaries. If a
+ // prediction block is calculated using only samples outside the frame's
+ // boundary, the filtering can be simplified. We can divide the plane
+ // into several regions and handle them differently.
+ //
+ // | |
+ // 1 | 3 | 1
+ // | |
+ // -------+-----------+-------
+ // |***********|
+ // 2 |*****4*****| 2
+ // |***********|
+ // -------+-----------+-------
+ // | |
+ // 1 | 3 | 1
+ // | |
+ //
+ // At the center, region 4 represents the frame and is the general case.
+ //
+ // In regions 1 and 2, the prediction block is outside the frame's
+ // boundary horizontally. Therefore the horizontal filtering can be
+ // simplified. Furthermore, in the region 1 (at the four corners), the
+ // prediction is outside the frame's boundary both horizontally and
+ // vertically, so we get a constant prediction block.
+ //
+ // In region 3, the prediction block is outside the frame's boundary
+ // vertically. Unfortunately because we apply the horizontal filters
+ // first, by the time we apply the vertical filters, they no longer see
+ // simple inputs. So the only simplification is that all the rows are
+ // the same, but we still need to apply all the horizontal and vertical
+ // filters.
+
+ // Check for two simple special cases, where the horizontal filter can
+ // be significantly simplified.
+ //
+ // In general, for each row, the horizontal filter is calculated as
+ // follows:
+ // for (int x = -4; x < 4; ++x) {
+ // const int offset = ...;
+ // int sum = first_pass_offset;
+ // for (int k = 0; k < 8; ++k) {
+ // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+ // sum += kWarpedFilters[offset][k] * src_row[column];
+ // }
+ // ...
+ // }
+ // The column index before clipping, ix4 + x + k - 3, varies in the range
+ // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+ // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+ // border index (source_width - 1 or 0, respectively). Then for each x,
+ // the inner for loop of the horizontal filter is reduced to multiplying
+ // the border pixel by the sum of the filter coefficients.
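+ // In that reduced form, sum == first_pass_offset + border_pixel * 128,
+ // since each kWarpedFilters row sums to 1 << kFilterBits.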
+ if (filter_params.ix4 - 7 >= source_width - 1 ||
+ filter_params.ix4 + 7 <= 0) {
+ // Regions 1 and 2.
+ // Points to the left or right border of the first row of |src|.
+ const Pixel* first_row_border =
+ (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
+ // Region 1.
+ // Every sample used to calculate the prediction block has the same
+ // value. So the whole prediction block has the same value.
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const Pixel row_border_pixel = first_row_border[row * source_stride];
+ DestType* dst_row = dst + start_x - block_start_x;
+ if (is_compound) {
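+ // Each filter pass scales a constant input by taps summing to
+ // 1 << kFilterBits, i.e. 1 << 14 across both passes, so the net effect
+ // is a single shift by 14 minus the two rounding shifts.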
+ int sum = row_border_pixel
+ << ((14 - kRoundBitsHorizontal) - kRoundBitsVertical);
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ Memset(dst_row, sum, 8);
+ } else {
+ Memset(dst_row, row_border_pixel, 8);
+ }
+ const DestType* const first_dst_row = dst_row;
+ dst_row += dest_stride;
+ for (int y = 1; y < 8; ++y) {
+ memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
+ dst_row += dest_stride;
+ }
+ // End of region 1. Continue the |start_x| for loop.
+ continue;
+ }
+
+ // Region 2.
+ // Horizontal filter.
+ // The input values in this region are generated by extending the border
+ // which makes them identical in the horizontal direction. This
+ // computation could be inlined in the vertical pass but most
+ // implementations will need a transpose of some sort.
+ // It is not necessary to use the offset values here because the
+ // horizontal pass is a simple shift and the vertical pass will always
+ // require using 32 bits.
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved below.
+ const int row = filter_params.iy4 + y;
+ int sum = first_row_border[row * source_stride];
+ sum <<= kFilterBits - kRoundBitsHorizontal;
+ intermediate_result_column[y + 7] = sum;
+ }
+ // Vertical filter.
+ DestType* dst_row = dst + start_x - block_start_x;
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ assert(offset >= 0);
+ assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+ int sum = 0;
+ for (int k = 0; k < 8; ++k) {
+ sum +=
+ kWarpedFilters[offset][k] * intermediate_result_column[y + k];
+ }
+ sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+ if (is_compound) {
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dst_row[x] = static_cast<DestType>(sum);
+ } else {
+ dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
+ }
+ sy += gamma;
+ }
+ dst_row += dest_stride;
+ sy4 += delta;
+ }
+ // End of region 2. Continue the |start_x| for loop.
+ continue;
+ }
+
+ // Regions 3 and 4.
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+ // It follows that -6 <= ix4 <= source_width + 5. This inequality is
+ // used below.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
+ // Region 3.
+ // Horizontal filter.
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const Pixel* const src_row = src + row * source_stride;
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ int sx = sx4 - MultiplyBy4(alpha);
+ for (int x = -4; x < 4; ++x) {
+ const int offset =
+ RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ // Since alpha and beta have been validated by SetupShear(), one
+ // can prove that 0 <= offset <= 3 * 2^6.
+ assert(offset >= 0);
+ assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+ // For SIMD optimization:
+ // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp.
+ // For 10/12 bit, the range of sum requires 32 bits.
+ int sum = first_pass_offset;
+ for (int k = 0; k < 8; ++k) {
+ // We assume the source frame has left and right borders of at
+ // least 13 pixels that extend the frame boundary pixels.
+ //
+ // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on
+ // ix4 above, we have
+ // -13 <= ix4 + x + k - 3 <= source_width + 12,
+ // or
+ // -13 <= column <= (source_width - 1) + 13.
+ // Therefore we may over-read up to 13 pixels before the source
+ // row, or up to 13 pixels after the source row.
+ const int column = filter_params.ix4 + x + k - 3;
+ sum += kWarpedFilters[offset][k] * src_row[column];
+ }
+ intermediate_result[y + 7][x + 4] =
+ RightShiftWithRounding(sum, kRoundBitsHorizontal);
+ sx += alpha;
+ }
+ sx4 += beta;
+ }
+ } else {
+ // Region 4.
+ // Horizontal filter.
+ // At this point, we know iy4 - 7 < source_height - 1 and iy4 + 7 > 0.
+ // It follows that -6 <= iy4 <= source_height + 5. This inequality is
+ // used below.
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ // We assume the source frame has top and bottom borders of at least
+ // 13 pixels that extend the frame boundary pixels.
+ //
+ // Since -7 <= y <= 7, using the inequality on iy4 above, we have
+ // -13 <= iy4 + y <= source_height + 12,
+ // or
+ // -13 <= row <= (source_height - 1) + 13.
+ // Therefore we may over-read up to 13 pixels above the top source
+ // row, or up to 13 pixels below the bottom source row.
+ const int row = filter_params.iy4 + y;
+ const Pixel* const src_row = src + row * source_stride;
+ int sx = sx4 - MultiplyBy4(alpha);
+ for (int x = -4; x < 4; ++x) {
+ const int offset =
+ RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ // Since alpha and beta have been validated by SetupShear(), one
+ // can prove that 0 <= offset <= 3 * 2^6.
+ assert(offset >= 0);
+ assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+ // For SIMD optimization:
+ // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp.
+ // For 10/12 bit, the range of sum requires 32 bits.
+ int sum = first_pass_offset;
+ for (int k = 0; k < 8; ++k) {
+ // We assume the source frame has left and right borders of at
+ // least 13 pixels that extend the frame boundary pixels.
+ //
+ // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on
+ // ix4 above, we have
+ // -13 <= ix4 + x + k - 3 <= source_width + 12,
+ // or
+ // -13 <= column <= (source_width - 1) + 13.
+ // Therefore we may over-read up to 13 pixels before the source
+ // row, or up to 13 pixels after the source row.
+ const int column = filter_params.ix4 + x + k - 3;
+ sum += kWarpedFilters[offset][k] * src_row[column];
+ }
+ // Keep the |first_pass_offset| bias in the intermediates; it is
+ // removed once, after the vertical accumulation (see |offset_removal|
+ // below). Subtracting it here would also overflow int16_t for 8bpp,
+ // contradicting the first pass output range listed above.
+ intermediate_result[y + 7][x + 4] =
+ RightShiftWithRounding(sum, kRoundBitsHorizontal);
+ sx += alpha;
+ }
+ sx4 += beta;
+ }
+ }
+
+ // Regions 3 and 4.
+ // Vertical filter.
+ DestType* dst_row = dst + start_x - block_start_x;
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
+ // The spec says we should use the following loop condition:
+ // y < std::min(4, block_start_y + block_height - start_y - 4);
+ // We can prove that block_start_y + block_height - start_y >= 8, which
+ // implies std::min(4, block_start_y + block_height - start_y - 4) = 4.
+ // So the loop condition is simply y < 4.
+ //
+ // Proof:
+ // start_y < block_start_y + block_height
+ // => block_start_y + block_height - start_y > 0
+ // => block_height - (start_y - block_start_y) > 0
+ //
+ // Since block_height >= 8 and is a power of 2, it follows that
+ // block_height is a multiple of 8. start_y - block_start_y is also a
+ // multiple of 8. Therefore their difference is a multiple of 8. Since
+ // their difference is > 0, their difference must be >= 8.
+ //
+ // We then add an offset of 4 to y so that the loop starts with y = 0
+ // and continues if y < 8.
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ // The spec says we should use the following loop condition:
+ // x < std::min(4, block_start_x + block_width - start_x - 4);
+ // Similar to the above, we can prove that the loop condition can be
+ // simplified to x < 4.
+ //
+ // We then add an offset of 4 to x so that the loop starts with x = 0
+ // and continues if x < 8.
+ for (int x = 0; x < 8; ++x) {
+ const int offset =
+ RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ // Since gamma and delta have been validated by SetupShear(), one can
+ // prove that 0 <= offset <= 3 * 2^6.
+ assert(offset >= 0);
+ assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+ int sum = 0;
+ for (int k = 0; k < 8; ++k) {
+ sum += kWarpedFilters[offset][k] * intermediate_result[y + k][x];
+ }
+ sum -= offset_removal;
+ sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+ if (is_compound) {
+ sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+ dst_row[x] = static_cast<DestType>(sum);
+ } else {
+ dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
+ }
+ sy += gamma;
+ }
+ dst_row += dest_stride;
+ sy4 += delta;
+ }
+ }
+ dst += 8 * dest_stride;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_Warp
+ dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WarpCompound
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_Warp
+ dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WarpCompound
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->warp = Warp_C</*is_compound=*/false, 12, uint16_t>;
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_Warp
+ dsp->warp = Warp_C</*is_compound=*/false, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WarpCompound
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 12, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+void WarpInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_WARP_H_
+#define LIBGAV1_SRC_DSP_WARP_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/warp_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/warp_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Section 7.11.3.5.
+struct WarpFilterParams {
+ int64_t x4;
+ int64_t y4;
+ int ix4;
+ int iy4;
+};
+
+// Initializes Dsp::warp. This function is not thread-safe.
+void WarpInit_C();
+
+// Section 7.11.3.5.
+inline WarpFilterParams GetWarpFilterParams(int src_x, int src_y,
+ int subsampling_x,
+ int subsampling_y,
+ const int* warp_params) {
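+ // As an illustration, assuming kWarpedModelPrecisionBits == 16: the
+ // identity model warp_params == {0, 0, 1 << 16, 0, 0, 1 << 16} with no
+ // subsampling yields dst_x == src_x << 16, so ix4 == src_x and
+ // iy4 == src_y.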
+ WarpFilterParams filter_params;
+ // warp_params[2]/[5] require 17 bits (the others 14). With large resolutions
+ // the result of the multiplication will require 33 bits.
+ const int64_t dst_x = static_cast<int64_t>(src_x) * warp_params[2] +
+ src_y * warp_params[3] + warp_params[0];
+ const int64_t dst_y = src_x * warp_params[4] +
+ static_cast<int64_t>(src_y) * warp_params[5] +
+ warp_params[1];
+ filter_params.x4 = dst_x >> subsampling_x;
+ filter_params.y4 = dst_y >> subsampling_y;
+ filter_params.ix4 =
+ static_cast<int>(filter_params.x4 >> kWarpedModelPrecisionBits);
+ filter_params.iy4 =
+ static_cast<int>(filter_params.y4 >> kWarpedModelPrecisionBits);
+ return filter_params;
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_WARP_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/base/macros.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/post_filter.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kSourceBorderHorizontal = 16;
+constexpr int kSourceBorderVertical = 13;
+
+constexpr int kMaxSourceBlockWidth =
+ kMaxSuperBlockSizeInPixels + kSourceBorderHorizontal * 2;
+constexpr int kMaxSourceBlockHeight =
+ kMaxSuperBlockSizeInPixels + kSourceBorderVertical * 2;
+constexpr int kMaxDestBlockWidth =
+ kMaxSuperBlockSizeInPixels + kConvolveBorderLeftTop * 2;
+constexpr int kMaxDestBlockHeight =
+ kMaxSuperBlockSizeInPixels + kConvolveBorderLeftTop * 2;
+
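+// kDivisorLookup[i] is round((1 << 14) * 256.0 / (256 + i)): reciprocals of
+// [1.0, 2.0] sampled in 1/256 steps at 14 bits of precision, from
+// kDivisorLookup[0] == 1 << 14 down to kDivisorLookup[256] == 8192.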
+constexpr uint16_t kDivisorLookup[257] = {
+ 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+ 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+ 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+ 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+ 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+ 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+ 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+ 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+ 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+ 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+ 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+ 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+ 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+ 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+ 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
+ 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
+ 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
+ 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
+ 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
+ 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
+ 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
+ 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
+ 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
+ 8240, 8224, 8208, 8192};
+
+template <bool is_compound>
+const char* GetDigest8bpp(int id) {
+ static const char* const kDigest[] = {
+ "77ba358a0f5e19a8e69fa0a95712578e", "141b23d13a04e0b84d26d514de76d6b0",
+ "b0265858454b979852ffadae323f0fb7", "9cf38e3579265b656f1f2100ba15b0e9",
+ "ab51d05cc255ef8e37921182df1d89b1", "e3e96f90a4b07ca733e40f057dc01c41",
+ "4eee8c1a52a62a266db9b1c9338e124c", "901a87d8f88f6324dbc0960a6de861ac",
+ "da9cb6faf6adaeeae12b6784f39186c5", "14450ab05536cdb0d2f499716ccb559d",
+ "566b396cbf008bbb869b364fdc81860d", "681a872baf2de4e58d73ea9ab8643a72",
+ "7f17d290d513a7416761b3a01f10fd2f",
+ };
+ static const char* const kCompoundDigest[] = {
+ "7e9339d265b7beac7bbe32fe7bb0fccb", "f747d663b427bb38a3ff36b0815a394c",
+ "858cf54d2253281a919fbdb48fe91c53", "4721dd97a212c6068bd488f400259afc",
+ "36878c7906492bc740112abdea77616f", "89deb68aa35764bbf3024b501a6bed50",
+ "8ac5b08f9b2afd38143c357646af0f82", "bf6e2a64835ea0c9d7467394253d0eb2",
+ "7b0a539acd2a27eff398dd084abad933", "61c8d81b397c1cf727ff8a9fabab90af",
+ "4d412349a25a832c1fb3fb29e3f0e2b3", "2c6dd2a9a4ede9fa00adb567ba646f30",
+ "b2a0ce68db3cadd207299f73112bed74",
+ };
+ assert(id >= 0);
+ assert(id < static_cast<int>(sizeof(kDigest) / sizeof(kDigest[0])));
+ return is_compound ? kCompoundDigest[id] : kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <bool is_compound>
+const char* GetDigest10bpp(int id) {
+ static const char* const kDigest[] = {
+ "1fef54f56a0bafccf7f8da1ac3b18b76", "8a65c72f171feafa2f393d31d6b7fe1b",
+ "808019346f2f1f45f8cf2e9fc9a49320", "c28e2f2c6c830a29bcc2452166cba521",
+ "f040674d6f54e8910d655f0d11fd8cdd", "473af9bb1c6023965c2284b716feef97",
+ "e4f6d7babd0813d5afb0f575ebfa8166", "58f96ef8a880963a213624bb0d06d47c",
+ "1ec0995fa4490628b679d03683233388", "9526fb102fde7dc1a7e160e65af6da33",
+ "f0457427d0c0e31d82ea4f612f7f86f1", "ddc82ae298cccebad493ba9de0f69fbd",
+ "5ed615091e2f62df26de7e91a985cb81",
+ };
+ static const char* const kCompoundDigest[] = {
+ "8e6986ae143260e0b8b4887f15a141a1", "0a7f0db8316b8c3569f08834dd0c6f50",
+ "90705b2e7dbe083e8a1f70f29d6f257e", "e428a75bea77d769d21f3f7a1d2b0b38",
+ "a570b13d790c085c4ab50d71dd085d56", "e5d043c6cd6ff6dbab6e38a8877e93bd",
+ "12ea96991e46e3e9aa78ab812ffa0525", "84293a94a53f1cf814fa25e793c3fe27",
+ "b98a7502c84ac8437266f702dcc0a92e", "d8db5d52e9b0a5be0ad2d517d5bd16e9",
+ "f3be504bbb609ce4cc71c5539252638a", "fcde83b54e14e9de23460644f244b047",
+ "42eb66e752e9ef289b47053b5c73fdd6",
+ };
+ assert(id >= 0);
+ assert(id < static_cast<int>(sizeof(kDigest) / sizeof(kDigest[0])));
+ return is_compound ? kCompoundDigest[id] : kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+template <bool is_compound>
+const char* GetDigest12bpp(int id) {
+ static const char* const kDigest[] = {
+ "cd5d5e2102b8917ad70778f523d24bdf", "374a5f1b53a3fdf2eefa741eb71e6889",
+ "311636841770ec2427084891df96bee5", "c40c537917b1f0d1d84c99dfcecd8219",
+ "a1d9bb920e6c3d20c0cf84adc18e1f15", "13b5659acdb39b717526cb358c6f4026",
+ "f81ea4f6fd1f4ebed1262e3fae37b5bb", "c1452fefcd9b9562fe3a0b7f9302809c",
+ "8fed8a3159dc7b6b59a39ab2be6bee13", "b46458bc0e5cf1cee92aac4f0f608749",
+ "2e6a1039ab111add89f5b44b13565f40", "9c666691860bdc89b03f601b40126196",
+ "418a47157d992b94c302ca2e2f6ee07e",
+ };
+ static const char* const kCompoundDigest[] = {
+ "8e6986ae143260e0b8b4887f15a141a1", "0a7f0db8316b8c3569f08834dd0c6f50",
+ "90705b2e7dbe083e8a1f70f29d6f257e", "e428a75bea77d769d21f3f7a1d2b0b38",
+ "a570b13d790c085c4ab50d71dd085d56", "e5d043c6cd6ff6dbab6e38a8877e93bd",
+ "12ea96991e46e3e9aa78ab812ffa0525", "84293a94a53f1cf814fa25e793c3fe27",
+ "b98a7502c84ac8437266f702dcc0a92e", "d8db5d52e9b0a5be0ad2d517d5bd16e9",
+ "f3be504bbb609ce4cc71c5539252638a", "fcde83b54e14e9de23460644f244b047",
+ "42eb66e752e9ef289b47053b5c73fdd6",
+ };
+ assert(id >= 0);
+ assert(id < static_cast<int>(sizeof(kDigest) / sizeof(kDigest[0])));
+ return is_compound ? kCompoundDigest[id] : kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+int RandomWarpedParam(int seed_offset, int bits) {
+ libvpx_test::ACMRandom rnd(seed_offset +
+ libvpx_test::ACMRandom::DeterministicSeed());
+ // 1 in 8 chance of generating zero (arbitrary).
+ const bool zero = (rnd.Rand16() & 7) == 0;
+ if (zero) return 0;
+ // Generate uniform values in the range [-(1 << bits), -1] U [1, 1 << bits].
+ const int mask = (1 << bits) - 1;
+ const int value = 1 + (rnd.RandRange(1u << 31) & mask);
+ const bool sign = (rnd.Rand16() & 1) != 0;
+ return sign ? value : -value;
+}
+
+// This function is a copy from warp_prediction.cc.
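+// For example, value == 24 gives n == 4 and e == 8; with kDivisorLookupBits
+// == 8, entry == 8 << 4 == 128 and *division_factor == kDivisorLookup[128] ==
+// 10923. Assuming kDivisorLookupPrecisionBits == 14, *division_shift == 18,
+// and 10923 / (1 << 18) ~= 1 / 24.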
+template <typename T>
+void GenerateApproximateDivisor(T value, int16_t* division_factor,
+ int16_t* division_shift) {
+ const int n = FloorLog2(std::abs(value));
+ const T e = std::abs(value) - (static_cast<T>(1) << n);
+ const int entry = (n > kDivisorLookupBits)
+ ? RightShiftWithRounding(e, n - kDivisorLookupBits)
+ : static_cast<int>(e << (kDivisorLookupBits - n));
+ *division_shift = n + kDivisorLookupPrecisionBits;
+ *division_factor =
+ (value < 0) ? -kDivisorLookup[entry] : kDivisorLookup[entry];
+}
+
+// This function is a copy from warp_prediction.cc.
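+// It rounds |value| to a multiple of 1 << kWarpParamRoundingBits (a multiple
+// of 64, assuming kWarpParamRoundingBits == 6 as in the spec's
+// WARP_PARAM_REDUCE_BITS).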
+int16_t GetShearParameter(int value) {
+ return static_cast<int16_t>(
+ LeftShift(RightShiftWithRoundingSigned(value, kWarpParamRoundingBits),
+ kWarpParamRoundingBits));
+}
+
+// This function is a copy from warp_prediction.cc.
+// This function is used here to help generate valid warp parameters.
+bool SetupShear(const int* params, int16_t* alpha, int16_t* beta,
+ int16_t* gamma, int16_t* delta) {
+ int16_t division_shift;
+ int16_t division_factor;
+ GenerateApproximateDivisor<int32_t>(params[2], &division_factor,
+ &division_shift);
+ const int alpha0 =
+ Clip3(params[2] - (1 << kWarpedModelPrecisionBits), INT16_MIN, INT16_MAX);
+ const int beta0 = Clip3(params[3], INT16_MIN, INT16_MAX);
+ const int64_t v = LeftShift(params[4], kWarpedModelPrecisionBits);
+ const int gamma0 =
+ Clip3(RightShiftWithRoundingSigned(v * division_factor, division_shift),
+ INT16_MIN, INT16_MAX);
+ const int64_t w = static_cast<int64_t>(params[3]) * params[4];
+ const int delta0 = Clip3(
+ params[5] -
+ RightShiftWithRoundingSigned(w * division_factor, division_shift) -
+ (1 << kWarpedModelPrecisionBits),
+ INT16_MIN, INT16_MAX);
+
+ *alpha = GetShearParameter(alpha0);
+ *beta = GetShearParameter(beta0);
+ *gamma = GetShearParameter(gamma0);
+ *delta = GetShearParameter(delta0);
+ if ((4 * std::abs(*alpha) + 7 * std::abs(*beta) >=
+ (1 << kWarpedModelPrecisionBits)) ||
+ (4 * std::abs(*gamma) + 4 * std::abs(*delta) >=
+ (1 << kWarpedModelPrecisionBits))) {
+ return false; // NOLINT (easier condition to understand).
+ }
+
+ return true;
+}
+
+void GenerateWarpedModel(int* params, int16_t* alpha, int16_t* beta,
+ int16_t* gamma, int16_t* delta, int seed) {
+ do {
+ params[0] = RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6);
+ params[1] = RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6);
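+ // params[2] and params[5] are the diagonal terms of the affine matrix;
+ // adding 1 << kWarpedModelPrecisionBits centers them around 1.0 in fixed
+ // point.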
+ params[2] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) +
+ (1 << kWarpedModelPrecisionBits);
+ params[3] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3);
+ params[4] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3);
+ params[5] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) +
+ (1 << kWarpedModelPrecisionBits);
+ ++seed;
+ } while (params[2] == 0 || !SetupShear(params, alpha, beta, gamma, delta));
+}
+
+struct WarpTestParam {
+ WarpTestParam(int width, int height) : width(width), height(height) {}
+ int width;
+ int height;
+};
+
+template <bool is_compound, int bitdepth, typename Pixel>
+class WarpTest : public testing::TestWithParam<WarpTestParam> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ WarpTest() = default;
+ ~WarpTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ WarpInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ WarpInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ WarpInit_SSE4_1();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ func_ = is_compound ? dsp->warp_compound : dsp->warp;
+ }
+
+ protected:
+ using DestType =
+ typename std::conditional<is_compound, uint16_t, Pixel>::type;
+
+ void SetInputData(bool use_fixed_values, int value);
+ void Test(bool use_fixed_values, int value, int num_runs = 1);
+ void TestFixedValues();
+ void TestRandomValues();
+ void TestSpeed();
+
+ const WarpTestParam param_ = GetParam();
+
+ private:
+ int warp_params_[8];
+ dsp::WarpFunc func_;
+ // Warp filters are 7-tap, which needs 3 pixels (kConvolveBorderLeftTop)
+ // padding. Destination buffer indices are based on subsampling values (x+y):
+ // 0: (4:4:4), 1:(4:2:2), 2: (4:2:0).
+ Pixel source_[kMaxSourceBlockHeight * kMaxSourceBlockWidth] = {};
+ DestType dest_[3][kMaxDestBlockHeight * kMaxDestBlockWidth] = {};
+};
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::SetInputData(bool use_fixed_values,
+ int value) {
+ if (use_fixed_values) {
+ for (int y = 0; y < param_.height; ++y) {
+ const int row = kSourceBorderVertical + y;
+ Memset(source_ + row * kMaxSourceBlockWidth + kSourceBorderHorizontal,
+ value, param_.width);
+ }
+ } else {
+ const int mask = (1 << bitdepth) - 1;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ for (int y = 0; y < param_.height; ++y) {
+ const int row = kSourceBorderVertical + y;
+ for (int x = 0; x < param_.width; ++x) {
+ const int column = kSourceBorderHorizontal + x;
+ source_[row * kMaxSourceBlockWidth + column] = rnd.Rand16() & mask;
+ }
+ }
+ }
+ PostFilter::ExtendFrame<Pixel>(
+ &source_[kSourceBorderVertical * kMaxSourceBlockWidth +
+ kSourceBorderHorizontal],
+ param_.width, param_.height, kMaxSourceBlockWidth,
+ kSourceBorderHorizontal, kSourceBorderHorizontal, kSourceBorderVertical,
+ kSourceBorderVertical);
+}
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::Test(bool use_fixed_values,
+ int value,
+ int num_runs /*= 1*/) {
+ if (func_ == nullptr) return;
+ SetInputData(use_fixed_values, value);
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ const int source_offset =
+ kSourceBorderVertical * kMaxSourceBlockWidth + kSourceBorderHorizontal;
+ const int dest_offset =
+ kConvolveBorderLeftTop * kMaxDestBlockWidth + kConvolveBorderLeftTop;
+ const Pixel* const src = source_ + source_offset;
+ const ptrdiff_t src_stride = kMaxSourceBlockWidth * sizeof(Pixel);
+ const ptrdiff_t dst_stride =
+ is_compound ? kMaxDestBlockWidth : kMaxDestBlockWidth * sizeof(Pixel);
+
+ absl::Duration elapsed_time;
+ for (int subsampling_x = 0; subsampling_x <= 1; ++subsampling_x) {
+ for (int subsampling_y = 0; subsampling_y <= 1; ++subsampling_y) {
+ if (subsampling_x == 0 && subsampling_y == 1) {
+ // When both are 0: 4:4:4
+ // When both are 1: 4:2:0
+ // When only |subsampling_x| is 1: 4:2:2
+ // Having only |subsampling_y| == 1 is unsupported.
+ continue;
+ }
+ int params[8];
+ int16_t alpha;
+ int16_t beta;
+ int16_t gamma;
+ int16_t delta;
+ GenerateWarpedModel(params, &alpha, &beta, &gamma, &delta, rnd.Rand8());
+
+ const int dest_id = subsampling_x + subsampling_y;
+ DestType* const dst = dest_[dest_id] + dest_offset;
+ const absl::Time start = absl::Now();
+ for (int n = 0; n < num_runs; ++n) {
+ func_(src, src_stride, param_.width, param_.height, params,
+ subsampling_x, subsampling_y, 0, 0, param_.width, param_.height,
+ alpha, beta, gamma, delta, dst, dst_stride);
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ }
+
+ if (use_fixed_values) {
+ // For fixed values, input and output are identical.
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(dest_); ++i) {
+      // Compound output holds a few more bits of precision and an offset
+      // value.
+ Pixel compensated_dest[kMaxDestBlockWidth * kMaxDestBlockHeight];
+ const int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset;
+ if (is_compound) {
+ for (int y = 0; y < param_.height; ++y) {
+ for (int x = 0; x < param_.width; ++x) {
+ const int compound_value =
+ dest_[i][dest_offset + y * kMaxDestBlockWidth + x];
+ const int remove_offset = compound_value - compound_offset;
+ const int full_shift =
+ remove_offset >>
+ (kInterRoundBitsVertical - kInterRoundBitsCompoundVertical);
+ compensated_dest[y * kMaxDestBlockWidth + x] =
+ Clip3(full_shift, 0, (1 << bitdepth) - 1);
+ }
+ }
+ }
+ Pixel* pixel_dest =
+ is_compound ? compensated_dest
+ : reinterpret_cast<Pixel*>(dest_[i] + dest_offset);
+ const bool success = test_utils::CompareBlocks(
+ src, pixel_dest, param_.width, param_.height, kMaxSourceBlockWidth,
+ kMaxDestBlockWidth, false);
+ EXPECT_TRUE(success) << "subsampling_x + subsampling_y: " << i;
+ }
+ } else {
+ // (width, height):
+ // (8, 8), id = 0. (8, 16), id = 1. (16, 8), id = 2.
+ // (16, 16), id = 3. (16, 32), id = 4. (32, 16), id = 5.
+ // ...
+ // (128, 128), id = 12.
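+    // Square blocks take id 3 * (log2(width) - 3); tall (width < height) and
+    // wide (width > height) blocks add 1 and 2 respectively.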
+ int id;
+ if (param_.width == param_.height) {
+ id = 3 * static_cast<int>(FloorLog2(param_.width) - 3);
+ } else if (param_.width < param_.height) {
+ id = 1 + 3 * static_cast<int>(FloorLog2(param_.width) - 3);
+ } else {
+ id = 2 + 3 * static_cast<int>(FloorLog2(param_.height) - 3);
+ }
+
+ const char* expected_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ expected_digest = GetDigest8bpp<is_compound>(id);
+ break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+ expected_digest = GetDigest10bpp<is_compound>(id);
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ expected_digest = GetDigest12bpp<is_compound>(id);
+ break;
+#endif
+ }
+ ASSERT_NE(expected_digest, nullptr);
+ test_utils::CheckMd5Digest(
+ "Warp", absl::StrFormat("%dx%d", param_.width, param_.height).c_str(),
+ expected_digest, dest_, sizeof(dest_), elapsed_time);
+ }
+}
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::TestFixedValues() {
+ Test(true, 0);
+ Test(true, 1);
+ Test(true, 128);
+ Test(true, (1 << bitdepth) - 1);
+}
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::TestRandomValues() {
+ Test(false, 0);
+}
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::TestSpeed() {
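+  // Scale the run count so that each block size processes roughly 10 million
+  // pixels.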
+ const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+ Test(false, 0, num_runs);
+}
+
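+// Computes the exact output bounds of |filter| over inputs constrained to
+// [min_input, max_input]: positive taps contribute the matching extreme and
+// negative taps the opposite one.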
+void ApplyFilterToSignedInput(const int min_input, const int max_input,
+ const int8_t filter[kSubPixelTaps],
+ int* min_output, int* max_output) {
+ int min = 0, max = 0;
+ for (int i = 0; i < kSubPixelTaps; ++i) {
+ const int tap = filter[i];
+ if (tap > 0) {
+ max += max_input * tap;
+ min += min_input * tap;
+ } else {
+ min += max_input * tap;
+ max += min_input * tap;
+ }
+ }
+ *min_output = min;
+ *max_output = max;
+}
+
+void ApplyFilterToUnsignedInput(const int max_input,
+ const int8_t filter[kSubPixelTaps],
+ int* min_output, int* max_output) {
+ ApplyFilterToSignedInput(0, max_input, filter, min_output, max_output);
+}
+
+// Validate the maximum ranges for different parts of the Warp process.
+template <int bitdepth>
+void ShowRange() {
+ constexpr int horizontal_bits = (bitdepth == kBitdepth12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int vertical_bits = (bitdepth == kBitdepth12)
+ ? kInterRoundBitsVertical12bpp
+ : kInterRoundBitsVertical;
+ constexpr int compound_vertical_bits = kInterRoundBitsCompoundVertical;
+
+ constexpr int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset;
+
+ constexpr int max_input = (1 << bitdepth) - 1;
+
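+  // Index 93 is taken here as the filter whose taps produce the most extreme
+  // positive and negative sums (see WarpedFilterCoefficientSums below).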
+ const int8_t* worst_warp_filter = kWarpedFilters8[93];
+
+ // First pass.
+ printf("Bitdepth: %2d Input range: [%8d, %8d]\n", bitdepth, 0,
+ max_input);
+
+ int min = 0, max = 0;
+ ApplyFilterToUnsignedInput(max_input, worst_warp_filter, &min, &max);
+
+ int first_pass_offset;
+ if (bitdepth == 8) {
+    // Find the smallest power-of-two offset that makes the minimum 8bpp
+    // intermediate value non-negative.
+    for (first_pass_offset = 1; -first_pass_offset > min;
+         first_pass_offset <<= 1) {
+    }
+ printf(" 8bpp intermediate offset: %d.\n", first_pass_offset);
+ min += first_pass_offset;
+ max += first_pass_offset;
+ assert(min > 0);
+ assert(max < UINT16_MAX);
+ } else {
+ // 10bpp and 12bpp require int32_t for the intermediate values. Adding an
+ // offset is not required.
+ assert(min > INT32_MIN);
+ assert(max > INT16_MAX && max < INT32_MAX);
+ }
+
+ printf(" intermediate range: [%8d, %8d]\n", min, max);
+
+ const int first_pass_min = RightShiftWithRounding(min, horizontal_bits);
+ const int first_pass_max = RightShiftWithRounding(max, horizontal_bits);
+
+ printf(" first pass output range: [%8d, %8d]\n", first_pass_min,
+ first_pass_max);
+
+ // Second pass.
+ if (bitdepth == 8) {
+ ApplyFilterToUnsignedInput(first_pass_max, worst_warp_filter, &min, &max);
+ } else {
+ ApplyFilterToSignedInput(first_pass_min, first_pass_max, worst_warp_filter,
+ &min, &max);
+ }
+
+ if (bitdepth == 8) {
+ // Remove the offset that was applied in the first pass since we must use
+ // int32_t for this phase anyway. 128 is the sum of the filter taps.
+ const int offset_removal = (first_pass_offset >> horizontal_bits) * 128;
+ printf(" 8bpp intermediate offset removal: %d.\n", offset_removal);
+ max -= offset_removal;
+ min -= offset_removal;
+ assert(min < INT16_MIN && min > INT32_MIN);
+ assert(max > INT16_MAX && max < INT32_MAX);
+ } else {
+ // 10bpp and 12bpp require int32_t for the intermediate values. Adding an
+ // offset is not required.
+ assert(min > INT32_MIN);
+ assert(max > INT16_MAX && max < INT32_MAX);
+ }
+
+ printf(" intermediate range: [%8d, %8d]\n", min, max);
+
+ // Second pass non-compound output is clipped to Pixel values.
+ const int second_pass_min =
+ Clip3(RightShiftWithRounding(min, vertical_bits), 0, max_input);
+ const int second_pass_max =
+ Clip3(RightShiftWithRounding(max, vertical_bits), 0, max_input);
+ printf(" second pass output range: [%8d, %8d]\n", second_pass_min,
+ second_pass_max);
+
+ // Output is Pixel so matches Pixel values.
+ assert(second_pass_min == 0);
+ assert(second_pass_max == max_input);
+
+ const int compound_second_pass_min =
+ RightShiftWithRounding(min, compound_vertical_bits) + compound_offset;
+ const int compound_second_pass_max =
+ RightShiftWithRounding(max, compound_vertical_bits) + compound_offset;
+
+ printf(" compound second pass output range: [%8d, %8d]\n",
+ compound_second_pass_min, compound_second_pass_max);
+
+ if (bitdepth == 8) {
+ // 8bpp output is int16_t without an offset.
+ assert(compound_second_pass_min > INT16_MIN);
+ assert(compound_second_pass_max < INT16_MAX);
+ } else {
+ // 10bpp and 12bpp use the offset to fit inside uint16_t.
+ assert(compound_second_pass_min > 0);
+ assert(compound_second_pass_max < UINT16_MAX);
+ }
+
+ printf("\n");
+}
+
+TEST(WarpTest, ShowRange) {
+ ShowRange<kBitdepth8>();
+ ShowRange<kBitdepth10>();
+ ShowRange<kBitdepth12>();
+}
+
+using WarpTest8bpp = WarpTest</*is_compound=*/false, 8, uint8_t>;
+// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical via
+// WarpCompoundTest.
+// using WarpCompoundTest8bpp = WarpTest</*is_compound=*/true, 8, uint8_t>;
+
+// Verifies the sum of the warped filter coefficients is 128 for every filter.
+//
+// Verifies the properties used in the calculation of ranges of variables in
+// the block warp process:
+// * The maximum sum of the positive warped filter coefficients is 175.
+// * The minimum (i.e., most negative) sum of the negative warped filter
+// coefficients is -47.
+//
+// NOTE: This test is independent of the bitdepth and the implementation of the
+// block warp function, so it just needs to be a test in the WarpTest8bpp class
+// and does not need to be defined with TEST_P.
+TEST(WarpTest8bpp, WarpedFilterCoefficientSums) {
+ int max_positive_sum = 0;
+ int min_negative_sum = 0;
+ for (const auto& filter : kWarpedFilters) {
+ int sum = 0;
+ int positive_sum = 0;
+ int negative_sum = 0;
+ for (const auto coefficient : filter) {
+ sum += coefficient;
+ if (coefficient > 0) {
+ positive_sum += coefficient;
+ } else {
+ negative_sum += coefficient;
+ }
+ }
+ EXPECT_EQ(sum, 128);
+ max_positive_sum = std::max(positive_sum, max_positive_sum);
+ min_negative_sum = std::min(negative_sum, min_negative_sum);
+ }
+ EXPECT_EQ(max_positive_sum, 175);
+ EXPECT_EQ(min_negative_sum, -47);
+}
+
+TEST_P(WarpTest8bpp, FixedValues) { TestFixedValues(); }
+
+TEST_P(WarpTest8bpp, RandomValues) { TestRandomValues(); }
+
+TEST_P(WarpTest8bpp, DISABLED_Speed) { TestSpeed(); }
+
+const WarpTestParam warp_test_param[] = {
+ WarpTestParam(8, 8), WarpTestParam(8, 16), WarpTestParam(16, 8),
+ WarpTestParam(16, 16), WarpTestParam(16, 32), WarpTestParam(32, 16),
+ WarpTestParam(32, 32), WarpTestParam(32, 64), WarpTestParam(64, 32),
+ WarpTestParam(64, 64), WarpTestParam(64, 128), WarpTestParam(128, 64),
+ WarpTestParam(128, 128),
+};
+
+INSTANTIATE_TEST_SUITE_P(C, WarpTest8bpp, testing::ValuesIn(warp_test_param));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WarpTest8bpp,
+ testing::ValuesIn(warp_test_param));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WarpTest8bpp,
+ testing::ValuesIn(warp_test_param));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using WarpTest10bpp = WarpTest</*is_compound=*/false, 10, uint16_t>;
+// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical via
+// WarpCompoundTest.
+// using WarpCompoundTest10bpp = WarpTest</*is_compound=*/true, 10, uint16_t>;
+
+TEST_P(WarpTest10bpp, FixedValues) { TestFixedValues(); }
+
+TEST_P(WarpTest10bpp, RandomValues) { TestRandomValues(); }
+
+TEST_P(WarpTest10bpp, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WarpTest10bpp, testing::ValuesIn(warp_test_param));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WarpTest10bpp,
+ testing::ValuesIn(warp_test_param));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using WarpTest12bpp = WarpTest</*is_compound=*/false, 12, uint16_t>;
+// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical via
+// WarpCompoundTest.
+// using WarpCompoundTest12bpp = WarpTest</*is_compound=*/true, 12, uint16_t>;
+
+TEST_P(WarpTest12bpp, FixedValues) { TestFixedValues(); }
+
+TEST_P(WarpTest12bpp, RandomValues) { TestRandomValues(); }
+
+TEST_P(WarpTest12bpp, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WarpTest12bpp, testing::ValuesIn(warp_test_param));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+std::ostream& operator<<(std::ostream& os, const WarpTestParam& warp_param) {
+ return os << "BlockSize" << warp_param.width << "x" << warp_param.height;
+}
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/weight_mask.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <string>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int width, int height, int bitdepth, bool mask_is_inverse>
+void WeightMask_C(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+ const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+ static_assert(width >= 8, "");
+ static_assert(height >= 8, "");
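+  // Normalize the compound prediction difference toward pixel scale before
+  // DivideBy16() below: the shift is 4 for 8bpp and 6 for 10bpp/12bpp.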
+ constexpr int rounding_bits = bitdepth - 8 + ((bitdepth == 12) ? 2 : 4);
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const int difference = RightShiftWithRounding(
+ std::abs(pred_0[x] - pred_1[x]), rounding_bits);
+ const auto mask_value =
+ static_cast<uint8_t>(std::min(DivideBy16(difference) + 38, 64));
+ mask[x] = mask_is_inverse ? 64 - mask_value : mask_value;
+ }
+ pred_0 += width;
+ pred_1 += width;
+ mask += mask_stride;
+ }
+}
+
+#define INIT_WEIGHT_MASK(width, height, bitdepth, w_index, h_index) \
+ dsp->weight_mask[w_index][h_index][0] = \
+ WeightMask_C<width, height, bitdepth, 0>; \
+ dsp->weight_mask[w_index][h_index][1] = \
+ WeightMask_C<width, height, bitdepth, 1>
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_WEIGHT_MASK(8, 8, 8, 0, 0);
+ INIT_WEIGHT_MASK(8, 16, 8, 0, 1);
+ INIT_WEIGHT_MASK(8, 32, 8, 0, 2);
+ INIT_WEIGHT_MASK(16, 8, 8, 1, 0);
+ INIT_WEIGHT_MASK(16, 16, 8, 1, 1);
+ INIT_WEIGHT_MASK(16, 32, 8, 1, 2);
+ INIT_WEIGHT_MASK(16, 64, 8, 1, 3);
+ INIT_WEIGHT_MASK(32, 8, 8, 2, 0);
+ INIT_WEIGHT_MASK(32, 16, 8, 2, 1);
+ INIT_WEIGHT_MASK(32, 32, 8, 2, 2);
+ INIT_WEIGHT_MASK(32, 64, 8, 2, 3);
+ INIT_WEIGHT_MASK(64, 16, 8, 3, 1);
+ INIT_WEIGHT_MASK(64, 32, 8, 3, 2);
+ INIT_WEIGHT_MASK(64, 64, 8, 3, 3);
+ INIT_WEIGHT_MASK(64, 128, 8, 3, 4);
+ INIT_WEIGHT_MASK(128, 64, 8, 4, 3);
+ INIT_WEIGHT_MASK(128, 128, 8, 4, 4);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8
+ INIT_WEIGHT_MASK(8, 8, 8, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16
+ INIT_WEIGHT_MASK(8, 16, 8, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32
+ INIT_WEIGHT_MASK(8, 32, 8, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8
+ INIT_WEIGHT_MASK(16, 8, 8, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16
+ INIT_WEIGHT_MASK(16, 16, 8, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32
+ INIT_WEIGHT_MASK(16, 32, 8, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64
+ INIT_WEIGHT_MASK(16, 64, 8, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8
+ INIT_WEIGHT_MASK(32, 8, 8, 2, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16
+ INIT_WEIGHT_MASK(32, 16, 8, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32
+ INIT_WEIGHT_MASK(32, 32, 8, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64
+ INIT_WEIGHT_MASK(32, 64, 8, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16
+ INIT_WEIGHT_MASK(64, 16, 8, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32
+ INIT_WEIGHT_MASK(64, 32, 8, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64
+ INIT_WEIGHT_MASK(64, 64, 8, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128
+ INIT_WEIGHT_MASK(64, 128, 8, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64
+ INIT_WEIGHT_MASK(128, 64, 8, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128
+ INIT_WEIGHT_MASK(128, 128, 8, 4, 4);
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_WEIGHT_MASK(8, 8, 10, 0, 0);
+ INIT_WEIGHT_MASK(8, 16, 10, 0, 1);
+ INIT_WEIGHT_MASK(8, 32, 10, 0, 2);
+ INIT_WEIGHT_MASK(16, 8, 10, 1, 0);
+ INIT_WEIGHT_MASK(16, 16, 10, 1, 1);
+ INIT_WEIGHT_MASK(16, 32, 10, 1, 2);
+ INIT_WEIGHT_MASK(16, 64, 10, 1, 3);
+ INIT_WEIGHT_MASK(32, 8, 10, 2, 0);
+ INIT_WEIGHT_MASK(32, 16, 10, 2, 1);
+ INIT_WEIGHT_MASK(32, 32, 10, 2, 2);
+ INIT_WEIGHT_MASK(32, 64, 10, 2, 3);
+ INIT_WEIGHT_MASK(64, 16, 10, 3, 1);
+ INIT_WEIGHT_MASK(64, 32, 10, 3, 2);
+ INIT_WEIGHT_MASK(64, 64, 10, 3, 3);
+ INIT_WEIGHT_MASK(64, 128, 10, 3, 4);
+ INIT_WEIGHT_MASK(128, 64, 10, 4, 3);
+ INIT_WEIGHT_MASK(128, 128, 10, 4, 4);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x8
+ INIT_WEIGHT_MASK(8, 8, 10, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x16
+ INIT_WEIGHT_MASK(8, 16, 10, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x32
+ INIT_WEIGHT_MASK(8, 32, 10, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x8
+ INIT_WEIGHT_MASK(16, 8, 10, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x16
+ INIT_WEIGHT_MASK(16, 16, 10, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x32
+ INIT_WEIGHT_MASK(16, 32, 10, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x64
+ INIT_WEIGHT_MASK(16, 64, 10, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x8
+ INIT_WEIGHT_MASK(32, 8, 10, 2, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x16
+ INIT_WEIGHT_MASK(32, 16, 10, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x32
+ INIT_WEIGHT_MASK(32, 32, 10, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x64
+ INIT_WEIGHT_MASK(32, 64, 10, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x16
+ INIT_WEIGHT_MASK(64, 16, 10, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x32
+ INIT_WEIGHT_MASK(64, 32, 10, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x64
+ INIT_WEIGHT_MASK(64, 64, 10, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x128
+ INIT_WEIGHT_MASK(64, 128, 10, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x64
+ INIT_WEIGHT_MASK(128, 64, 10, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x128
+ INIT_WEIGHT_MASK(128, 128, 10, 4, 4);
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_WEIGHT_MASK(8, 8, 12, 0, 0);
+ INIT_WEIGHT_MASK(8, 16, 12, 0, 1);
+ INIT_WEIGHT_MASK(8, 32, 12, 0, 2);
+ INIT_WEIGHT_MASK(16, 8, 12, 1, 0);
+ INIT_WEIGHT_MASK(16, 16, 12, 1, 1);
+ INIT_WEIGHT_MASK(16, 32, 12, 1, 2);
+ INIT_WEIGHT_MASK(16, 64, 12, 1, 3);
+ INIT_WEIGHT_MASK(32, 8, 12, 2, 0);
+ INIT_WEIGHT_MASK(32, 16, 12, 2, 1);
+ INIT_WEIGHT_MASK(32, 32, 12, 2, 2);
+ INIT_WEIGHT_MASK(32, 64, 12, 2, 3);
+ INIT_WEIGHT_MASK(64, 16, 12, 3, 1);
+ INIT_WEIGHT_MASK(64, 32, 12, 3, 2);
+ INIT_WEIGHT_MASK(64, 64, 12, 3, 3);
+ INIT_WEIGHT_MASK(64, 128, 12, 3, 4);
+ INIT_WEIGHT_MASK(128, 64, 12, 4, 3);
+ INIT_WEIGHT_MASK(128, 128, 12, 4, 4);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x8
+ INIT_WEIGHT_MASK(8, 8, 12, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x16
+ INIT_WEIGHT_MASK(8, 16, 12, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x32
+ INIT_WEIGHT_MASK(8, 32, 12, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x8
+ INIT_WEIGHT_MASK(16, 8, 12, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x16
+ INIT_WEIGHT_MASK(16, 16, 12, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x32
+ INIT_WEIGHT_MASK(16, 32, 12, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x64
+ INIT_WEIGHT_MASK(16, 64, 12, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x8
+ INIT_WEIGHT_MASK(32, 8, 12, 2, 0);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x16
+ INIT_WEIGHT_MASK(32, 16, 12, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x32
+ INIT_WEIGHT_MASK(32, 32, 12, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x64
+ INIT_WEIGHT_MASK(32, 64, 12, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x16
+ INIT_WEIGHT_MASK(64, 16, 12, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x32
+ INIT_WEIGHT_MASK(64, 32, 12, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x64
+ INIT_WEIGHT_MASK(64, 64, 12, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x128
+ INIT_WEIGHT_MASK(64, 128, 12, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_128x64
+ INIT_WEIGHT_MASK(128, 64, 12, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_128x128
+ INIT_WEIGHT_MASK(128, 128, 12, 4, 4);
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+void WeightMaskInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_WEIGHT_MASK_H_
+#define LIBGAV1_SRC_DSP_WEIGHT_MASK_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/weight_mask_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc. The
+// order is important because each header tests for a superior version before
+// setting the baseline.
+// clang-format off
+#include "src/dsp/x86/weight_mask_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::weight_mask. This function is not thread-safe.
+void WeightMaskInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_WEIGHT_MASK_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/weight_mask.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 50000;
+constexpr int kMaxPredictionSize = 128;
+// weight_mask is only used with kCompoundPredictionTypeDiffWeighted;
+// convolve produces the most extreme input ranges. These ranges include
+// kCompoundOffset in 10bpp and 12bpp.
+// See: src/dsp/convolve.cc and src/dsp/warp.cc.
+constexpr int kCompoundPredictionRange[3][2] = {
+ // 8bpp
+ {-5132, 9212},
+ // 10bpp
+ {3988, 61532},
+ // 12bpp
+ {3974, 61559},
+};
+
+const char* GetDigest8bpp(int id) {
+ static const char* const kDigest[] = {
+ "eaca5b6a96dcfe5e44f3926a071b48b3",
+ "1d82c75cfdf8e57925eb1d5301647538",
+ "25bd455d74fb891b97b133c528f8db60",
+      "" /*kBlock16x4*/,
+ "1d82c75cfdf8e57925eb1d5301647538",
+ "25bd455d74fb891b97b133c528f8db60",
+ "62a08776db35a186406a11ab92dee71c",
+ "95131d1dc0e05fcf4bd234d5ce9eea11",
+ "25bd455d74fb891b97b133c528f8db60",
+ "62a08776db35a186406a11ab92dee71c",
+ "95131d1dc0e05fcf4bd234d5ce9eea11",
+ "0b3c75272e0fb0747b9850145d340c4c",
+ "95131d1dc0e05fcf4bd234d5ce9eea11",
+ "0b3c75272e0fb0747b9850145d340c4c",
+ "f26c43d4bc823a89c1ed47ab8708bc06",
+ "0d99bbf31ecddc1c2d5063a68c0e9375",
+ "0d99bbf31ecddc1c2d5063a68c0e9375",
+ "5fb8ec5f582f0ebfe519ed55860f67c4",
+
+ // mask_is_inverse = true.
+ "96811f3b192828ff679e4c9ad8069d7d",
+ "a04dc180c028d55af70240163445523a",
+ "8513e3988233d0a7de316a0179bb6139",
+      "" /*kBlock16x4*/,
+ "a04dc180c028d55af70240163445523a",
+ "8513e3988233d0a7de316a0179bb6139",
+ "f7356d42fb44a6ccb41253ba35b8b3c7",
+ "3d2d61ffc203ee64fe91c9d16168a19d",
+ "8513e3988233d0a7de316a0179bb6139",
+ "f7356d42fb44a6ccb41253ba35b8b3c7",
+ "3d2d61ffc203ee64fe91c9d16168a19d",
+ "87a2011ac69fb597ca4f71bb3c35ebb0",
+ "3d2d61ffc203ee64fe91c9d16168a19d",
+ "87a2011ac69fb597ca4f71bb3c35ebb0",
+ "97100a3639d567046dc8a99fcb84cb2e",
+ "9fabe05a6523da81a45150e19f75acff",
+ "9fabe05a6523da81a45150e19f75acff",
+ "7c0643e4d02421d06d7ca71822a94e1d",
+ };
+ return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+ static const char* const kDigest[] = {
+ "5ae8d64b65a671301a457b8a73368ab5",
+ "61535217f179054d4b76a8d9352a223d",
+ "1aa6614773570e7b021cd509849c4180",
+      "" /*kBlock16x4*/,
+ "61535217f179054d4b76a8d9352a223d",
+ "1aa6614773570e7b021cd509849c4180",
+ "f04c2825cfb6408c7778658f71fa176e",
+ "e1694ea1f026dac7fe7e86a84482cf86",
+ "1aa6614773570e7b021cd509849c4180",
+ "f04c2825cfb6408c7778658f71fa176e",
+ "e1694ea1f026dac7fe7e86a84482cf86",
+ "9c4855d44c013fbddb373b2e9e311080",
+ "e1694ea1f026dac7fe7e86a84482cf86",
+ "9c4855d44c013fbddb373b2e9e311080",
+ "f510e743c3efe3b83374a98ef8a30838",
+ "b6e0bd03c521c5f00e90530daa7d4432",
+ "b6e0bd03c521c5f00e90530daa7d4432",
+ "3270d7f621d488aec5b76bcf121debd0",
+
+ // mask_is_inverse = true.
+ "9aa00fcfe21b71e30c5393699122a020",
+ "4d8ce33262cf6b5375f363530815189a",
+ "428625c51ac1bd4585988f7b36dff1db",
+      "" /*kBlock16x4*/,
+ "4d8ce33262cf6b5375f363530815189a",
+ "428625c51ac1bd4585988f7b36dff1db",
+ "1ef63c06a2d9c42da293fdf924032981",
+ "5dd3f201d755d1c22c126a633bfbb3c0",
+ "428625c51ac1bd4585988f7b36dff1db",
+ "1ef63c06a2d9c42da293fdf924032981",
+ "5dd3f201d755d1c22c126a633bfbb3c0",
+ "fe1e6843e6f214939da516dcbea04a79",
+ "5dd3f201d755d1c22c126a633bfbb3c0",
+ "fe1e6843e6f214939da516dcbea04a79",
+ "240187f27389b5e89f9ec6bdbd7d20a7",
+ "44925dab01011a98b8ab1f0308fa852a",
+ "44925dab01011a98b8ab1f0308fa852a",
+ "6d984b2ccfa056278e2130771127a943",
+ };
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+ static const char* const kDigest[] = {
+ "57629d3872fd52ff4bbec439c5517ec5",
+ "dba421ceeb534756c77167e00ae91a2c",
+ "72e8ac1d450ef0c6c6b03e93856d5cc2",
+      "" /*kBlock16x4*/,
+ "dba421ceeb534756c77167e00ae91a2c",
+ "72e8ac1d450ef0c6c6b03e93856d5cc2",
+ "ae573eb368df04e6a0133b4e15471728",
+ "ceede597b2729357b15e0d08bb9bb760",
+ "72e8ac1d450ef0c6c6b03e93856d5cc2",
+ "ae573eb368df04e6a0133b4e15471728",
+ "ceede597b2729357b15e0d08bb9bb760",
+ "c4976af803d7ad3f92ef26f25b9f3754",
+ "ceede597b2729357b15e0d08bb9bb760",
+ "c4976af803d7ad3f92ef26f25b9f3754",
+ "1d957d49f71bb7f304705a11a597f0cb",
+ "9522d5713fb951b79f42d78fbff914cf",
+ "9522d5713fb951b79f42d78fbff914cf",
+ "422c046013f79a9f46e2c855967570ba",
+
+ // mask_is_inverse = true.
+ "a585cca9bc459d10e081bc0eb847b6e3",
+ "2fa4ec5f74fad2831d216c51c2cdad5a",
+ "d6c9ac69a9eb3059f5bb6e42b486ebcd",
+      "" /*kBlock16x4*/,
+ "2fa4ec5f74fad2831d216c51c2cdad5a",
+ "d6c9ac69a9eb3059f5bb6e42b486ebcd",
+ "2ddd8c8a1841501964011030e2557e20",
+ "97ef2575023dda008711015cf08d7590",
+ "d6c9ac69a9eb3059f5bb6e42b486ebcd",
+ "2ddd8c8a1841501964011030e2557e20",
+ "97ef2575023dda008711015cf08d7590",
+ "d69aff1e0d43395ce305c9be0dfb4c89",
+ "97ef2575023dda008711015cf08d7590",
+ "d69aff1e0d43395ce305c9be0dfb4c89",
+ "48786f640191dcbee5b3321672778519",
+ "6ad4718230353440b01f2bb78348157e",
+ "6ad4718230353440b01f2bb78348157e",
+ "ad49bd7af0ea17c84f434c7dfd0a911d",
+ };
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+struct WeightMaskTestParam {
+ WeightMaskTestParam(int width, int height, bool mask_is_inverse)
+ : width(width), height(height), mask_is_inverse(mask_is_inverse) {}
+ int width;
+ int height;
+ bool mask_is_inverse;
+};
+
+std::ostream& operator<<(std::ostream& os, const WeightMaskTestParam& param) {
+ return os << param.width << "x" << param.height
+ << ", mask_is_inverse: " << param.mask_is_inverse;
+}
+
+template <int bitdepth>
+class WeightMaskTest : public testing::TestWithParam<WeightMaskTestParam>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ WeightMaskTest() = default;
+ ~WeightMaskTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ WeightMaskInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
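+    // FloorLog2(n) - 3 maps dimensions 8/16/32/64/128 to table indices 0-4.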
+ const int width_index = FloorLog2(width_) - 3;
+ const int height_index = FloorLog2(height_) - 3;
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ WeightMaskInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ WeightMaskInit_SSE4_1();
+ }
+ func_ = dsp->weight_mask[width_index][height_index][mask_is_inverse_];
+ }
+
+ protected:
+ void SetInputData(bool use_fixed_values, int value_1, int value_2);
+ void Test(int num_runs, bool use_fixed_values, int value_1, int value_2);
+
+ private:
+ const int width_ = GetParam().width;
+ const int height_ = GetParam().height;
+ const bool mask_is_inverse_ = GetParam().mask_is_inverse;
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ alignas(
+ kMaxAlignment) PredType block_1_[kMaxPredictionSize * kMaxPredictionSize];
+ alignas(
+ kMaxAlignment) PredType block_2_[kMaxPredictionSize * kMaxPredictionSize];
+ uint8_t mask_[kMaxPredictionSize * kMaxPredictionSize] = {};
+ dsp::WeightMaskFunc func_;
+};
+
+template <int bitdepth>
+void WeightMaskTest<bitdepth>::SetInputData(const bool use_fixed_values,
+ const int value_1,
+ const int value_2) {
+ if (use_fixed_values) {
+ std::fill(block_1_, block_1_ + kMaxPredictionSize * kMaxPredictionSize,
+ value_1);
+ std::fill(block_2_, block_2_ + kMaxPredictionSize * kMaxPredictionSize,
+ value_2);
+ } else {
+ constexpr int bitdepth_index = (bitdepth - 8) >> 1;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ for (int y = 0; y < height_; ++y) {
+ for (int x = 0; x < width_; ++x) {
+ const int min_val = kCompoundPredictionRange[bitdepth_index][0];
+ const int max_val = kCompoundPredictionRange[bitdepth_index][1];
+ block_1_[y * width_ + x] =
+ static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ block_2_[y * width_ + x] =
+ static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ }
+ }
+ }
+}
+
+BlockSize DimensionsToBlockSize(int width, int height) {
+ if (width == 4) {
+ if (height == 4) return kBlock4x4;
+ if (height == 8) return kBlock4x8;
+ if (height == 16) return kBlock4x16;
+ return kBlockInvalid;
+ }
+ if (width == 8) {
+ if (height == 4) return kBlock8x4;
+ if (height == 8) return kBlock8x8;
+ if (height == 16) return kBlock8x16;
+ if (height == 32) return kBlock8x32;
+ return kBlockInvalid;
+ }
+ if (width == 16) {
+ if (height == 4) return kBlock16x4;
+ if (height == 8) return kBlock16x8;
+ if (height == 16) return kBlock16x16;
+ if (height == 32) return kBlock16x32;
+ if (height == 64) return kBlock16x64;
+ return kBlockInvalid;
+ }
+ if (width == 32) {
+ if (height == 8) return kBlock32x8;
+ if (height == 16) return kBlock32x16;
+ if (height == 32) return kBlock32x32;
+ if (height == 64) return kBlock32x64;
+ return kBlockInvalid;
+ }
+ if (width == 64) {
+ if (height == 16) return kBlock64x16;
+ if (height == 32) return kBlock64x32;
+ if (height == 64) return kBlock64x64;
+ if (height == 128) return kBlock64x128;
+ return kBlockInvalid;
+ }
+ if (width == 128) {
+ if (height == 64) return kBlock128x64;
+ if (height == 128) return kBlock128x128;
+ return kBlockInvalid;
+ }
+ return kBlockInvalid;
+}
+
+template <int bitdepth>
+void WeightMaskTest<bitdepth>::Test(const int num_runs,
+ const bool use_fixed_values,
+ const int value_1, const int value_2) {
+ if (func_ == nullptr) return;
+ SetInputData(use_fixed_values, value_1, value_2);
+ const absl::Time start = absl::Now();
+ for (int i = 0; i < num_runs; ++i) {
+ func_(block_1_, block_2_, mask_, width_);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ if (use_fixed_values) {
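+    // From WeightMask_C: identical inputs give the bias value 38; the extreme
+    // fixed-value differences saturate the mask at 64.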
+ int fixed_value = (value_1 - value_2 == 0) ? 38 : 64;
+ if (mask_is_inverse_) fixed_value = 64 - fixed_value;
+ for (int y = 0; y < height_; ++y) {
+ for (int x = 0; x < width_; ++x) {
+ ASSERT_EQ(static_cast<int>(mask_[y * width_ + x]), fixed_value)
+ << "x: " << x << " y: " << y;
+ }
+ }
+ } else {
+ const int id_offset = mask_is_inverse_ ? kMaxBlockSizes - 4 : 0;
+ const int id = id_offset +
+ static_cast<int>(DimensionsToBlockSize(width_, height_)) - 4;
+ const char* expected_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ expected_digest = GetDigest8bpp(id);
+ break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+ expected_digest = GetDigest10bpp(id);
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ expected_digest = GetDigest12bpp(id);
+ break;
+#endif
+ }
+ ASSERT_NE(expected_digest, nullptr);
+ test_utils::CheckMd5Digest(
+ absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(),
+ "WeightMask", expected_digest, mask_, sizeof(mask_), elapsed_time);
+ }
+}
+
+const WeightMaskTestParam weight_mask_test_param[] = {
+ WeightMaskTestParam(8, 8, false), WeightMaskTestParam(8, 16, false),
+ WeightMaskTestParam(8, 32, false), WeightMaskTestParam(16, 8, false),
+ WeightMaskTestParam(16, 16, false), WeightMaskTestParam(16, 32, false),
+ WeightMaskTestParam(16, 64, false), WeightMaskTestParam(32, 8, false),
+ WeightMaskTestParam(32, 16, false), WeightMaskTestParam(32, 32, false),
+ WeightMaskTestParam(32, 64, false), WeightMaskTestParam(64, 16, false),
+ WeightMaskTestParam(64, 32, false), WeightMaskTestParam(64, 64, false),
+ WeightMaskTestParam(64, 128, false), WeightMaskTestParam(128, 64, false),
+ WeightMaskTestParam(128, 128, false), WeightMaskTestParam(8, 8, true),
+ WeightMaskTestParam(8, 16, true), WeightMaskTestParam(8, 32, true),
+ WeightMaskTestParam(16, 8, true), WeightMaskTestParam(16, 16, true),
+ WeightMaskTestParam(16, 32, true), WeightMaskTestParam(16, 64, true),
+ WeightMaskTestParam(32, 8, true), WeightMaskTestParam(32, 16, true),
+ WeightMaskTestParam(32, 32, true), WeightMaskTestParam(32, 64, true),
+ WeightMaskTestParam(64, 16, true), WeightMaskTestParam(64, 32, true),
+ WeightMaskTestParam(64, 64, true), WeightMaskTestParam(64, 128, true),
+ WeightMaskTestParam(128, 64, true), WeightMaskTestParam(128, 128, true),
+};
+
+using WeightMaskTest8bpp = WeightMaskTest<8>;
+
+TEST_P(WeightMaskTest8bpp, FixedValues) {
+ const int min = kCompoundPredictionRange[0][0];
+ const int max = kCompoundPredictionRange[0][1];
+ Test(1, true, min, min);
+ Test(1, true, min, max);
+ Test(1, true, max, min);
+ Test(1, true, max, max);
+}
+
+TEST_P(WeightMaskTest8bpp, RandomValues) { Test(1, false, -1, -1); }
+
+TEST_P(WeightMaskTest8bpp, DISABLED_Speed) {
+ Test(kNumSpeedTests, false, -1, -1);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest8bpp,
+ testing::ValuesIn(weight_mask_test_param));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WeightMaskTest8bpp,
+ testing::ValuesIn(weight_mask_test_param));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WeightMaskTest8bpp,
+ testing::ValuesIn(weight_mask_test_param));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using WeightMaskTest10bpp = WeightMaskTest<10>;
+
+TEST_P(WeightMaskTest10bpp, FixedValues) {
+ const int min = kCompoundPredictionRange[1][0];
+ const int max = kCompoundPredictionRange[1][1];
+ Test(1, true, min, min);
+ Test(1, true, min, max);
+ Test(1, true, max, min);
+ Test(1, true, max, max);
+}
+
+TEST_P(WeightMaskTest10bpp, RandomValues) { Test(1, false, -1, -1); }
+
+TEST_P(WeightMaskTest10bpp, DISABLED_Speed) {
+ Test(kNumSpeedTests, false, -1, -1);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest10bpp,
+ testing::ValuesIn(weight_mask_test_param));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WeightMaskTest10bpp,
+ testing::ValuesIn(weight_mask_test_param));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WeightMaskTest10bpp,
+ testing::ValuesIn(weight_mask_test_param));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using WeightMaskTest12bpp = WeightMaskTest<12>;
+
+TEST_P(WeightMaskTest12bpp, FixedValues) {
+ const int min = kCompoundPredictionRange[2][0];
+ const int max = kCompoundPredictionRange[2][1];
+ Test(1, true, min, min);
+ Test(1, true, min, max);
+ Test(1, true, max, min);
+ Test(1, true, max, max);
+}
+
+TEST_P(WeightMaskTest12bpp, RandomValues) { Test(1, false, -1, -1); }
+
+TEST_P(WeightMaskTest12bpp, DISABLED_Speed) {
+ Test(kNumSpeedTests, false, -1, -1);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest12bpp,
+ testing::ValuesIn(weight_mask_test_param));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
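+// The shifts below use kInterPostRoundBit + 1: the extra bit folds the
+// averaging divide-by-2 into the post-round shift.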
+constexpr int kInterPostRoundBit = 4;
+
+inline void AverageBlend4x4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+ res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+ res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+ const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
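+  // |result_pixels| packs four 4-pixel rows; store the first with Store4()
+  // and extract the remaining 32-bit lanes one row at a time.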
+ Store4(dest, result_pixels);
+ dest += dest_stride;
+ const int result_1 = _mm_extract_epi32(result_pixels, 1);
+ memcpy(dest, &result_1, sizeof(result_1));
+ dest += dest_stride;
+ const int result_2 = _mm_extract_epi32(result_pixels, 2);
+ memcpy(dest, &result_2, sizeof(result_2));
+ dest += dest_stride;
+ const int result_3 = _mm_extract_epi32(result_pixels, 3);
+ memcpy(dest, &result_3, sizeof(result_3));
+}
+
+inline void AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+ res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+ res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+ const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+ StoreLo8(dest, result_pixels);
+ StoreHi8(dest + dest_stride, result_pixels);
+}
+
+inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ const int width,
+ uint8_t* LIBGAV1_RESTRICT dest) {
+ int x = 0;
+ do {
+ const __m128i pred_00 = LoadAligned16(&prediction_0[x]);
+ const __m128i pred_01 = LoadAligned16(&prediction_1[x]);
+ __m128i res0 = _mm_add_epi16(pred_00, pred_01);
+ res0 = RightShiftWithRounding_S16(res0, kInterPostRoundBit + 1);
+ const __m128i pred_10 = LoadAligned16(&prediction_0[x + 8]);
+ const __m128i pred_11 = LoadAligned16(&prediction_1[x + 8]);
+ __m128i res1 = _mm_add_epi16(pred_10, pred_11);
+ res1 = RightShiftWithRounding_S16(res1, kInterPostRoundBit + 1);
+ StoreUnaligned16(dest + x, _mm_packus_epi16(res0, res1));
+ x += 16;
+ } while (x < width);
+}
+
+void AverageBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const int width, const int height,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = height;
+
+ if (width == 4) {
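+    // AverageBlend4x4Row() handles four rows per call; advance by four rows
+    // of predictions each iteration.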
+ const ptrdiff_t dest_stride4 = dest_stride << 2;
+ constexpr ptrdiff_t width4 = 4 << 2;
+ do {
+ AverageBlend4x4Row(pred_0, pred_1, dst, dest_stride);
+ dst += dest_stride4;
+ pred_0 += width4;
+ pred_1 += width4;
+
+ y -= 4;
+ } while (y != 0);
+ return;
+ }
+
+ if (width == 8) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ constexpr ptrdiff_t width2 = 8 << 1;
+ do {
+ AverageBlend8Row(pred_0, pred_1, dst, dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ do {
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(AverageBlend)
+ dsp->average_blend = AverageBlend_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
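+// kInterPostRoundBit + 1: the extra bit averages the two predictions as part
+// of the rounding shift.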
+constexpr int kInterPostRoundBitPlusOne = 5;
+
+template <const int width, const int offset>
+inline void AverageBlendRow(const uint16_t* LIBGAV1_RESTRICT prediction_0,
+ const uint16_t* LIBGAV1_RESTRICT prediction_1,
+ const __m128i& compound_offset,
+ const __m128i& round_offset, const __m128i& max,
+ const __m128i& zero, uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dest_stride) {
+ // pred_0/1 max range is 16b.
+ const __m128i pred_0 = LoadUnaligned16(prediction_0 + offset);
+ const __m128i pred_1 = LoadUnaligned16(prediction_1 + offset);
+ const __m128i pred_00 = _mm_cvtepu16_epi32(pred_0);
+ const __m128i pred_01 = _mm_unpackhi_epi16(pred_0, zero);
+ const __m128i pred_10 = _mm_cvtepu16_epi32(pred_1);
+ const __m128i pred_11 = _mm_unpackhi_epi16(pred_1, zero);
+
+ const __m128i pred_add_0 = _mm_add_epi32(pred_00, pred_10);
+ const __m128i pred_add_1 = _mm_add_epi32(pred_01, pred_11);
+ const __m128i compound_offset_0 = _mm_sub_epi32(pred_add_0, compound_offset);
+ const __m128i compound_offset_1 = _mm_sub_epi32(pred_add_1, compound_offset);
+ // RightShiftWithRounding and Clip3.
+ const __m128i round_0 = _mm_add_epi32(compound_offset_0, round_offset);
+ const __m128i round_1 = _mm_add_epi32(compound_offset_1, round_offset);
+ const __m128i res_0 = _mm_srai_epi32(round_0, kInterPostRoundBitPlusOne);
+ const __m128i res_1 = _mm_srai_epi32(round_1, kInterPostRoundBitPlusOne);
+ const __m128i result = _mm_min_epi16(_mm_packus_epi32(res_0, res_1), max);
+ if (width != 4) {
+ // Store width=8/16/32/64/128.
+ StoreUnaligned16(dst + offset, result);
+ return;
+ }
+ assert(width == 4);
+ StoreLo8(dst, result);
+ StoreHi8(dst + dest_stride, result);
+}
+
+void AverageBlend10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const int width, const int height,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dst_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dest_stride = dst_stride / sizeof(dst[0]);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
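+  // Each prediction carries kCompoundOffset, so twice the offset is
+  // subtracted from the sum of the two predictions.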
+ const __m128i compound_offset =
+ _mm_set1_epi32(kCompoundOffset + kCompoundOffset);
+ const __m128i round_offset =
+ _mm_set1_epi32((1 << kInterPostRoundBitPlusOne) >> 1);
+ const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1);
+ const __m128i zero = _mm_setzero_si128();
+ int y = height;
+
+ if (width == 4) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ const ptrdiff_t width2 = width << 1;
+ do {
+ // row0,1
+ AverageBlendRow<4, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+ if (width == 8) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ const ptrdiff_t width2 = width << 1;
+ do {
+ // row0.
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // row1.
+ AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset,
+ round_offset, max, zero, dst + dest_stride,
+ dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+ if (width == 16) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ const ptrdiff_t width2 = width << 1;
+ do {
+ // row0.
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // row1.
+ AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset,
+ round_offset, max, zero, dst + dest_stride,
+ dest_stride);
+ AverageBlendRow<8, 8>(pred_0 + width, pred_1 + width, compound_offset,
+ round_offset, max, zero, dst + dest_stride,
+ dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+ if (width == 32) {
+ do {
+ // pred [0 - 15].
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // pred [16 - 31].
+ AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+ return;
+ }
+ if (width == 64) {
+ do {
+ // pred [0 - 31].
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+      // pred [32 - 63].
+ AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+ return;
+ }
+ assert(width == 128);
+ do {
+ // pred [0 - 31].
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+    // pred [32 - 63].
+ AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+
+ // pred [64 - 95].
+ AverageBlendRow<8, 64>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 72>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 80>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 88>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // pred [96 - 127].
+ AverageBlendRow<8, 96>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 104>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 112>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 120>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(AverageBlend)
+ dsp->average_blend = AverageBlend10bpp_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void AverageBlendInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void AverageBlendInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_AverageBlend
+#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp10bpp_AverageBlend
+#define LIBGAV1_Dsp10bpp_AverageBlend LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Used when calculating odd |cost[x]| values.
+// Holds elements 1 3 5 7 7 7 7 7
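+// (kCdefDivisionTable values 420, 210, 140 and 105, with the final value
+// repeated to pad the upper half).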
+alignas(32) constexpr uint32_t kCdefDivisionTableOddPairsPadded[] = {
+ 420, 210, 140, 105, 420, 210, 140, 105,
+ 105, 105, 105, 105, 105, 105, 105, 105};
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+// for (int j = 0; j < 8; ++j) {
+// const int x = 1;
+// partial[0][i + j] += x;
+// partial[1][i + j / 2] += x;
+// partial[2][i] += x;
+// partial[3][3 + i - j / 2] += x;
+// partial[4][7 + i - j] += x;
+// partial[5][3 - i / 2 + j] += x;
+// partial[6][j] += x;
+// partial[7][i / 2 + j] += x;
+// }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m256i* v_src_16,
+ __m256i* partial_lo,
+ __m256i* partial_hi) {
+ // 00 01 02 03 04 05 06 07
+ *partial_lo = v_src_16[0];
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = _mm256_setzero_si256();
+
+ // 00 10 11 12 13 14 15 16
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[1], 2));
+ // 17 00 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[1], 14));
+
+ // 00 00 20 21 22 23 24 25
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[2], 4));
+ // 26 27 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[2], 12));
+
+ // 00 00 00 30 31 32 33 34
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[3], 6));
+ // 35 36 37 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[3], 10));
+
+ // 00 00 00 00 40 41 42 43
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[4], 8));
+ // 44 45 46 47 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[4], 8));
+
+ // 00 00 00 00 00 50 51 52
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[5], 10));
+ // 53 54 55 56 57 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[5], 6));
+
+ // 00 00 00 00 00 00 60 61
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[6], 12));
+ // 62 63 64 65 66 67 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[6], 4));
+
+ // 00 00 00 00 00 00 00 70
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[7], 14));
+ // 71 72 73 74 75 76 77 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m256i* v_src_16,
+ __m256i* partial_lo,
+ __m256i* partial_hi) {
+ __m256i v_d1_temp[8];
+ const __m256i v_zero = _mm256_setzero_si256();
+
+ for (int i = 0; i < 8; ++i) {
+ v_d1_temp[i] = _mm256_hadd_epi16(v_src_16[i], v_zero);
+ }
+
+ *partial_lo = *partial_hi = v_zero;
+ // A0 A1 A2 A3 00 00 00 00
+ *partial_lo = _mm256_add_epi16(*partial_lo, v_d1_temp[0]);
+
+ // 00 B0 B1 B2 B3 00 00 00
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[1], 2));
+
+ // 00 00 C0 C1 C2 C3 00 00
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[2], 4));
+ // 00 00 00 D0 D1 D2 D3 00
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[3], 6));
+ // 00 00 00 00 E0 E1 E2 E3
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[4], 8));
+
+ // 00 00 00 00 00 F0 F1 F2
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[5], 10));
+ // F3 00 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[5], 6));
+
+ // 00 00 00 00 00 00 G0 G1
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[6], 12));
+ // G2 G3 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[6], 4));
+
+ // 00 00 00 00 00 00 00 H0
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[7], 14));
+ // H1 H2 H3 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D7_D5(__m256i* v_src, __m256i* partial_lo,
+ __m256i* partial_hi) {
+ __m256i v_pair_add[4];
+ // Add vertical source pairs.
+ v_pair_add[0] = _mm256_add_epi16(v_src[0], v_src[1]);
+ v_pair_add[1] = _mm256_add_epi16(v_src[2], v_src[3]);
+ v_pair_add[2] = _mm256_add_epi16(v_src[4], v_src[5]);
+ v_pair_add[3] = _mm256_add_epi16(v_src[6], v_src[7]);
+
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ *partial_lo = v_pair_add[0];
+ // 00 00 00 00 00 00 00 00
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = _mm256_setzero_si256();
+
+ // 00 20 21 22 23 24 25 26
+ // 00 30 31 32 33 34 35 36
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[1], 2));
+ // 27 00 00 00 00 00 00 00
+ // 37 00 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[1], 14));
+
+ // 00 00 40 41 42 43 44 45
+ // 00 00 50 51 52 53 54 55
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[2], 4));
+ // 46 47 00 00 00 00 00 00
+ // 56 57 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[2], 12));
+
+ // 00 00 00 60 61 62 63 64
+ // 00 00 00 70 71 72 73 74
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[3], 6));
+ // 65 66 67 00 00 00 00 00
+ // 75 76 77 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[3], 10));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t stride, __m256i* partial) {
+ // 8x8 input
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ __m256i v_src[8];
+ for (auto& i : v_src) {
+ i = _mm256_castsi128_si256(LoadLo8(src));
+    // Duplicate the lower lane into the upper lane.
+ i = _mm256_permute2x128_si256(i, i, 0x0);
+ src += stride;
+ }
+
+ const __m256i v_zero = _mm256_setzero_si256();
+ // partial for direction 2
+ // --------------------------------------------------------------------------
+ // partial[2][i] += x;
+ // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+  // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+  // 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+ // 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+ // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx
+ // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx
+ // 06 16 26 36 46 56 66 76 xx xx xx xx xx xx xx xx
+ // 07 17 27 37 47 57 67 77 xx xx xx xx xx xx xx xx
+ const __m256i v_src_4_0 = _mm256_unpacklo_epi64(v_src[0], v_src[4]);
+ const __m256i v_src_5_1 = _mm256_unpacklo_epi64(v_src[1], v_src[5]);
+ const __m256i v_src_6_2 = _mm256_unpacklo_epi64(v_src[2], v_src[6]);
+ const __m256i v_src_7_3 = _mm256_unpacklo_epi64(v_src[3], v_src[7]);
+ const __m256i v_hsum_4_0 = _mm256_sad_epu8(v_src_4_0, v_zero);
+ const __m256i v_hsum_5_1 = _mm256_sad_epu8(v_src_5_1, v_zero);
+ const __m256i v_hsum_6_2 = _mm256_sad_epu8(v_src_6_2, v_zero);
+ const __m256i v_hsum_7_3 = _mm256_sad_epu8(v_src_7_3, v_zero);
+ const __m256i v_hsum_1_0 = _mm256_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
+ const __m256i v_hsum_3_2 = _mm256_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
+ const __m256i v_hsum_5_4 = _mm256_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
+ const __m256i v_hsum_7_6 = _mm256_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
+ partial[2] =
+ _mm256_unpacklo_epi64(_mm256_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
+ _mm256_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
+
+ const __m256i extend_reverse = SetrM128i(
+ _mm_set_epi32(static_cast<int>(0x80078006), static_cast<int>(0x80058004),
+ static_cast<int>(0x80038002), static_cast<int>(0x80018000)),
+ _mm_set_epi32(static_cast<int>(0x80008001), static_cast<int>(0x80028003),
+ static_cast<int>(0x80048005),
+ static_cast<int>(0x80068007)));
+
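+  // |extend_reverse| zero extends each byte to 16 bits, leaving the lower
+  // lane in source order and reversing the upper lane, so a single pass
+  // computes both the forward (d0, d1, d7) and reversed (d4, d3, d5)
+  // partials.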
+ for (auto& i : v_src) {
+ // Zero extend unsigned 8 to 16. The upper lane is reversed.
+ i = _mm256_shuffle_epi8(i, extend_reverse);
+ }
+
+ // partial for direction 6
+ // --------------------------------------------------------------------------
+ // partial[6][j] += x;
+ // 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // 40 41 42 43 44 45 46 47 xx xx xx xx xx xx xx xx
+ // 50 51 52 53 54 55 56 57 xx xx xx xx xx xx xx xx
+ // 60 61 62 63 64 65 66 67 xx xx xx xx xx xx xx xx
+ // 70 71 72 73 74 75 76 77 xx xx xx xx xx xx xx xx
+ partial[6] = v_src[0];
+ for (int i = 1; i < 8; ++i) {
+ partial[6] = _mm256_add_epi16(partial[6], v_src[i]);
+ }
+
+ AddPartial_D0_D4(v_src, &partial[0], &partial[4]);
+ AddPartial_D1_D3(v_src, &partial[1], &partial[3]);
+ AddPartial_D7_D5(v_src, &partial[7], &partial[5]);
+}
+
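+// Sums the four 32-bit elements within each 128-bit lane; each lane's total
+// ends up in that lane's lowest 32 bits.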
+inline __m256i SumVectorPair_S32(__m256i a) {
+ a = _mm256_hadd_epi32(a, a);
+ a = _mm256_add_epi32(a, _mm256_srli_si256(a, 4));
+ return a;
+}
+
+// |cost[0]| and |cost[4]| square each input element, sum it with the
+// corresponding element from the other end of the vector, and multiply by the
+// matching |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+// kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
+inline void Cost0Or4_Pair(uint32_t* cost, const __m256i partial_0,
+ const __m256i partial_4,
+ const __m256i division_table) {
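+  // Broadcast the low and high halves of |division_table| to both 128-bit
+  // lanes; each lane computes the cost for one direction of the pair.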
+ const __m256i division_table_0 =
+ _mm256_permute2x128_si256(division_table, division_table, 0x0);
+ const __m256i division_table_1 =
+ _mm256_permute2x128_si256(division_table, division_table, 0x11);
+
+ // partial_lo
+ const __m256i a = partial_0;
+ // partial_hi
+ const __m256i b = partial_4;
+
+ // Reverse and clear upper 2 bytes.
+ const __m256i reverser = _mm256_broadcastsi128_si256(_mm_set_epi32(
+ static_cast<int>(0x80800100), 0x03020504, 0x07060908, 0x0b0a0d0c));
+
+ // 14 13 12 11 10 09 08 ZZ
+ const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser);
+ // 00 14 01 13 02 12 03 11
+ const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed);
+ // 04 10 05 09 06 08 07 ZZ
+ const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed);
+
+ // Square(partial[0][i]) + Square(partial[0][14 - i])
+ const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo);
+ const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi);
+
+ const __m256i c = _mm256_mullo_epi32(square_lo, division_table_0);
+ const __m256i d = _mm256_mullo_epi32(square_hi, division_table_1);
+ const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d));
+  // Copy the upper lane's 32-bit sum into the lower 128 bits.
+ const __m128i sums =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08));
+ cost[0] = _mm_cvtsi128_si32(sums);
+ cost[4] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
+template <int index_a, int index_b>
+inline void CostOdd_Pair(uint32_t* cost, const __m256i partial_a,
+ const __m256i partial_b,
+ const __m256i division_table[2]) {
+ // partial_lo
+ const __m256i a = partial_a;
+ // partial_hi
+ const __m256i b = partial_b;
+
+ // Reverse and clear upper 10 bytes.
+ const __m256i reverser = _mm256_broadcastsi128_si256(
+ _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
+ static_cast<int>(0x80800100), 0x03020504));
+
+ // 10 09 08 ZZ ZZ ZZ ZZ ZZ
+ const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser);
+ // 00 10 01 09 02 08 03 ZZ
+ const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed);
+ // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
+ const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed);
+
+  // Square(partial[0][i]) + Square(partial[0][10 - i])
+ const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo);
+ const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi);
+
+ const __m256i c = _mm256_mullo_epi32(square_lo, division_table[0]);
+ const __m256i d = _mm256_mullo_epi32(square_hi, division_table[1]);
+ const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d));
+  // Copy the upper lane's 32-bit sum into the lower 128 bits.
+ const __m128i sums =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08));
+ cost[index_a] = _mm_cvtsi128_si32(sums);
+ cost[index_b] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
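+// Computes |cost[2]| and |cost[6]| in one pass: each is the sum of squares of
+// its eight partial values, all weighted by the same divisor
+// (kCdefDivisionTable[7] at the call site).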
+inline void Cost2And6_Pair(uint32_t* cost, const __m256i partial_a,
+ const __m256i partial_b,
+ const __m256i division_table) {
+ // The upper lane is a "don't care", so only use the lower lane for
+ // calculating cost.
+ const __m256i a = _mm256_permute2x128_si256(partial_a, partial_b, 0x20);
+
+ const __m256i square_a = _mm256_madd_epi16(a, a);
+ const __m256i b = _mm256_mullo_epi32(square_a, division_table);
+ const __m256i c = SumVectorPair_S32(b);
+  // Copy the upper lane's 32-bit sum into the lower 128 bits.
+ const __m128i sums =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(c, 0x08));
+ cost[2] = _mm_cvtsi128_si32(sums);
+ cost[6] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
+void CdefDirection_AVX2(const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride,
+ uint8_t* LIBGAV1_RESTRICT const direction,
+ int* LIBGAV1_RESTRICT const variance) {
+ assert(direction != nullptr);
+ assert(variance != nullptr);
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t cost[8];
+
+ // partial[0] = add partial 0,4 low
+ // partial[1] = add partial 1,3 low
+ // partial[2] = add partial 2 low
+ // partial[3] = add partial 1,3 high
+ // partial[4] = add partial 0,4 high
+ // partial[5] = add partial 7,5 high
+ // partial[6] = add partial 6 low
+ // partial[7] = add partial 7,5 low
+ __m256i partial[8];
+
+ AddPartial(src, stride, partial);
+
+ const __m256i division_table = LoadUnaligned32(kCdefDivisionTable);
+ const __m256i division_table_7 =
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128(kCdefDivisionTable[7]));
+
+ Cost2And6_Pair(cost, partial[2], partial[6], division_table_7);
+
+ Cost0Or4_Pair(cost, partial[0], partial[4], division_table);
+
+ const __m256i division_table_odd[2] = {
+ LoadUnaligned32(kCdefDivisionTableOddPairsPadded),
+ LoadUnaligned32(kCdefDivisionTableOddPairsPadded + 8)};
+
+ CostOdd_Pair<1, 3>(cost, partial[1], partial[3], division_table_odd);
+ CostOdd_Pair<7, 5>(cost, partial[7], partial[5], division_table_odd);
+
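+  // The direction with the largest cost wins; |variance| measures how
+  // dominant it is over the orthogonal direction, (*direction + 4) & 7.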
+ uint32_t best_cost = 0;
+ *direction = 0;
+ for (int i = 0; i < 8; ++i) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ *direction = i;
+ }
+ }
+ *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+ const ptrdiff_t stride, __m128i* output,
+ const int direction) {
+ // Each |direction| describes a different set of source values. Expand this
+ // set by negating each set. For |direction| == 0 this gives a diagonal line
+ // from top right to bottom left. The first value is y, the second x. Negative
+ // y values move up.
+ // a b c d
+ // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+ // c
+ // a
+ // 0
+ // b
+ // d
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
+ output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
+ output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
+ output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+ const ptrdiff_t stride, __m128i* output,
+ const int direction) {
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
+ src - y_0 * stride + stride - x_0);
+ output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
+ src + y_0 * stride + stride + x_0);
+ output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
+ src - y_1 * stride + stride - x_1);
+ output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
+ src + y_1 * stride + stride + x_1);
+}
+
+inline __m256i Constrain(const __m256i& pixel, const __m256i& reference,
+ const __m128i& damping, const __m256i& threshold) {
+ const __m256i diff = _mm256_sub_epi16(pixel, reference);
+ const __m256i abs_diff = _mm256_abs_epi16(diff);
+ // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
+ // 0, std::abs(diff))
+ const __m256i shifted_diff = _mm256_srl_epi16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+  // [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always be
+  // larger than threshold, so the saturating subtraction returns 0 when
+  // pixel == kCdefLargeValue.
+ static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+ const __m256i thresh_minus_shifted_diff =
+ _mm256_subs_epu16(threshold, shifted_diff);
+ const __m256i clamp_abs_diff =
+ _mm256_min_epi16(thresh_minus_shifted_diff, abs_diff);
+ // Restore the sign.
+ return _mm256_sign_epi16(clamp_abs_diff, diff);
+}
+
+inline __m256i ApplyConstrainAndTap(const __m256i& pixel, const __m256i& val,
+ const __m256i& tap, const __m128i& damping,
+ const __m256i& threshold) {
+ const __m256i constrained = Constrain(val, pixel, damping, threshold);
+ return _mm256_mullo_epi16(constrained, tap);
+}
+
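+// |width| selects the 8-wide or 4-wide path; the 4-wide path handles two rows
+// per iteration. Primary and secondary filtering may be disabled
+// independently, and clipping is only needed when both are enabled.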
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_AVX2(const uint16_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride, const int height,
+ const int primary_strength, const int secondary_strength,
+ const int damping, const int direction,
+ void* LIBGAV1_RESTRICT dest, const ptrdiff_t dst_stride) {
+ static_assert(width == 8 || width == 4, "Invalid CDEF width.");
+ static_assert(enable_primary || enable_secondary, "");
+ constexpr bool clipping_required = enable_primary && enable_secondary;
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i primary_damping_shift, secondary_damping_shift;
+
+ // FloorLog2() requires input to be > 0.
+ // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+ if (enable_primary) {
+ // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+ // for UV filtering.
+ primary_damping_shift =
+ _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
+ }
+ if (enable_secondary) {
+ // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+ // necessary.
+ assert(damping - FloorLog2(secondary_strength) >= 0);
+ secondary_damping_shift =
+ _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
+ }
+ const __m256i primary_tap_0 = _mm256_broadcastw_epi16(
+ _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][0]));
+ const __m256i primary_tap_1 = _mm256_broadcastw_epi16(
+ _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][1]));
+ const __m256i secondary_tap_0 =
+ _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap0));
+ const __m256i secondary_tap_1 =
+ _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap1));
+ const __m256i cdef_large_value_mask = _mm256_broadcastw_epi16(
+ _mm_cvtsi32_si128(static_cast<int16_t>(~kCdefLargeValue)));
+ const __m256i primary_threshold =
+ _mm256_broadcastw_epi16(_mm_cvtsi32_si128(primary_strength));
+ const __m256i secondary_threshold =
+ _mm256_broadcastw_epi16(_mm_cvtsi32_si128(secondary_strength));
+
+ int y = height;
+ do {
+ __m128i pixel_128;
+ if (width == 8) {
+ pixel_128 = LoadUnaligned16(src);
+ } else {
+ pixel_128 = LoadHi8(LoadLo8(src), src + src_stride);
+ }
+
+ __m256i pixel = SetrM128i(pixel_128, pixel_128);
+
+ __m256i min = pixel;
+ __m256i max = pixel;
+ __m256i sum_pair;
+
+ if (enable_primary) {
+ // Primary |direction|.
+ __m128i primary_val_128[4];
+ if (width == 8) {
+ LoadDirection(src, src_stride, primary_val_128, direction);
+ } else {
+ LoadDirection4(src, src_stride, primary_val_128, direction);
+ }
+
+ __m256i primary_val[2];
+ primary_val[0] = SetrM128i(primary_val_128[0], primary_val_128[1]);
+ primary_val[1] = SetrM128i(primary_val_128[2], primary_val_128[3]);
+
+ if (clipping_required) {
+ min = _mm256_min_epu16(min, primary_val[0]);
+ min = _mm256_min_epu16(min, primary_val[1]);
+
+        // The source is 16 bits; however, only the lower 8 bits matter. The
+        // upper 8 bits contain the "large" flag. After the final primary max
+        // has been calculated, zero out the upper 8 bits and use the result
+        // to find the "16 bit" max.
+ const __m256i max_p01 = _mm256_max_epu8(primary_val[0], primary_val[1]);
+ max = _mm256_max_epu16(
+ max, _mm256_and_si256(max_p01, cdef_large_value_mask));
+ }
+
+ sum_pair = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
+ primary_damping_shift, primary_threshold);
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_1,
+ primary_damping_shift, primary_threshold));
+ } else {
+ sum_pair = _mm256_setzero_si256();
+ }
+
+ if (enable_secondary) {
+ // Secondary |direction| values (+/- 2). Clamp |direction|.
+ __m128i secondary_val_128[8];
+ if (width == 8) {
+ LoadDirection(src, src_stride, secondary_val_128, direction + 2);
+ LoadDirection(src, src_stride, secondary_val_128 + 4, direction - 2);
+ } else {
+ LoadDirection4(src, src_stride, secondary_val_128, direction + 2);
+ LoadDirection4(src, src_stride, secondary_val_128 + 4, direction - 2);
+ }
+
+ __m256i secondary_val[4];
+ secondary_val[0] = SetrM128i(secondary_val_128[0], secondary_val_128[1]);
+ secondary_val[1] = SetrM128i(secondary_val_128[2], secondary_val_128[3]);
+ secondary_val[2] = SetrM128i(secondary_val_128[4], secondary_val_128[5]);
+ secondary_val[3] = SetrM128i(secondary_val_128[6], secondary_val_128[7]);
+
+ if (clipping_required) {
+ min = _mm256_min_epu16(min, secondary_val[0]);
+ min = _mm256_min_epu16(min, secondary_val[1]);
+ min = _mm256_min_epu16(min, secondary_val[2]);
+ min = _mm256_min_epu16(min, secondary_val[3]);
+
+ const __m256i max_s01 =
+ _mm256_max_epu8(secondary_val[0], secondary_val[1]);
+ const __m256i max_s23 =
+ _mm256_max_epu8(secondary_val[2], secondary_val[3]);
+ const __m256i max_s = _mm256_max_epu8(max_s01, max_s23);
+ max = _mm256_max_epu8(max,
+ _mm256_and_si256(max_s, cdef_large_value_mask));
+ }
+
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ }
+
+ __m128i sum = _mm_add_epi16(_mm256_castsi256_si128(sum_pair),
+ _mm256_extracti128_si256(sum_pair, 1));
+
+    // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)
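+    // srai(sum, 15) is -1 for negative elements and 0 otherwise,
+    // implementing the -(sum < 0) term.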
+ const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15);
+ // 8 + sum
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(8));
+ // (... - (sum < 0)) >> 4
+ sum = _mm_add_epi16(sum, sum_lt_0);
+ sum = _mm_srai_epi16(sum, 4);
+ // pixel + ...
+ sum = _mm_add_epi16(sum, _mm256_castsi256_si128(pixel));
+ if (clipping_required) {
+ const __m128i min_128 = _mm_min_epu16(_mm256_castsi256_si128(min),
+ _mm256_extracti128_si256(min, 1));
+
+ const __m128i max_128 = _mm_max_epu16(_mm256_castsi256_si128(max),
+ _mm256_extracti128_si256(max, 1));
+ // Clip3
+ sum = _mm_min_epi16(sum, max_128);
+ sum = _mm_max_epi16(sum, min_128);
+ }
+
+ const __m128i result = _mm_packus_epi16(sum, sum);
+ if (width == 8) {
+ src += src_stride;
+ StoreLo8(dst, result);
+ dst += dst_stride;
+ --y;
+ } else {
+ src += src_stride << 1;
+ Store4(dst, result);
+ dst += dst_stride;
+ Store4(dst, _mm_srli_si128(result, 4));
+ dst += dst_stride;
+ y -= 2;
+ }
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ dsp->cdef_direction = CdefDirection_AVX2;
+
+ dsp->cdef_filters[0][0] = CdefFilter_AVX2<4>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_AVX2<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] = CdefFilter_AVX2<4, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_AVX2<8>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_AVX2<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] = CdefFilter_AVX2<8, /*enable_primary=*/false>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void CdefInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_AVX2();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_AVX2
+#endif
+
+#endif // LIBGAV1_TARGETING_AVX2
+
+#endif // LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Used when calculating odd |cost[x]| values.
+// Holds elements 1 3 5 7 7 7 7 7
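+// The trailing 105s (kCdefDivisionTable[7]) weight the center elements, which
+// are not paired with a mirrored element in CostOdd().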
+alignas(16) constexpr uint32_t kCdefDivisionTableOddPadded[] = {
+ 420, 210, 140, 105, 105, 105, 105, 105};
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+// for (int j = 0; j < 8; ++j) {
+// const int x = 1;
+// partial[0][i + j] += x;
+// partial[1][i + j / 2] += x;
+// partial[2][i] += x;
+// partial[3][3 + i - j / 2] += x;
+// partial[4][7 + i - j] += x;
+// partial[5][3 - i / 2 + j] += x;
+// partial[6][j] += x;
+// partial[7][i / 2 + j] += x;
+// }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m128i* v_src_16,
+ __m128i* partial_lo,
+ __m128i* partial_hi) {
+ // 00 01 02 03 04 05 06 07
+ *partial_lo = v_src_16[0];
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = _mm_setzero_si128();
+
+ // 00 10 11 12 13 14 15 16
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[1], 2));
+ // 17 00 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[1], 14));
+
+ // 00 00 20 21 22 23 24 25
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[2], 4));
+ // 26 27 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[2], 12));
+
+ // 00 00 00 30 31 32 33 34
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[3], 6));
+ // 35 36 37 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[3], 10));
+
+ // 00 00 00 00 40 41 42 43
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[4], 8));
+ // 44 45 46 47 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[4], 8));
+
+ // 00 00 00 00 00 50 51 52
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[5], 10));
+ // 53 54 55 56 57 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[5], 6));
+
+ // 00 00 00 00 00 00 60 61
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[6], 12));
+ // 62 63 64 65 66 67 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[6], 4));
+
+ // 00 00 00 00 00 00 00 70
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[7], 14));
+ // 71 72 73 74 75 76 77 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m128i* v_src_16,
+ __m128i* partial_lo,
+ __m128i* partial_hi) {
+ __m128i v_d1_temp[8];
+ const __m128i v_zero = _mm_setzero_si128();
+
+ for (int i = 0; i < 8; ++i) {
+ v_d1_temp[i] = _mm_hadd_epi16(v_src_16[i], v_zero);
+ }
+
+ *partial_lo = *partial_hi = v_zero;
+ // A0 A1 A2 A3 00 00 00 00
+ *partial_lo = _mm_add_epi16(*partial_lo, v_d1_temp[0]);
+
+ // 00 B0 B1 B2 B3 00 00 00
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[1], 2));
+
+ // 00 00 C0 C1 C2 C3 00 00
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[2], 4));
+ // 00 00 00 D0 D1 D2 D3 00
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[3], 6));
+ // 00 00 00 00 E0 E1 E2 E3
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[4], 8));
+
+ // 00 00 00 00 00 F0 F1 F2
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[5], 10));
+ // F3 00 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[5], 6));
+
+ // 00 00 00 00 00 00 G0 G1
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[6], 12));
+ // G2 G3 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[6], 4));
+
+ // 00 00 00 00 00 00 00 H0
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[7], 14));
+ // H1 H2 H3 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(__m128i* v_src, __m128i* partial_lo,
+ __m128i* partial_hi) {
+ __m128i v_pair_add[4];
+ // Add vertical source pairs.
+ v_pair_add[0] = _mm_add_epi16(v_src[0], v_src[1]);
+ v_pair_add[1] = _mm_add_epi16(v_src[2], v_src[3]);
+ v_pair_add[2] = _mm_add_epi16(v_src[4], v_src[5]);
+ v_pair_add[3] = _mm_add_epi16(v_src[6], v_src[7]);
+
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ *partial_lo = v_pair_add[0];
+ // 00 00 00 00 00 00 00 00
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = _mm_setzero_si128();
+
+ // 00 20 21 22 23 24 25 26
+ // 00 30 31 32 33 34 35 36
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[1], 2));
+ // 27 00 00 00 00 00 00 00
+ // 37 00 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[1], 14));
+
+ // 00 00 40 41 42 43 44 45
+ // 00 00 50 51 52 53 54 55
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[2], 4));
+ // 46 47 00 00 00 00 00 00
+ // 56 57 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[2], 12));
+
+ // 00 00 00 60 61 62 63 64
+ // 00 00 00 70 71 72 73 74
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[3], 6));
+ // 65 66 67 00 00 00 00 00
+ // 75 76 77 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[3], 10));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t stride, __m128i* partial_lo,
+ __m128i* partial_hi) {
+ // 8x8 input
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ __m128i v_src[8];
+ for (auto& i : v_src) {
+ i = LoadLo8(src);
+ src += stride;
+ }
+
+ const __m128i v_zero = _mm_setzero_si128();
+ // partial for direction 2
+ // --------------------------------------------------------------------------
+ // partial[2][i] += x;
+ // 00 10 20 30 40 50 60 70 00 00 00 00 00 00 00 00
+  // 01 11 21 31 41 51 61 71 00 00 00 00 00 00 00 00
+  // 02 12 22 32 42 52 62 72 00 00 00 00 00 00 00 00
+ // 03 13 23 33 43 53 63 73 00 00 00 00 00 00 00 00
+ // 04 14 24 34 44 54 64 74 00 00 00 00 00 00 00 00
+ // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
+ // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
+ // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
+ const __m128i v_src_4_0 = _mm_unpacklo_epi64(v_src[0], v_src[4]);
+ const __m128i v_src_5_1 = _mm_unpacklo_epi64(v_src[1], v_src[5]);
+ const __m128i v_src_6_2 = _mm_unpacklo_epi64(v_src[2], v_src[6]);
+ const __m128i v_src_7_3 = _mm_unpacklo_epi64(v_src[3], v_src[7]);
+ const __m128i v_hsum_4_0 = _mm_sad_epu8(v_src_4_0, v_zero);
+ const __m128i v_hsum_5_1 = _mm_sad_epu8(v_src_5_1, v_zero);
+ const __m128i v_hsum_6_2 = _mm_sad_epu8(v_src_6_2, v_zero);
+ const __m128i v_hsum_7_3 = _mm_sad_epu8(v_src_7_3, v_zero);
+ const __m128i v_hsum_1_0 = _mm_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
+ const __m128i v_hsum_3_2 = _mm_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
+ const __m128i v_hsum_5_4 = _mm_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
+ const __m128i v_hsum_7_6 = _mm_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
+ partial_lo[2] =
+ _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
+ _mm_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
+
+ __m128i v_src_16[8];
+ for (int i = 0; i < 8; ++i) {
+ v_src_16[i] = _mm_cvtepu8_epi16(v_src[i]);
+ }
+
+ // partial for direction 6
+ // --------------------------------------------------------------------------
+ // partial[6][j] += x;
+ // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 00
+ // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 00
+ // 20 21 22 23 24 25 26 27 00 00 00 00 00 00 00 00
+ // 30 31 32 33 34 35 36 37 00 00 00 00 00 00 00 00
+ // 40 41 42 43 44 45 46 47 00 00 00 00 00 00 00 00
+ // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
+ // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
+ // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
+ partial_lo[6] = v_src_16[0];
+ for (int i = 1; i < 8; ++i) {
+ partial_lo[6] = _mm_add_epi16(partial_lo[6], v_src_16[i]);
+ }
+
+ // partial for direction 0
+ AddPartial_D0_D4(v_src_16, &partial_lo[0], &partial_hi[0]);
+
+ // partial for direction 1
+ AddPartial_D1_D3(v_src_16, &partial_lo[1], &partial_hi[1]);
+
+ // partial for direction 7
+ AddPartial_D5_D7(v_src_16, &partial_lo[7], &partial_hi[7]);
+
+ __m128i v_src_reverse[8];
+ const __m128i reverser =
+ _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
+ for (int i = 0; i < 8; ++i) {
+ v_src_reverse[i] = _mm_shuffle_epi8(v_src_16[i], reverser);
+ }
+
+ // partial for direction 4
+ AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
+
+ // partial for direction 3
+ AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
+
+ // partial for direction 5
+ AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
+}
+
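+// Horizontal sum of the four 32-bit elements in |a|.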
+inline uint32_t SumVector_S32(__m128i a) {
+ a = _mm_hadd_epi32(a, a);
+ a = _mm_add_epi32(a, _mm_srli_si128(a, 4));
+ return _mm_cvtsi128_si32(a);
+}
+
+// |cost[0]| and |cost[4]| square each input element, sum it with the
+// corresponding element from the other end of the vector, and multiply by the
+// matching |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+// kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
+inline uint32_t Cost0Or4(const __m128i a, const __m128i b,
+ const __m128i division_table[2]) {
+ // Reverse and clear upper 2 bytes.
+ const __m128i reverser = _mm_set_epi32(static_cast<int>(0x80800100),
+ 0x03020504, 0x07060908, 0x0b0a0d0c);
+ // 14 13 12 11 10 09 08 ZZ
+ const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
+ // 00 14 01 13 02 12 03 11
+ const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
+ // 04 10 05 09 06 08 07 ZZ
+ const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
+
+ // Square(partial[0][i]) + Square(partial[0][14 - i])
+ const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
+ const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
+
+ const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
+ const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
+ return SumVector_S32(_mm_add_epi32(c, d));
+}
+
+inline uint32_t CostOdd(const __m128i a, const __m128i b,
+ const __m128i division_table[2]) {
+ // Reverse and clear upper 10 bytes.
+ const __m128i reverser =
+ _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
+ static_cast<int>(0x80800100), 0x03020504);
+ // 10 09 08 ZZ ZZ ZZ ZZ ZZ
+ const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
+ // 00 10 01 09 02 08 03 ZZ
+ const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
+ // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
+ const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
+
+ // Square(partial[0][i]) + Square(partial[0][10 - i])
+ const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
+ const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
+
+ const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
+ const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
+ return SumVector_S32(_mm_add_epi32(c, d));
+}
+
+// Sum of squared elements.
+inline uint32_t SquareSum_S16(const __m128i a) {
+ const __m128i square = _mm_madd_epi16(a, a);
+ return SumVector_S32(square);
+}
+
+void CdefDirection_SSE4_1(const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride,
+ uint8_t* LIBGAV1_RESTRICT const direction,
+ int* LIBGAV1_RESTRICT const variance) {
+ assert(direction != nullptr);
+ assert(variance != nullptr);
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t cost[8];
+ __m128i partial_lo[8], partial_hi[8];
+
+ AddPartial(src, stride, partial_lo, partial_hi);
+
+ cost[2] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[2]);
+ cost[6] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[6]);
+
+ const __m128i division_table[2] = {LoadUnaligned16(kCdefDivisionTable),
+ LoadUnaligned16(kCdefDivisionTable + 4)};
+
+ cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
+ cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
+
+ const __m128i division_table_odd[2] = {
+ LoadAligned16(kCdefDivisionTableOddPadded),
+ LoadAligned16(kCdefDivisionTableOddPadded + 4)};
+
+ cost[1] = CostOdd(partial_lo[1], partial_hi[1], division_table_odd);
+ cost[3] = CostOdd(partial_lo[3], partial_hi[3], division_table_odd);
+ cost[5] = CostOdd(partial_lo[5], partial_hi[5], division_table_odd);
+ cost[7] = CostOdd(partial_lo[7], partial_hi[7], division_table_odd);
+
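+  // The direction with the largest cost wins; |variance| measures how
+  // dominant it is over the orthogonal direction, (*direction + 4) & 7.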
+ uint32_t best_cost = 0;
+ *direction = 0;
+ for (int i = 0; i < 8; ++i) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ *direction = i;
+ }
+ }
+ *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+ const ptrdiff_t stride, __m128i* output,
+ const int direction) {
+ // Each |direction| describes a different set of source values. Expand this
+ // set by negating each set. For |direction| == 0 this gives a diagonal line
+ // from top right to bottom left. The first value is y, the second x. Negative
+ // y values move up.
+ // a b c d
+ // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+ // c
+ // a
+ // 0
+ // b
+ // d
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
+ output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
+ output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
+ output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+ const ptrdiff_t stride, __m128i* output,
+ const int direction) {
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
+ src - y_0 * stride + stride - x_0);
+ output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
+ src + y_0 * stride + stride + x_0);
+ output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
+ src - y_1 * stride + stride - x_1);
+ output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
+ src + y_1 * stride + stride + x_1);
+}
+
+inline __m128i Constrain(const __m128i& pixel, const __m128i& reference,
+ const __m128i& damping, const __m128i& threshold) {
+ const __m128i diff = _mm_sub_epi16(pixel, reference);
+ const __m128i abs_diff = _mm_abs_epi16(diff);
+ // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
+ // 0, std::abs(diff))
+ const __m128i shifted_diff = _mm_srl_epi16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+  // [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always be
+  // larger than threshold, so the saturating subtraction returns 0 when
+  // pixel == kCdefLargeValue.
+ static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+ const __m128i thresh_minus_shifted_diff =
+ _mm_subs_epu16(threshold, shifted_diff);
+ const __m128i clamp_abs_diff =
+ _mm_min_epi16(thresh_minus_shifted_diff, abs_diff);
+ // Restore the sign.
+ return _mm_sign_epi16(clamp_abs_diff, diff);
+}
+
+inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val,
+ const __m128i& tap, const __m128i& damping,
+ const __m128i& threshold) {
+ const __m128i constrained = Constrain(val, pixel, damping, threshold);
+ return _mm_mullo_epi16(constrained, tap);
+}
+
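+// |width| selects the 8-wide or 4-wide path; the 4-wide path handles two rows
+// per iteration. Primary and secondary filtering may be disabled
+// independently, and clipping is only needed when both are enabled.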
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_SSE4_1(const uint16_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride, const int height,
+ const int primary_strength, const int secondary_strength,
+ const int damping, const int direction,
+ void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dst_stride) {
+ static_assert(width == 8 || width == 4, "Invalid CDEF width.");
+ static_assert(enable_primary || enable_secondary, "");
+ constexpr bool clipping_required = enable_primary && enable_secondary;
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i primary_damping_shift, secondary_damping_shift;
+
+ // FloorLog2() requires input to be > 0.
+ // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+ if (enable_primary) {
+ // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+ // for UV filtering.
+ primary_damping_shift =
+ _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
+ }
+ if (enable_secondary) {
+ // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+ // necessary.
+ assert(damping - FloorLog2(secondary_strength) >= 0);
+ secondary_damping_shift =
+ _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
+ }
+
+ const __m128i primary_tap_0 =
+ _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][0]);
+ const __m128i primary_tap_1 =
+ _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][1]);
+ const __m128i secondary_tap_0 = _mm_set1_epi16(kCdefSecondaryTap0);
+ const __m128i secondary_tap_1 = _mm_set1_epi16(kCdefSecondaryTap1);
+ const __m128i cdef_large_value_mask =
+ _mm_set1_epi16(static_cast<int16_t>(~kCdefLargeValue));
+ const __m128i primary_threshold = _mm_set1_epi16(primary_strength);
+ const __m128i secondary_threshold = _mm_set1_epi16(secondary_strength);
+
+ int y = height;
+ do {
+ __m128i pixel;
+ if (width == 8) {
+ pixel = LoadUnaligned16(src);
+ } else {
+ pixel = LoadHi8(LoadLo8(src), src + src_stride);
+ }
+
+ __m128i min = pixel;
+ __m128i max = pixel;
+ __m128i sum;
+
+ if (enable_primary) {
+ // Primary |direction|.
+ __m128i primary_val[4];
+ if (width == 8) {
+ LoadDirection(src, src_stride, primary_val, direction);
+ } else {
+ LoadDirection4(src, src_stride, primary_val, direction);
+ }
+
+ if (clipping_required) {
+ min = _mm_min_epu16(min, primary_val[0]);
+ min = _mm_min_epu16(min, primary_val[1]);
+ min = _mm_min_epu16(min, primary_val[2]);
+ min = _mm_min_epu16(min, primary_val[3]);
+
+        // The source is 16 bits; however, only the lower 8 bits matter. The
+        // upper 8 bits contain the "large" flag. After the final primary max
+        // has been calculated, zero out the upper 8 bits and use the result
+        // to find the "16 bit" max.
+ const __m128i max_p01 = _mm_max_epu8(primary_val[0], primary_val[1]);
+ const __m128i max_p23 = _mm_max_epu8(primary_val[2], primary_val[3]);
+ const __m128i max_p = _mm_max_epu8(max_p01, max_p23);
+ max = _mm_max_epu16(max, _mm_and_si128(max_p, cdef_large_value_mask));
+ }
+
+ sum = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
+ primary_damping_shift, primary_threshold);
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_0,
+ primary_damping_shift, primary_threshold));
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[2], primary_tap_1,
+ primary_damping_shift, primary_threshold));
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[3], primary_tap_1,
+ primary_damping_shift, primary_threshold));
+ } else {
+ sum = _mm_setzero_si128();
+ }
+
+ if (enable_secondary) {
+ // Secondary |direction| values (+/- 2). Clamp |direction|.
+ __m128i secondary_val[8];
+ if (width == 8) {
+ LoadDirection(src, src_stride, secondary_val, direction + 2);
+ LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
+ } else {
+ LoadDirection4(src, src_stride, secondary_val, direction + 2);
+ LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
+ }
+
+ if (clipping_required) {
+ min = _mm_min_epu16(min, secondary_val[0]);
+ min = _mm_min_epu16(min, secondary_val[1]);
+ min = _mm_min_epu16(min, secondary_val[2]);
+ min = _mm_min_epu16(min, secondary_val[3]);
+ min = _mm_min_epu16(min, secondary_val[4]);
+ min = _mm_min_epu16(min, secondary_val[5]);
+ min = _mm_min_epu16(min, secondary_val[6]);
+ min = _mm_min_epu16(min, secondary_val[7]);
+
+ const __m128i max_s01 =
+ _mm_max_epu8(secondary_val[0], secondary_val[1]);
+ const __m128i max_s23 =
+ _mm_max_epu8(secondary_val[2], secondary_val[3]);
+ const __m128i max_s45 =
+ _mm_max_epu8(secondary_val[4], secondary_val[5]);
+ const __m128i max_s67 =
+ _mm_max_epu8(secondary_val[6], secondary_val[7]);
+ const __m128i max_s = _mm_max_epu8(_mm_max_epu8(max_s01, max_s23),
+ _mm_max_epu8(max_s45, max_s67));
+ max = _mm_max_epu16(max, _mm_and_si128(max_s, cdef_large_value_mask));
+ }
+
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[4], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[5], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[6], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[7], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ }
+    // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)
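+    // srai(sum, 15) is -1 for negative elements and 0 otherwise,
+    // implementing the -(sum < 0) term.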
+ const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15);
+ // 8 + sum
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(8));
+ // (... - (sum < 0)) >> 4
+ sum = _mm_add_epi16(sum, sum_lt_0);
+ sum = _mm_srai_epi16(sum, 4);
+ // pixel + ...
+ sum = _mm_add_epi16(sum, pixel);
+ if (clipping_required) {
+ // Clip3
+ sum = _mm_min_epi16(sum, max);
+ sum = _mm_max_epi16(sum, min);
+ }
+
+ const __m128i result = _mm_packus_epi16(sum, sum);
+ if (width == 8) {
+ src += src_stride;
+ StoreLo8(dst, result);
+ dst += dst_stride;
+ --y;
+ } else {
+ src += src_stride << 1;
+ Store4(dst, result);
+ dst += dst_stride;
+ Store4(dst, _mm_srli_si128(result, 4));
+ dst += dst_stride;
+ y -= 2;
+ }
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ dsp->cdef_direction = CdefDirection_SSE4_1;
+ dsp->cdef_filters[0][0] = CdefFilter_SSE4_1<4>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_SSE4_1<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] = CdefFilter_SSE4_1<4, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_SSE4_1<8>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_SSE4_1<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] = CdefFilter_SSE4_1<8, /*enable_primary=*/false>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void CdefInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+
+#include <immintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+namespace libgav1 {
+namespace dsp {
+namespace avx2 {
+
+#include "src/dsp/x86/common_avx2.inc"
+#include "src/dsp/x86/common_sse4.inc"
+
+} // namespace avx2
+
+// NOLINTBEGIN(misc-unused-using-decls)
+// These function aliases shall not be visible to external code. They are
+// restricted to x86/*_avx2.cc files only. This scheme exists to distinguish two
+// possible implementations of common functions, which may differ based on
+// whether the compiler is permitted to use avx2 instructions.
+
+// common_sse4.inc
+using avx2::Load2;
+using avx2::Load2x2;
+using avx2::Load4;
+using avx2::Load4x2;
+using avx2::LoadAligned16;
+using avx2::LoadAligned16Msan;
+using avx2::LoadHi8;
+using avx2::LoadHi8Msan;
+using avx2::LoadLo8;
+using avx2::LoadLo8Msan;
+using avx2::LoadUnaligned16;
+using avx2::LoadUnaligned16Msan;
+using avx2::MaskHighNBytes;
+using avx2::RightShiftWithRounding_S16;
+using avx2::RightShiftWithRounding_S32;
+using avx2::RightShiftWithRounding_U16;
+using avx2::RightShiftWithRounding_U32;
+using avx2::Store2;
+using avx2::Store4;
+using avx2::StoreAligned16;
+using avx2::StoreHi8;
+using avx2::StoreLo8;
+using avx2::StoreUnaligned16;
+
+// common_avx2.inc
+using avx2::LoadAligned32;
+using avx2::LoadAligned32Msan;
+using avx2::LoadAligned64;
+using avx2::LoadAligned64Msan;
+using avx2::LoadUnaligned32;
+using avx2::LoadUnaligned32Msan;
+using avx2::SetrM128i;
+using avx2::StoreAligned32;
+using avx2::StoreAligned64;
+using avx2::StoreUnaligned32;
+// NOLINTEND
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_AVX2
+#endif // LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//------------------------------------------------------------------------------
+// Compatibility functions.
+
+inline __m256i SetrM128i(const __m128i lo, const __m128i hi) {
+ // For compatibility with older gcc toolchains (< 8) use
+ // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc versions
+ // implement _mm256_setr_m128i similarly to the following; clang uses a
+ // different method, but no differences in assembly have been observed.
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
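+
+// A minimal usage sketch (illustrative only; |row0| and |row1| are
+// hypothetical pointers): composing two 128-bit loads into one 256-bit
+// register, as the convolve code below does to process two rows at a time.
+//   const __m256i rows = SetrM128i(LoadUnaligned16(row0),   // bits [0, 128)
+//                                  LoadUnaligned16(row1));  // bits [128, 256)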
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m256i LoadAligned32(const void* a) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ return _mm256_load_si256(static_cast<const __m256i*>(a));
+}
+
+inline void LoadAligned64(const void* a, __m256i dst[2]) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0);
+ dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1);
+}
+
+inline __m256i LoadUnaligned32(const void* a) {
+ return _mm256_loadu_si256(static_cast<const __m256i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m256i MaskOverreads(const __m256i source,
+ const ptrdiff_t over_read_in_bytes) {
+ __m256i dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes >= 32) return _mm256_setzero_si256();
+ if (over_read_in_bytes > 0) {
+ __m128i m = _mm_set1_epi8(-1);
+ for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) {
+ m = _mm_srli_si128(m, 1);
+ }
+ const __m256i mask = (over_read_in_bytes < 16)
+ ? SetrM128i(_mm_set1_epi8(-1), m)
+ : SetrM128i(m, _mm_setzero_si128());
+ dst = _mm256_and_si256(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
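+
+// Worked example, for illustration: with over_read_in_bytes == 20 the loop
+// shifts |m| right by 20 % 16 == 4 bytes, leaving its low 12 bytes set. Since
+// 20 >= 16 the mask is SetrM128i(m, zero), so only bytes [0, 12) of the
+// 256-bit load survive, i.e. exactly 32 - over_read_in_bytes valid bytes.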
+
+inline __m256i LoadAligned32Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+}
+
+inline void LoadAligned64Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i dst[2]) {
+ dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+ dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1),
+ over_read_in_bytes);
+}
+
+inline __m256i LoadUnaligned32Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void StoreAligned32(void* a, const __m256i v) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ _mm256_store_si256(static_cast<__m256i*>(a), v);
+}
+
+inline void StoreAligned64(void* a, const __m256i v[2]) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]);
+ _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]);
+}
+
+inline void StoreUnaligned32(void* a, const __m256i v) {
+ _mm256_storeu_si256(static_cast<__m256i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) {
+ assert(bits <= 16);
+ const __m256i v_bias_d =
+ _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+ const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d);
+ return _mm256_srai_epi16(v_tmp_d, bits);
+}
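+
+// The bias implements a rounding shift: (v + ((1 << bits) >> 1)) >> bits,
+// which rounds halves up. For example, bits == 4 and v == 24 gives
+// (24 + 8) >> 4 == 2, matching the scalar RightShiftWithRounding(); see
+// common_avx2_test.cc for the wraparound caveat near INT16_MAX.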
+
+inline __m256i RightShiftWithRounding_S32(const __m256i v_val_d, int bits) {
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i v_tmp_d = _mm256_add_epi32(v_val_d, v_bias_d);
+ return _mm256_srai_epi32(v_tmp_d, bits);
+}
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/common_avx2_test.h"
+
+#include "gtest/gtest.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+
+#include <cstdint>
+
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Show that RightShiftWithRounding_S16() is equal to
+// RightShiftWithRounding() only for values less than or equal to
+// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
+// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
+// negative values.
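+// Concrete instance of the overflow, taking bits == 1: value == INT16_MAX
+// plus bias == 1 wraps to INT16_MIN, so the SIMD result is
+// INT16_MIN >> 1 == -16384, while the scalar result is (32767 + 1) >> 1 ==
+// 16384 == 1 << (15 - bits); the results differ only in sign, as checked
+// below.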
+void AVX2RightShiftWithRoundingS16Test() {
+ for (int bits = 0; bits < 16; ++bits) {
+ const int bias = (1 << bits) >> 1;
+ for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
+ const __m256i v_val_d = _mm256_set1_epi16(value);
+ const __m256i v_result_d = RightShiftWithRounding_S16(v_val_d, bits);
+ // Note _mm256_extract_epi16 is avoided for compatibility with Visual
+ // Studio < 2017.
+ const int16_t result =
+ _mm_extract_epi16(_mm256_extracti128_si256(v_result_d, 0), 0);
+ const int32_t expected = RightShiftWithRounding(value, bits);
+ if (value <= INT16_MAX - bias) {
+ EXPECT_EQ(result, expected) << "value: " << value << ", bits: " << bits;
+ } else {
+ EXPECT_EQ(expected, 1 << (15 - bits));
+ EXPECT_EQ(result, -expected)
+ << "value: " << value << ", bits: " << bits;
+ }
+ }
+ }
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_AVX2
+
+namespace libgav1 {
+namespace dsp {
+
+void AVX2RightShiftWithRoundingS16Test() {
+ GTEST_SKIP() << "Build this module for x86(-64) with AVX2 enabled to run "
+ "the tests.";
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_AVX2
--- /dev/null
+// Copyright 2023 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_AVX2_TEST_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_AVX2_TEST_H_
+
+namespace libgav1 {
+namespace dsp {
+
+void AVX2RightShiftWithRoundingS16Test();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_X86_COMMON_AVX2_TEST_H_
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#if 0
+#include <cinttypes>
+#include <cstdio>
+
+// Useful macros for debugging. Left here for convenience.
+inline void PrintReg(const __m128i r, const char* const name, int size) {
+ int n;
+ union {
+ __m128i r;
+ uint8_t i8[16];
+ uint16_t i16[8];
+ uint32_t i32[4];
+ uint64_t i64[2];
+ } tmp;
+ tmp.r = r;
+ fprintf(stderr, "%s\t: ", name);
+ if (size == 8) {
+ for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
+ } else if (size == 16) {
+ for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
+ } else if (size == 32) {
+ for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
+ } else {
+ for (n = 0; n < 2; ++n)
+ fprintf(stderr, "%.16" PRIx64 " ", static_cast<uint64_t>(tmp.i64[n]));
+ }
+ fprintf(stderr, "\n");
+}
+
+inline void PrintReg(const int r, const char* const name) {
+ fprintf(stderr, "%s: %d\n", name, r);
+}
+
+inline void PrintRegX(const int r, const char* const name) {
+ fprintf(stderr, "%s: %.8x\n", name, r);
+}
+
+#define PR(var, N) PrintReg(var, #var, N)
+#define PD(var) PrintReg(var, #var);
+#define PX(var) PrintRegX(var, #var);
+
+#if LIBGAV1_MSAN
+#include <sanitizer/msan_interface.h>
+
+inline void PrintShadow(const void* r, const char* const name,
+ const size_t size) {
+ fprintf(stderr, "Shadow for %s:\n", name);
+ __msan_print_shadow(r, size);
+}
+#define PS(var, N) PrintShadow(var, #var, N)
+
+#endif // LIBGAV1_MSAN
+
+#endif // 0
+
+namespace libgav1 {
+namespace dsp {
+namespace sse4 {
+
+#include "src/dsp/x86/common_sse4.inc"
+
+} // namespace sse4
+
+// NOLINTBEGIN(misc-unused-using-decls)
+// These function aliases shall not be visible to external code. They are
+// restricted to x86/*_sse4.cc files only. This scheme exists to distinguish two
+// possible implementations of common functions, which may differ based on
+// whether the compiler is permitted to use avx2 instructions.
+using sse4::Load2;
+using sse4::Load2x2;
+using sse4::Load4;
+using sse4::Load4x2;
+using sse4::LoadAligned16;
+using sse4::LoadAligned16Msan;
+using sse4::LoadHi8;
+using sse4::LoadHi8Msan;
+using sse4::LoadLo8;
+using sse4::LoadLo8Msan;
+using sse4::LoadUnaligned16;
+using sse4::LoadUnaligned16Msan;
+using sse4::MaskHighNBytes;
+using sse4::RightShiftWithRounding_S16;
+using sse4::RightShiftWithRounding_S32;
+using sse4::RightShiftWithRounding_U16;
+using sse4::RightShiftWithRounding_U32;
+using sse4::Store2;
+using sse4::Store4;
+using sse4::StoreAligned16;
+using sse4::StoreHi8;
+using sse4::StoreLo8;
+using sse4::StoreUnaligned16;
+// NOLINTEND
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+#endif // LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m128i Load2(const void* src) {
+ int16_t val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load2x2(const void* src1, const void* src2) {
+ uint16_t val1;
+ uint16_t val2;
+ memcpy(&val1, src1, sizeof(val1));
+ memcpy(&val2, src2, sizeof(val2));
+ return _mm_cvtsi32_si128(val1 | (val2 << 16));
+}
+
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline __m128i Load2(const void* const buf, __m128i val) {
+ int16_t temp;
+ memcpy(&temp, buf, 2);
+ return _mm_insert_epi16(val, temp, lane);
+}
+
+inline __m128i Load4(const void* src) {
+ // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+ // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+ // movss instruction.
+ //
+ // Until compiler support of _mm_loadu_si32 is widespread, use of
+ // _mm_loadu_si32 is banned.
+ int val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load4x2(const void* src1, const void* src2) {
+ // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+ // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+ // movss instruction.
+ //
+ // Until compiler support of _mm_loadu_si32 is widespread, use of
+ // _mm_loadu_si32 is banned.
+ int val1, val2;
+ memcpy(&val1, src1, sizeof(val1));
+ memcpy(&val2, src2, sizeof(val2));
+ return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
+}
+
+inline __m128i LoadLo8(const void* a) {
+ return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadHi8(const __m128i v, const void* a) {
+ const __m128 x =
+ _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
+ return _mm_castps_si128(x);
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+ return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadAligned16(const void* a) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+ return _mm_load_si128(static_cast<const __m128i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m128i MaskOverreads(const __m128i source,
+ const ptrdiff_t over_read_in_bytes) {
+ __m128i dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes > 0) {
+ __m128i mask = _mm_set1_epi8(-1);
+ for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) {
+ mask = _mm_srli_si128(mask, 1);
+ }
+ dst = _mm_and_si128(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
+
+inline __m128i LoadLo8Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
+}
+
+inline __m128i LoadHi8Msan(const __m128i v, const void* source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadHi8(v, source), over_read_in_bytes);
+}
+
+inline __m128i LoadAligned16Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
+}
+
+inline __m128i LoadUnaligned16Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void Store2(void* dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, 2);
+}
+
+inline void Store4(void* dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, sizeof(val));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+ _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreHi8(void* a, const __m128i v) {
+ _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
+}
+
+inline void StoreAligned16(void* a, const __m128i v) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+ _mm_store_si128(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+ _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
+ assert(bits <= 16);
+ // Shift out all but the last bit.
+ const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
+ // Avg with zero will shift by 1 and round.
+ return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128());
+}
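+
+// Worked example of the trick above: for v == 7, bits == 2, the first shift
+// yields 7 >> 1 == 3 and _mm_avg_epu16(3, 0) computes (3 + 0 + 1) >> 1 == 2,
+// matching the direct rounding shift (7 + 2) >> 2 == 2. The avg instruction
+// supplies both the final shift and the rounding increment.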
+
+inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
+ assert(bits < 16);
+ const __m128i v_bias_d =
+ _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+ const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
+ return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+// Use this when |bits| is not an immediate value.
+inline __m128i VariableRightShiftWithRounding_S32(const __m128i v_val_d,
+ int bits) {
+ const __m128i v_bias_d =
+ _mm_set1_epi32(static_cast<int32_t>((1 << bits) >> 1));
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_sra_epi32(v_tmp_d, _mm_cvtsi32_si128(bits));
+}
+
+//------------------------------------------------------------------------------
+// Masking utilities
+inline __m128i MaskHighNBytes(int n) {
+ static constexpr uint8_t kMask[32] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ };
+
+ return LoadUnaligned16(kMask + n);
+}
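+
+// For example, MaskHighNBytes(4) loads bytes [4, 20) of kMask, yielding a
+// mask whose low 12 bytes are 0x00 and whose high 4 bytes are 0xff.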
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/common_sse4_test.h"
+
+#include "gtest/gtest.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <cstdint>
+
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Show that RightShiftWithRounding_S16() is equal to
+// RightShiftWithRounding() only for values less than or equal to
+// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
+// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
+// negative values.
+void SSE41RightShiftWithRoundingS16Test() {
+ for (int bits = 0; bits < 16; ++bits) {
+ const int bias = (1 << bits) >> 1;
+ for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
+ const __m128i v_val_d = _mm_set1_epi16(value);
+ const __m128i v_result_d = RightShiftWithRounding_S16(v_val_d, bits);
+ const int16_t result = _mm_extract_epi16(v_result_d, 0);
+ const int32_t expected = RightShiftWithRounding(value, bits);
+ if (value <= INT16_MAX - bias) {
+ EXPECT_EQ(result, expected) << "value: " << value << ", bits: " << bits;
+ } else {
+ EXPECT_EQ(expected, 1 << (15 - bits));
+ EXPECT_EQ(result, -expected)
+ << "value: " << value << ", bits: " << bits;
+ }
+ }
+ }
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void SSE41RightShiftWithRoundingS16Test() {
+ GTEST_SKIP() << "Build this module for x86(-64) with SSE4 enabled to run "
+ "the tests.";
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+// Copyright 2023 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_TEST_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_TEST_H_
+
+namespace libgav1 {
+namespace dsp {
+
+void SSE41RightShiftWithRoundingS16Test();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_X86_COMMON_SSE4_TEST_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/x86/convolve_sse4.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This keeps the final
+// sum within the range of int16_t.
+template <int num_taps>
+__m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
+ __m256i sum;
+ if (num_taps == 6) {
+ // 6 taps.
+ const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]); // k2k1
+ const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]); // k4k3
+ const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]); // k6k5
+ sum = _mm256_add_epi16(v_madd_21, v_madd_43);
+ sum = _mm256_add_epi16(sum, v_madd_65);
+ } else if (num_taps == 8) {
+ // 8 taps.
+ const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]); // k1k0
+ const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]); // k3k2
+ const __m256i v_madd_54 = _mm256_maddubs_epi16(src[2], taps[2]); // k5k4
+ const __m256i v_madd_76 = _mm256_maddubs_epi16(src[3], taps[3]); // k7k6
+ const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32);
+ const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76);
+ sum = _mm256_add_epi16(v_sum_7654, v_sum_3210);
+ } else if (num_taps == 2) {
+ // 2 taps.
+ sum = _mm256_maddubs_epi16(src[0], taps[0]); // k4k3
+ } else {
+ // 4 taps.
+ const __m256i v_madd_32 = _mm256_maddubs_epi16(src[0], taps[0]); // k3k2
+ const __m256i v_madd_54 = _mm256_maddubs_epi16(src[1], taps[1]); // k5k4
+ sum = _mm256_add_epi16(v_madd_32, v_madd_54);
+ }
+ return sum;
+}
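+
+// Back-of-the-envelope headroom check, assuming 8-bit pixels and 7-bit
+// filters: _mm256_maddubs_epi16 saturates each unsigned * signed pair to
+// int16_t, and with unhalved taps a single product could already reach
+// 255 * 128 == 32640. Pre-shifting the taps by 1 halves every product, which
+// is what lets the plain _mm256_add_epi16 calls above combine partial sums
+// without overflow.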
+
+template <int num_taps>
+__m256i SumHorizontalTaps(const __m256i* const src,
+ const __m256i* const v_tap) {
+ __m256i v_src[4];
+ const __m256i src_long = *src;
+ const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long);
+ const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long);
+
+ if (num_taps == 6) {
+ // 6 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
+ } else if (num_taps == 8) {
+ // 8 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
+ } else if (num_taps == 2) {
+ // 2 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ } else {
+ // 4 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ }
+ return SumOnePassTaps<num_taps>(v_src, v_tap);
+}
+
+template <int num_taps>
+__m256i SimpleHorizontalTaps(const __m256i* const src,
+ const __m256i* const v_tap) {
+ __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm256_add_epi16(sum, _mm256_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm256_packus_epi16(sum, sum);
+}
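+
+// Sketch of the combined rounding, assuming kInterRoundBitsHorizontal == 3
+// and kFilterBits == 7 (both shifts drop 1 because the taps are pre-halved):
+//   two-pass:  tmp = (sum + 2) >> 2;  out = (tmp + 8) >> 4
+//   combined:  out = (sum + 2 + 32) >> 6
+// Pre-adding first_shift_rounding_bit == 2 and letting
+// RightShiftWithRounding_S16(sum, 6) supply the remaining 32 (== 8 << 2)
+// matches the two-pass result (ignoring saturation at the int16_t edges).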
+
+template <int num_taps>
+__m256i HorizontalTaps8To16(const __m256i* const src,
+ const __m256i* const v_tap) {
+ const __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+// Filter 2xh sizes.
+template <int num_taps, bool is_2d = false, bool is_compound = false>
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int /*width*/,
+ const int height, const __m128i* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ // Horizontal passes only need to account for |num_taps| 2 and 4 when
+ // |width| <= 4.
+ assert(num_taps <= 4);
+ if (num_taps <= 4) {
+ if (!is_compound) {
+ int y = height;
+ if (is_2d) y -= 1;
+ do {
+ if (is_2d) {
+ const __m128i sum =
+ HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
+ Store4(&dest16[0], sum);
+ dest16 += pred_stride;
+ Store4(&dest16[0], _mm_srli_si128(sum, 8));
+ dest16 += pred_stride;
+ } else {
+ const __m128i sum =
+ SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
+ Store2(dest8, sum);
+ dest8 += pred_stride;
+ Store2(dest8, _mm_srli_si128(sum, 4));
+ dest8 += pred_stride;
+ }
+
+ src += src_stride << 1;
+ y -= 2;
+ } while (y != 0);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
+ if (is_2d) {
+ assert(height % 2 == 1);
+ __m128i sum;
+ const __m128i input = LoadLo8(&src[2]);
+ if (num_taps == 2) {
+ // 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_43 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+ sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ } else {
+ // 02 03 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_32 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+ // 04 05 05 06 06 07 07 08 ...
+ const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+ const __m128i v_madd_32 =
+ _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 =
+ _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_54, v_madd_32);
+ }
+ sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+ Store4(dest16, sum);
+ }
+ }
+ }
+}
+
+// Filter widths >= 4.
+template <int num_taps, bool is_2d = false, bool is_compound = false>
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int width,
+ const int height, const __m256i* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ if (width >= 32) {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ if (is_2d || is_compound) {
+ // Load into two 128-bit lanes.
+ const __m256i src_long =
+ SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8]));
+ const __m256i result =
+ HorizontalTaps8To16<num_taps>(&src_long, v_tap);
+ const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]),
+ LoadUnaligned16(&src[x + 24]));
+ const __m256i result2 =
+ HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
+ if (is_2d) {
+ StoreAligned32(&dest16[x], result);
+ StoreAligned32(&dest16[x + 16], result2);
+ } else {
+ StoreUnaligned32(&dest16[x], result);
+ StoreUnaligned32(&dest16[x + 16], result2);
+ }
+ } else {
+ // Load src used to calculate dest8[7:0] and dest8[23:16].
+ const __m256i src_long = LoadUnaligned32(&src[x]);
+ const __m256i result =
+ SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
+ // Load src used to calculate dest8[15:8] and dest8[31:24].
+ const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
+ const __m256i result2 =
+ SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
+ // Combine results and store.
+ StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
+ }
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+ } else if (width == 16) {
+ int y = height;
+ if (is_2d) y -= 1;
+ do {
+ if (is_2d || is_compound) {
+ // Load into two 128-bit lanes.
+ const __m256i src_long =
+ SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
+ const __m256i src_long2 =
+ SetrM128i(LoadUnaligned16(&src[src_stride]),
+ LoadUnaligned16(&src[8 + src_stride]));
+ const __m256i result2 =
+ HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
+ if (is_2d) {
+ StoreAligned32(&dest16[0], result);
+ StoreAligned32(&dest16[pred_stride], result2);
+ } else {
+ StoreUnaligned32(&dest16[0], result);
+ StoreUnaligned32(&dest16[pred_stride], result2);
+ }
+ } else {
+ // Load into two 128-bit lanes.
+ const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
+ LoadUnaligned16(&src[src_stride]));
+ const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
+ const __m256i src_long2 = SetrM128i(
+ LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
+ const __m256i result2 =
+ SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
+ const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
+ StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
+ StoreUnaligned16(&dest8[pred_stride],
+ _mm256_extracti128_si256(packed_result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+
+ // The 2d filters have an odd |height| during the horizontal pass, so
+ // filter the remaining row.
+ if (is_2d) {
+ const __m256i src_long =
+ SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
+ StoreAligned32(&dest16[0], result);
+ }
+
+ } else if (width == 8) {
+ int y = height;
+ if (is_2d) y -= 1;
+ do {
+ // Load into two 128-bit lanes.
+ const __m128i this_row = LoadUnaligned16(&src[0]);
+ const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+ const __m256i src_long = SetrM128i(this_row, next_row);
+ if (is_2d || is_compound) {
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
+ if (is_2d) {
+ StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
+ StoreAligned16(&dest16[pred_stride],
+ _mm256_extracti128_si256(result, 1));
+ } else {
+ StoreUnaligned16(&dest16[0], _mm256_castsi256_si128(result));
+ StoreUnaligned16(&dest16[pred_stride],
+ _mm256_extracti128_si256(result, 1));
+ }
+ } else {
+ const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
+ StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
+ StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+
+ // The 2d filters have an odd |height| during the horizontal pass, so
+ // filter the remaining row.
+ if (is_2d) {
+ const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
+ StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
+ }
+
+ } else { // width == 4
+ int y = height;
+ if (is_2d) y -= 1;
+ do {
+ // Load into two 128-bit lanes.
+ const __m128i this_row = LoadUnaligned16(&src[0]);
+ const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+ const __m256i src_long = SetrM128i(this_row, next_row);
+ if (is_2d || is_compound) {
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
+ StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
+ StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1));
+ } else {
+ const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
+ Store4(&dest8[0], _mm256_castsi256_si128(result));
+ Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+
+ // The 2d filters have an odd |height| during the horizontal pass, so
+ // filter the remaining row.
+ if (is_2d) {
+ const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
+ StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
+ }
+ }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+ __m256i* v_tap) {
+ if (num_taps == 8) {
+ if (is_2d_vertical) {
+ v_tap[0] = _mm256_broadcastd_epi32(*filter); // k1k0
+ v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4)); // k3k2
+ v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8)); // k5k4
+ v_tap[3] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 12)); // k7k6
+ } else {
+ v_tap[0] = _mm256_broadcastw_epi16(*filter); // k1k0
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
+ v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
+ v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6)); // k7k6
+ }
+ } else if (num_taps == 6) {
+ if (is_2d_vertical) {
+ v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 2)); // k2k1
+ v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6)); // k4k3
+ v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 10)); // k6k5
+ } else {
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1)); // k2k1
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
+ v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5)); // k6k5
+ }
+ } else if (num_taps == 4) {
+ if (is_2d_vertical) {
+ v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4)); // k3k2
+ v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8)); // k5k4
+ } else {
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
+ }
+ } else { // num_taps == 2
+ if (is_2d_vertical) {
+ v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6)); // k4k3
+ } else {
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
+ }
+ }
+}
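+
+// Illustration of the packing, not additional behavior: for the horizontal
+// case the filter bytes k0..k7 live in |*filter|, and _mm256_broadcastw_epi16
+// replicates one adjacent byte pair (e.g. the k3k2 pair after _mm_srli_si128
+// by 2) into every 16-bit lane so _mm256_maddubs_epi16 can multiply each
+// source byte pair by its tap pair. The 2d vertical case instead broadcasts
+// 32-bit pairs of int16_t taps for use with _mm256_madd_epi16.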
+
+template <int num_taps, bool is_compound>
+__m256i SimpleSum2DVerticalTaps(const __m256i* const src,
+ const __m256i* const taps) {
+ __m256i sum_lo =
+ _mm256_madd_epi16(_mm256_unpacklo_epi16(src[0], src[1]), taps[0]);
+ __m256i sum_hi =
+ _mm256_madd_epi16(_mm256_unpackhi_epi16(src[0], src[1]), taps[0]);
+ if (num_taps >= 4) {
+ __m256i madd_lo =
+ _mm256_madd_epi16(_mm256_unpacklo_epi16(src[2], src[3]), taps[1]);
+ __m256i madd_hi =
+ _mm256_madd_epi16(_mm256_unpackhi_epi16(src[2], src[3]), taps[1]);
+ sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+ if (num_taps >= 6) {
+ madd_lo =
+ _mm256_madd_epi16(_mm256_unpacklo_epi16(src[4], src[5]), taps[2]);
+ madd_hi =
+ _mm256_madd_epi16(_mm256_unpackhi_epi16(src[4], src[5]), taps[2]);
+ sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+ if (num_taps == 8) {
+ madd_lo =
+ _mm256_madd_epi16(_mm256_unpacklo_epi16(src[6], src[7]), taps[3]);
+ madd_hi =
+ _mm256_madd_epi16(_mm256_unpackhi_epi16(src[6], src[7]), taps[3]);
+ sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+ }
+ }
+ }
+
+ if (is_compound) {
+ return _mm256_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+
+ return _mm256_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
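+
+// The unpack/madd pattern above, in brief: _mm256_unpacklo_epi16(src[0],
+// src[1]) interleaves 16-bit samples from two rows, and _mm256_madd_epi16
+// multiplies each interleaved pair by its tap pair while widening to 32 bits,
+// so the vertical accumulation cannot overflow before the final rounding
+// shift narrows back to 16 bits via _mm256_packs_epi32.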
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical16xH(const uint16_t* LIBGAV1_RESTRICT src,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m256i* const taps) {
+ assert(width >= 8);
+ constexpr int next_row = num_taps - 1;
+ // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+ const ptrdiff_t src_stride = width;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int x = 0;
+ do {
+ __m256i srcs[8];
+ const uint16_t* src_x = src + x;
+ srcs[0] = LoadAligned32(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadAligned32(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadAligned32(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadAligned32(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadAligned32(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadAligned32(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadAligned32(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ auto* dst8_x = dst8 + x;
+ auto* dst16_x = dst16 + x;
+ int y = height;
+ do {
+ srcs[next_row] = LoadAligned32(src_x);
+ src_x += src_stride;
+
+ const __m256i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned32(dst16_x, sum);
+ dst16_x += dst_stride;
+ } else {
+ const __m128i packed_sum = _mm_packus_epi16(
+ _mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+ StoreUnaligned16(dst8_x, packed_sum);
+ dst8_x += dst_stride;
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (--y != 0);
+ x += 16;
+ } while (x < width);
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
+ const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+ const int width, const int height, const int filter_id,
+ const int filter_index) {
+ assert(filter_id != 0);
+ __m128i v_tap[4];
+ const __m128i v_horizontal_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else { // 2 tap.
+ SetupTaps<2>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ }
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+ const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+ const int width, const int height, const int filter_id,
+ const int filter_index) {
+ assert(filter_id != 0);
+ __m256i v_tap[4];
+ const __m128i v_horizontal_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if (filter_index == 1) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if (filter_index == 0) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else { // 2 tap.
+ SetupTaps<2>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ }
+}
+
+void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* LIBGAV1_RESTRICT prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
+
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
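+ // With kMaxSuperBlockSizeInPixels == 128 and kSubPixelTaps == 8, this is
+ // 128 * 135 uint16_t entries, roughly 34 KB of stack.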
+ alignas(32) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings. Set with a random non-zero value to aid in debugging.
+ memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
+ const int intermediate_height = height + vertical_taps - 1;
+
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+ if (width > 2) {
+ DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result,
+ width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+ } else {
+ // Use the non-AVX2 version for smaller widths.
+ DoHorizontalPass2xH</*is_2d=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+ }
+
+ // Vertical filter.
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+ // Use 256 bits for width > 8.
+ if (width > 8) {
+ __m256i taps_256[4];
+ const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<8>(intermediate_result, dest, dest_stride, width,
+ height, taps_256);
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<6>(intermediate_result, dest, dest_stride, width,
+ height, taps_256);
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<4>(intermediate_result, dest, dest_stride, width,
+ height, taps_256);
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<2>(intermediate_result, dest, dest_stride, width,
+ height, taps_256);
+ }
+ } else { // width <= 8
+ __m128i taps[4];
+ // Use 128 bit code.
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<8>(intermediate_result, dest, dest_stride, width,
+ height, taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<6>(intermediate_result, dest, dest_stride, width,
+ height, taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<4>(intermediate_result, dest, dest_stride, width,
+ height, taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<2>(intermediate_result, dest, dest_stride, width,
+ height, taps);
+ }
+ }
+ }
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+__m256i Compound1DShift(const __m256i sum) {
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, bool unpack_high = false>
+__m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) {
+ __m256i v_src[4];
+
+ if (!unpack_high) {
+ if (num_taps == 6) {
+ // 6 taps.
+ v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
+ } else if (num_taps == 8) {
+ // 8 taps.
+ v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
+ v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]);
+ } else if (num_taps == 2) {
+ // 2 taps.
+ v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+ } else {
+ // 4 taps.
+ v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+ }
+ } else {
+ if (num_taps == 6) {
+ // 6 taps.
+ v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
+ } else if (num_taps == 8) {
+ // 8 taps.
+ v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
+ v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]);
+ } else if (num_taps == 2) {
+ // 2 taps.
+ v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+ } else {
+ // 4 taps.
+ v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+ }
+ }
+ return SumOnePassTaps<num_taps>(v_src, v_tap);
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m256i* const v_tap) {
+ const int next_row = num_taps - 1;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+ assert(width >= 32);
+ int x = 0;
+ do {
+ const uint8_t* src_x = src + x;
+ __m256i srcs[8];
+ srcs[0] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ auto* dst8_x = dst8 + x;
+ auto* dst16_x = dst16 + x;
+ int y = height;
+ do {
+ srcs[next_row] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+
+ const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+ const __m256i sums_hi =
+ SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
+ if (is_compound) {
+ const __m256i results =
+ Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
+ const __m256i results_hi =
+ Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31));
+ StoreUnaligned32(dst16_x, results);
+ StoreUnaligned32(dst16_x + 16, results_hi);
+ dst16_x += dst_stride;
+ } else {
+ const __m256i results =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m256i results_hi =
+ RightShiftWithRounding_S16(sums_hi, kFilterBits - 1);
+ const __m256i packed_results = _mm256_packus_epi16(results, results_hi);
+
+ StoreUnaligned32(dst8_x, packed_results);
+ dst8_x += dst_stride;
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (--y != 0);
+ x += 32;
+ } while (x < width);
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int /*width*/,
+ const int height, const __m256i* const v_tap) {
+ const int next_row = num_taps;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ const uint8_t* src_x = src;
+ __m256i srcs[8 + 1];
+ // The upper 128 bits hold the filter data for the next row.
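+ // In steady state srcs[i] holds row i in its lower 128-bit lane and row
+ // i + 1 in its upper lane, so one pass over |srcs| filters two output rows
+ // at once (one per lane); hence |y| decreases by 2 per iteration.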
+ srcs[0] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[0] =
+ _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1);
+ srcs[2] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[1] =
+ _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1);
+ if (num_taps >= 6) {
+ srcs[3] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[2] =
+ _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1);
+ srcs[4] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[3] =
+ _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1);
+ if (num_taps == 8) {
+ srcs[5] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[4] = _mm256_inserti128_si256(srcs[4],
+ _mm256_castsi256_si128(srcs[5]), 1);
+ srcs[6] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[5] = _mm256_inserti128_si256(srcs[5],
+ _mm256_castsi256_si128(srcs[6]), 1);
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[next_row - 1] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+
+ srcs[next_row - 2] = _mm256_inserti128_si256(
+ srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1);
+
+ srcs[next_row] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+
+ srcs[next_row - 1] = _mm256_inserti128_si256(
+ srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
+
+ const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+ const __m256i sums_hi =
+ SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
+ if (is_compound) {
+ const __m256i results =
+ Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
+ const __m256i results_hi =
+ Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31));
+
+ StoreUnaligned32(dst16, results);
+ StoreUnaligned32(dst16 + dst_stride, results_hi);
+ dst16 += dst_stride << 1;
+ } else {
+ const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m256i results_hi =
+ RightShiftWithRounding_S16(sums_hi, kFilterBits - 1);
+ const __m256i packed_results = _mm256_packus_epi16(results, results_hi);
+ const __m128i this_dst = _mm256_castsi256_si128(packed_results);
+ const __m128i next_dst = _mm256_extracti128_si256(packed_results, 1);
+
+ StoreUnaligned16(dst8, this_dst);
+ StoreUnaligned16(dst8 + dst_stride, next_dst);
+ dst8 += dst_stride << 1;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int /*width*/,
+ const int height, const __m256i* const v_tap) {
+ const int next_row = num_taps;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ const uint8_t* src_x = src;
+ __m256i srcs[8 + 1];
+ // The upper 128 bits hold the source pixels that feed the filter for the
+ // next row, so each loop iteration produces two rows of output.
+ srcs[0] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[0] =
+ _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1);
+ srcs[2] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[1] =
+ _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1);
+ if (num_taps >= 6) {
+ srcs[3] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[2] =
+ _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1);
+ srcs[4] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[3] =
+ _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1);
+ if (num_taps == 8) {
+ srcs[5] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[4] = _mm256_inserti128_si256(srcs[4],
+ _mm256_castsi256_si128(srcs[5]), 1);
+ srcs[6] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[5] = _mm256_inserti128_si256(srcs[5],
+ _mm256_castsi256_si128(srcs[6]), 1);
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[next_row - 1] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+
+ srcs[next_row - 2] = _mm256_inserti128_si256(
+ srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1);
+
+ srcs[next_row] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+
+ srcs[next_row - 1] = _mm256_inserti128_si256(
+ srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
+
+ const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+ if (is_compound) {
+ const __m256i results = Compound1DShift(sums);
+ const __m128i this_dst = _mm256_castsi256_si128(results);
+ const auto next_dst = _mm256_extracti128_si256(results, 1);
+
+ StoreUnaligned16(dst16, this_dst);
+ StoreUnaligned16(dst16 + dst_stride, next_dst);
+ dst16 += dst_stride << 1;
+ } else {
+ const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m256i packed_results = _mm256_packus_epi16(results, results);
+ const __m128i this_dst = _mm256_castsi256_si128(packed_results);
+ const auto next_dst = _mm256_extracti128_si256(packed_results, 1);
+
+ StoreLo8(dst8, this_dst);
+ StoreLo8(dst8 + dst_stride, next_dst);
+ dst8 += dst_stride << 1;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int /*width*/,
+ const int height, const __m128i* const v_tap) {
+ const int next_row = num_taps - 1;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ const uint8_t* src_x = src;
+ __m128i srcs[8];
+ srcs[0] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadLo8(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[next_row] = LoadLo8(src_x);
+ src_x += src_stride;
+
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += dst_stride;
+ } else {
+ const __m128i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ StoreLo8(dst8, _mm_packus_epi16(results, results));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (--y != 0);
+}
+
+void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int vertical_filter_index,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* LIBGAV1_RESTRICT prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
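+ // Step back (vertical_taps / 2 - 1) rows so the first output row has its
+ // full window of taps available above it.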
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+ // Use 256 bits for width > 4.
+ if (width > 4) {
+ __m256i taps_256[4];
+ if (vertical_taps == 6) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<6>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<6>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else {
+ FilterVertical32xH<6>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ }
+ } else if (vertical_taps == 8) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<8>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<8>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else {
+ FilterVertical32xH<8>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ }
+ } else if (vertical_taps == 2) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else {
+ FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ }
+ } else { // 4 tap.
+ SetupTaps<4>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<4>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else {
+ FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ }
+ }
+ } else { // width <= 4
+ // Use 128 bit code.
+ __m128i taps[4];
+
+ if (vertical_taps == 6) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
+ }
+ } else if (vertical_taps == 8) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
+ }
+ } else if (vertical_taps == 2) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ }
+ } else { // 4 tap.
+ SetupTaps<4>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ }
+ }
+ }
+}
+
+void ConvolveCompoundVertical_AVX2(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int vertical_filter_index, const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = width;
+ assert(vertical_filter_id != 0);
+
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+ // Use 256 bits for width > 4.
+ if (width > 4) {
+ __m256i taps_256[4];
+ if (vertical_taps == 6) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<6, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<6, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else {
+ FilterVertical32xH<6, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ }
+ } else if (vertical_taps == 8) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<8, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<8, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else {
+ FilterVertical32xH<8, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ }
+ } else if (vertical_taps == 2) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<2, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<2, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else {
+ FilterVertical32xH<2, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ }
+ } else { // 4 tap.
+ SetupTaps<4>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<4, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<4, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else {
+ FilterVertical32xH<4, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ }
+ }
+ } else { // width <= 4
+ // Use 128 bit code.
+ __m128i taps[4];
+
+ if (vertical_taps == 6) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else if (vertical_taps == 8) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+ FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else if (vertical_taps == 2) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else { // 4 tap.
+ SetupTaps<4>(&v_filter, taps);
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ }
+ }
+}
+
+void ConvolveHorizontal_AVX2(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width > 2) {
+ DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+ } else {
+ // Use the non-AVX2 version for smaller widths.
+ DoHorizontalPass2xH(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+ }
+}
+
+void ConvolveCompoundHorizontal_AVX2(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint8_t*>(prediction);
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+
+#ifdef NDEBUG
+ // Quiet the unused variable warning when asserts are disabled.
+ (void)pred_stride;
+#endif
+
+ DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+ src, reference_stride, dest, width, width, height, horizontal_filter_id,
+ filter_index);
+}
+
+void ConvolveCompound2D_AVX2(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int vertical_filter_index, const int horizontal_filter_id,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
+
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
+ alignas(32) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+ memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
+ const int intermediate_height = height + vertical_taps - 1;
+
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+ DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+ // Use 256 bits for width > 8.
+ if (width > 8) {
+ __m256i taps_256[4];
+ const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<8, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps_256);
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<6, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps_256);
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<4, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps_256);
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<2, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps_256);
+ }
+ } else { // width <= 8
+ __m128i taps[4];
+ // Use 128 bit code.
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<8, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<6, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<4, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<2, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
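+ // The last three indices of the convolve table select compound prediction
+ // and the presence of the vertical and horizontal filters, matching the
+ // entries assigned below.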
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_AVX2;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_AVX2;
+ dsp->convolve[0][0][1][1] = Convolve2D_AVX2;
+
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_AVX2;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_AVX2;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_AVX2;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void ConvolveInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve, see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_AVX2();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If avx2 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the avx2 implementation should be
+// used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_AVX2
+#endif
+
+#endif // LIBGAV1_TARGETING_AVX2
+
+#endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/x86/convolve_sse4.inc"
+
+template <int num_taps>
+__m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
+ const __m128i* const v_tap) {
+ __m128i v_src[4];
+ const __m128i src_long = LoadUnaligned16(src);
+ const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
+ const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
+
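+ // Unpacking |src_long| with itself duplicates every source byte so that,
+ // after the byte shifts below, each 16-bit lane holds an adjacent pixel
+ // pair lined up with one column pair of the halved taps for maddubs.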
+ if (num_taps == 6) {
+ // 6 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
+ v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
+ } else if (num_taps == 8) {
+ // 8 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
+ v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
+ } else if (num_taps == 2) {
+ // 2 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ } else {
+ // 4 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ }
+ const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
+ return sum;
+}
+
+template <int num_taps>
+__m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
+ const __m128i* const v_tap) {
+ __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
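+ // For 8bpp, where kInterRoundBitsHorizontal is 3 and kFilterBits is 7, the
+ // two-stage form is y = (x + 2) >> 2 followed by z = (y + 8) >> 4. The
+ // single shift below reproduces this as z = (x + 2 + 32) >> 6: add
+ // 1 << (kInterRoundBitsHorizontal - 2) and then round-shift by
+ // kFilterBits - 1.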
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm_packus_epi16(sum, sum);
+}
+
+template <int num_taps>
+__m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src,
+ const __m128i* const v_tap) {
+ const __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
+
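+ // Shift by one less than kInterRoundBitsHorizontal because the taps are
+ // halved.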
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, bool is_2d = false, bool is_compound = false>
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int width,
+ const int height, const __m128i* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ // 4 tap filters are never used when width > 4.
+ if (num_taps != 4 && width > 4) {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ if (is_2d || is_compound) {
+ const __m128i v_sum = HorizontalTaps8To16<num_taps>(&src[x], v_tap);
+ if (is_2d) {
+ StoreAligned16(&dest16[x], v_sum);
+ } else {
+ StoreUnaligned16(&dest16[x], v_sum);
+ }
+ } else {
+ const __m128i result = SimpleHorizontalTaps<num_taps>(&src[x], v_tap);
+ StoreLo8(&dest8[x], result);
+ }
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+ return;
+ }
+
+ // Horizontal passes only need to account for |num_taps| 2 and 4 when
+ // |width| <= 4.
+ assert(width <= 4);
+ assert(num_taps <= 4);
+ if (num_taps <= 4) {
+ if (width == 4) {
+ int y = height;
+ do {
+ if (is_2d || is_compound) {
+ const __m128i v_sum = HorizontalTaps8To16<num_taps>(src, v_tap);
+ StoreLo8(dest16, v_sum);
+ } else {
+ const __m128i result = SimpleHorizontalTaps<num_taps>(src, v_tap);
+ Store4(&dest8[0], result);
+ }
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+ return;
+ }
+
+ if (!is_compound) {
+ int y = height;
+ if (is_2d) y -= 1;
+ do {
+ if (is_2d) {
+ const __m128i sum =
+ HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
+ Store4(&dest16[0], sum);
+ dest16 += pred_stride;
+ Store4(&dest16[0], _mm_srli_si128(sum, 8));
+ dest16 += pred_stride;
+ } else {
+ const __m128i sum =
+ SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
+ Store2(dest8, sum);
+ dest8 += pred_stride;
+ Store2(dest8, _mm_srli_si128(sum, 4));
+ dest8 += pred_stride;
+ }
+
+ src += src_stride << 1;
+ y -= 2;
+ } while (y != 0);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
+ if (is_2d) {
+ assert(height % 2 == 1);
+ __m128i sum;
+ const __m128i input = LoadLo8(&src[2]);
+ if (num_taps == 2) {
+ // 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_43 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+ sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ } else {
+ // 02 03 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_32 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+ // 04 05 05 06 06 07 07 08 ...
+ const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+ const __m128i v_madd_32 =
+ _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 =
+ _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_54, v_madd_32);
+ }
+ sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+ Store4(dest16, sum);
+ }
+ }
+ }
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+ const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+ const int width, const int height, const int filter_id,
+ const int filter_index) {
+ assert(filter_id != 0);
+ __m128i v_tap[4];
+ const __m128i v_horizontal_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if (filter_index == 1) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if (filter_index == 0) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else { // 2 tap.
+ SetupTaps<2>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ }
+}
+
+void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* LIBGAV1_RESTRICT prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
+
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
+ alignas(16) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+ memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
+ const int intermediate_height = height + vertical_taps - 1;
+
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+ width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ }
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m128i* const v_tap) {
+ const int next_row = num_taps - 1;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+ assert(width >= 8);
+
+ int x = 0;
+ do {
+ const uint8_t* src_x = src + x;
+ __m128i srcs[8];
+ srcs[0] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadLo8(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ auto* dst8_x = dst8 + x;
+ auto* dst16_x = dst16 + x;
+ int y = height;
+ do {
+ srcs[next_row] = LoadLo8(src_x);
+ src_x += src_stride;
+
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16_x, results);
+ dst16_x += dst_stride;
+ } else {
+ const __m128i results =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ StoreLo8(dst8_x, _mm_packus_epi16(results, results));
+ dst8_x += dst_stride;
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
+}
+
+void ConvolveVertical_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int vertical_filter_index, const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+ if (vertical_taps == 6) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 8) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<8>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 2) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else { // 4 tap
+ SetupTaps<4>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ }
+}
+
+void ConvolveCompoundCopy_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const ptrdiff_t src_stride = reference_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ constexpr int kRoundBitsVertical =
+ kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
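+ // For 8bpp this is 11 - 7 = 4: the copy scales the source pixels up to the
+ // compound intermediate precision with a left shift instead of filtering.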
+ if (width >= 16) {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&src[x]);
+ const __m128i v_src_ext_lo = _mm_cvtepu8_epi16(v_src);
+ const __m128i v_src_ext_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(v_src, 8));
+ const __m128i v_dest_lo =
+ _mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical);
+ const __m128i v_dest_hi =
+ _mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical);
+ StoreUnaligned16(&dest[x], v_dest_lo);
+ StoreUnaligned16(&dest[x + 8], v_dest_hi);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ dest += pred_stride;
+ } while (--y != 0);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const __m128i v_src = LoadLo8(&src[0]);
+ const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src);
+ const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical);
+ StoreUnaligned16(&dest[0], v_dest);
+ src += src_stride;
+ dest += pred_stride;
+ } while (--y != 0);
+ } else { // width == 4
+ int y = height;
+ do {
+ const __m128i v_src0 = Load4(&src[0]);
+ const __m128i v_src1 = Load4(&src[src_stride]);
+ const __m128i v_src = _mm_unpacklo_epi32(v_src0, v_src1);
+ const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src);
+ const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical);
+ StoreLo8(&dest[0], v_dest);
+ StoreHi8(&dest[pred_stride], v_dest);
+ src += src_stride * 2;
+ dest += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+void ConvolveCompoundVertical_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int vertical_filter_index, const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+ if (vertical_taps == 6) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ if (width == 4) {
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (vertical_taps == 8) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+ if (width == 4) {
+ FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (vertical_taps == 2) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+ if (width == 4) {
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else { // 4 tap
+ SetupTaps<4>(&v_filter, taps);
+ if (width == 4) {
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ }
+}
+
+void ConvolveHorizontal_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+}
+
+void ConvolveCompoundHorizontal_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint16_t*>(prediction);
+
+ DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+ src, reference_stride, dest, width, width, height, horizontal_filter_id,
+ filter_index);
+}
+
+void ConvolveCompound2D_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int vertical_filter_index, const int horizontal_filter_id,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ alignas(16) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+ memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is [3, 5].
+ // Similarly for height.
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
+ const int intermediate_height = height + vertical_taps - 1;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* const src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
+
+ const ptrdiff_t dest_stride = width;
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<8, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<6, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<4, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<2, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ }
+}
+
+// Pre-transposed filters.
+template <int filter_index>
+inline void GetHalfSubPixelFilter(__m128i* output) {
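+ // Each row k of a table below holds tap k of the halved filter for all 16
+ // subpel positions, so per-pixel taps can be gathered with a single byte
+ // shuffle per row.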
+ // Filter 0
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel6TapSignedFilterColumns[6][16] =
+ {{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+ {0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3},
+ {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+ // Filter 1
+ alignas(16) static constexpr int8_t
+ kHalfSubPixel6TapMixedSignedFilterColumns[6][16] = {
+ {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+ {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14},
+ {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+ // Filter 2
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel8TapSignedFilterColumns[8][16] =
+ {{0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, 0},
+ {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+ {0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3, -1},
+ {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+ {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+ {0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6, -3},
+ {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+ {0, 0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}};
+ // Filter 3
+ alignas(16) static constexpr uint8_t kHalfSubPixel2TapFilterColumns[2][16] = {
+ {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+ {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+ // Filter 4
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel4TapSignedFilterColumns[4][16] =
+ {{0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}};
+ // Filter 5
+ alignas(
+ 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+ {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+ switch (filter_index) {
+ case 0:
+ output[0] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[5]);
+ break;
+ case 1:
+ // The term "mixed" refers to the fact that the outer taps have a mix of
+ // negative and positive values.
+ output[0] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[5]);
+ break;
+ case 2:
+ output[0] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[5]);
+ output[6] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[6]);
+ output[7] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[7]);
+ break;
+ case 3:
+ output[0] = LoadAligned16(kHalfSubPixel2TapFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel2TapFilterColumns[1]);
+ break;
+ case 4:
+ output[0] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[3]);
+ break;
+ default:
+ assert(filter_index == 5);
+ output[0] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[0]);
+ output[1] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[1]);
+ output[2] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[2]);
+ output[3] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[3]);
+ break;
+ }
+}
+
+// There are many opportunities for overreading in scaled convolve, because
+// the range of starting points for filter windows is anywhere from 0 to 16
+// for 8 destination pixels, and the window sizes range from 2 to 8. To
+// accommodate this range concisely, we use |grade_x| to mean the most steps
+// in src that can be traversed in a single |step_x| increment, i.e. 1 or 2.
+// More importantly, |grade_x| answers the question "how many vector loads are
+// needed to cover the source values?"
+// When |grade_x| == 1, the maximum number of source values needed is 8 separate
+// starting positions plus 7 more to cover taps, all fitting into 16 bytes.
+// When |grade_x| > 1, we are guaranteed to exceed 8 whole steps in src for
+// every 8 |step_x| increments, on top of 8 possible taps. The first load covers
+// the starting sources for each kernel, while the final load covers the taps.
+// Since the offset value of src_x cannot exceed 8 and |num_taps| does not
+// exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of
+// |step_x|.
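+// For example, with |num_taps| == 8 and a |step_x| that covers two source
+// pixels per destination pixel, the 8 outputs span 16 whole source steps plus
+// 7 tap pixels (23 bytes), which is why the |grade_x| > 1 path adds an 8-byte
+// load on top of the initial 16-byte load.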
+template <int num_taps, int grade_x>
+inline void PrepareSourceVectors(const uint8_t* LIBGAV1_RESTRICT src,
+ const __m128i src_indices,
+ __m128i* const source /*[num_taps >> 1]*/) {
+ // |used_bytes| is only computed in msan builds. Mask away unused bytes for
+ // msan because it incorrectly models the outcome of the shuffles in some
+ // cases. This has not been reproduced out of context.
+ const int used_bytes = _mm_extract_epi8(src_indices, 15) + 1 + num_taps - 2;
+ const __m128i src_vals = LoadUnaligned16Msan(src, 16 - used_bytes);
+ source[0] = _mm_shuffle_epi8(src_vals, src_indices);
+ if (grade_x == 1) {
+ if (num_taps > 2) {
+ source[1] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 2), src_indices);
+ }
+ if (num_taps > 4) {
+ source[2] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 4), src_indices);
+ }
+ if (num_taps > 6) {
+ source[3] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 6), src_indices);
+ }
+ } else {
+ assert(grade_x > 1);
+ assert(num_taps != 4);
+ // grade_x > 1 also means width >= 8 && num_taps != 4
+ const __m128i src_vals_ext = LoadLo8Msan(src + 16, 24 - used_bytes);
+ if (num_taps > 2) {
+ source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2),
+ src_indices);
+ source[2] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 4),
+ src_indices);
+ }
+ if (num_taps > 6) {
+ source[3] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 6),
+ src_indices);
+ }
+ }
+}
+
+template <int num_taps>
+inline void PrepareHorizontalTaps(const __m128i subpel_indices,
+ const __m128i* filter_taps,
+ __m128i* out_taps) {
+ const __m128i scale_index_offsets =
+ _mm_srli_epi16(subpel_indices, kFilterIndexShift);
+ const __m128i filter_index_mask = _mm_set1_epi8(kSubPixelMask);
+ const __m128i filter_indices =
+ _mm_and_si128(_mm_packus_epi16(scale_index_offsets, scale_index_offsets),
+ filter_index_mask);
+ // Line up taps for maddubs_epi16.
+ // The unpack is also assumed to be lighter than shift+alignr.
+ for (int k = 0; k < (num_taps >> 1); ++k) {
+ const __m128i taps0 = _mm_shuffle_epi8(filter_taps[2 * k], filter_indices);
+ const __m128i taps1 =
+ _mm_shuffle_epi8(filter_taps[2 * k + 1], filter_indices);
+ out_taps[k] = _mm_unpacklo_epi8(taps0, taps1);
+ }
+}
+
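+// Returns byte shuffle indices in which each pair of lanes holds a source
+// position i and i + 1, selecting the adjacent source bytes consumed by the
+// maddubs tap pairing.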
+inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) {
+ const __m128i src_indices16 =
+ _mm_srli_epi16(subpel_indices, kScaleSubPixelBits);
+ const __m128i src_indices = _mm_packus_epi16(src_indices16, src_indices16);
+ return _mm_unpacklo_epi8(src_indices,
+ _mm_add_epi8(src_indices, _mm_set1_epi8(1)));
+}
+
+template <int grade_x, int filter_index, int num_taps>
+inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t src_stride, int width,
+ int subpixel_x, int step_x,
+ int intermediate_height,
+ int16_t* LIBGAV1_RESTRICT intermediate) {
+ // Account for the 0-taps that precede the |num_taps| nonzero taps.
+ const int kernel_offset = (8 - num_taps) >> 1;
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ __m128i filter_taps[num_taps];
+ GetHalfSubPixelFilter<filter_index>(filter_taps);
+ const __m128i index_steps =
+ _mm_mullo_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
+ _mm_set1_epi16(static_cast<int16_t>(step_x)));
+
+ __m128i taps[num_taps >> 1];
+ __m128i source[num_taps >> 1];
+ int p = subpixel_x;
+ // Case when width <= 4 is possible: the 4-tap filters (indices 4 and 5)
+ // always take this path; the 2-tap filter (index 3) takes it only when the
+ // block is narrow.
+ if (filter_index >= 3) {
+ if (filter_index > 3 || width <= 4) {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+ const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+ PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+ const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+ int y = intermediate_height;
+ do {
+ // Load and line up source values with the taps. Width 4 means no need
+ // to load extended source.
+ PrepareSourceVectors<num_taps, /*grade_x=*/1>(src_x, packed_indices,
+ source);
+
+ StoreLo8(intermediate, RightShiftWithRounding_S16(
+ SumOnePassTaps<num_taps>(source, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (--y != 0);
+ return;
+ }
+ }
+
+ // |width| >= 8
+ int16_t* intermediate_x = intermediate;
+ int x = 0;
+ do {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+ const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+ PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+ const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+ int y = intermediate_height;
+ do {
+ // For each x, a lane of src_k[k] contains src_x[k].
+ PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
+
+ // Shift by one less because the taps are halved.
+ StoreAligned16(intermediate_x, RightShiftWithRounding_S16(
+ SumOnePassTaps<num_taps>(source, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (--y != 0);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+template <int num_taps>
+inline void PrepareVerticalTaps(const int8_t* LIBGAV1_RESTRICT taps,
+ __m128i* output) {
+ // Avoid overreading the filter due to starting at kernel_offset.
+ // The only danger of overread is in the final filter, which has 4 taps.
+ const __m128i filter =
+ _mm_cvtepi8_epi16((num_taps > 4) ? LoadLo8(taps) : Load4(taps));
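+ // Broadcast each 16-bit tap pair (2k, 2k + 1) to all lanes of output[k] to
+ // match the pairing performed by _mm_madd_epi16 in the vertical sums.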
+ output[0] = _mm_shuffle_epi32(filter, 0);
+ if (num_taps > 2) {
+ output[1] = _mm_shuffle_epi32(filter, 0x55);
+ }
+ if (num_taps > 4) {
+ output[2] = _mm_shuffle_epi32(filter, 0xAA);
+ }
+ if (num_taps > 6) {
+ output[3] = _mm_shuffle_epi32(filter, 0xFF);
+ }
+}
+
+// Process eight 16 bit inputs and output eight 16 bit values.
+template <int num_taps, bool is_compound>
+inline __m128i Sum2DVerticalTaps(const __m128i* const src,
+ const __m128i* taps) {
+ const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps[0]);
+ const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps[0]);
+ if (num_taps > 2) {
+ const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps[1]));
+ const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps[1]));
+ }
+ if (num_taps > 4) {
+ const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps[2]));
+ const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps[2]));
+ }
+ if (num_taps > 6) {
+ const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps[3]));
+ const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps[3]));
+ }
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// The bottom half of each src[k] is the source for one filter, and the top
+// half is the source for the other filter, which produces the next
+// destination row.
+template <int num_taps, bool is_compound>
+__m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo,
+ const __m128i* taps_hi) {
+ const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps_lo[0]);
+ const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps_hi[0]);
+ if (num_taps > 2) {
+ const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps_lo[1]));
+ const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps_hi[1]));
+ }
+ if (num_taps > 4) {
+ const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps_lo[2]));
+ const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps_hi[2]));
+ }
+ if (num_taps > 6) {
+ const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps_lo[3]));
+ const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps_hi[3]));
+ }
+
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// |width_class| is 2, 4, or 8, according to the Store function that should be
+// used.
+template <int num_taps, int width_class, bool is_compound>
+inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT src,
+ const int intermediate_height,
+ const int width, const int subpixel_y,
+ const int filter_index, const int step_y,
+ const int height, void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+ constexpr int kernel_offset = (8 - num_taps) / 2;
+ const int16_t* src_y = src;
+ // |dest| is 16-bit in compound mode, Pixel otherwise.
+ auto* dest16_y = static_cast<uint16_t*>(dest);
+ auto* dest_y = static_cast<uint8_t*>(dest);
+ __m128i s[num_taps];
+
+ int p = subpixel_y & 1023;
+ int y = height;
+ if (width_class <= 4) {
+ __m128i filter_taps_lo[num_taps >> 1];
+ __m128i filter_taps_hi[num_taps >> 1];
+ do { // y > 0
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadLo8(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter0 =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter0, filter_taps_lo);
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadHi8(s[i], src_y + i * src_stride);
+ }
+ filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter1 =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter1, filter_taps_hi);
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+ const __m128i sums = Sum2DVerticalTaps4x2<num_taps, is_compound>(
+ s, filter_taps_lo, filter_taps_hi);
+ if (is_compound) {
+ assert(width_class > 2);
+ StoreLo8(dest16_y, sums);
+ dest16_y += dest_stride;
+ StoreHi8(dest16_y, sums);
+ dest16_y += dest_stride;
+ } else {
+ const __m128i result = _mm_packus_epi16(sums, sums);
+ if (width_class == 2) {
+ Store2(dest_y, result);
+ dest_y += dest_stride;
+ Store2(dest_y, _mm_srli_si128(result, 4));
+ } else {
+ Store4(dest_y, result);
+ dest_y += dest_stride;
+ Store4(dest_y, _mm_srli_si128(result, 4));
+ }
+ dest_y += dest_stride;
+ }
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ // |width_class| >= 8
+ __m128i filter_taps[num_taps >> 1];
+ int x = 0;
+ do { // x < width
+ auto* dest_y = static_cast<uint8_t*>(dest) + x;
+ auto* dest16_y = static_cast<uint16_t*>(dest) + x;
+ int p = subpixel_y & 1023;
+ int y = height;
+ do { // y > 0
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter, filter_taps);
+
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadUnaligned16(src_y + i * src_stride);
+ }
+
+ const __m128i sums =
+ Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps);
+ if (is_compound) {
+ StoreUnaligned16(dest16_y, sums);
+ } else {
+ StoreLo8(dest_y, _mm_packus_epi16(sums, sums));
+ }
+ p += step_y;
+ dest_y += dest_stride;
+ dest16_y += dest_stride;
+ } while (--y != 0);
+ src += kIntermediateStride * intermediate_height;
+ x += 8;
+ } while (x < width);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int subpixel_x, const int subpixel_y,
+ const int step_x, const int step_y, const int width,
+ const int height, void* LIBGAV1_RESTRICT prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ assert(step_x <= 2048);
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ alignas(16) int16_t
+ intermediate_result[kIntermediateAllocWidth *
+ (2 * kIntermediateAllocWidth + kSubPixelTaps)];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+ memset(intermediate_result, 0x44, sizeof(intermediate_result));
+#endif
+ const int num_vert_taps = dsp::GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ num_vert_taps;
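+ // That is, ceil(((height - 1) * step_y) / 2^kScaleSubPixelBits) plus the
+ // number of rows of vertical filter support.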
+
+ // Horizontal filter.
+ // Filter types used for |width| <= 4 differ from those used for
+ // |width| > 4. When |width| > 4 the valid filter index range is [0, 3];
+ // when |width| <= 4 it is [3, 5]. The same holds for |height| in the
+ // vertical pass.
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+ src += vert_kernel_offset * src_stride;
+
+ // Derive the maximum value of |step_x| at which all source values fit in
+ // one 16-byte load: the final index must satisfy
+ // src_x + |num_taps| - 1 < 16.
+ // step_x * 7 is the final base sub-pixel index for the shuffle mask for
+ // filter inputs in each iteration on large blocks. When step_x is large, a
+ // second register and alignr are required to gather all filter inputs.
+ // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap.
+ const int num_horiz_taps = dsp::GetNumTapsInFilter(horiz_filter_index);
+ const int kernel_start_ceiling = 16 - num_horiz_taps;
+ // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+ // (step_x * 7) >> kScaleSubPixelBits < single load limit
+ const int grade_x_threshold =
+ (kernel_start_ceiling << kScaleSubPixelBits) / 7;
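+ // For example, assuming kScaleSubPixelBits == 10: with an 8-tap filter,
+ // kernel_start_ceiling is 8 and grade_x_threshold is (8 << 10) / 7 = 1170,
+ // so any larger |step_x| takes the grade_x == 2 (two-load) path.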
+ switch (horiz_filter_index) {
+ case 0:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 0, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 0, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 1:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 1, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+
+ } else {
+ ConvolveHorizontalScale<1, 1, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 2:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 2, 8>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 2, 8>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 3:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 3, 2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 3, 2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 4:
+ assert(width <= 4);
+ ConvolveHorizontalScale<1, 4, 4>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ break;
+ default:
+ assert(horiz_filter_index == 5);
+ assert(width <= 4);
+ ConvolveHorizontalScale<1, 5, 4>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+
+ // Vertical filter.
+ intermediate = intermediate_result;
+ switch (vert_filter_index) {
+ case 0:
+ case 1:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<6, 2, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<6, 4, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 8, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
+ }
+ break;
+ case 2:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<8, 2, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<8, 4, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 8, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
+ }
+ break;
+ case 3:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<2, 2, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<2, 4, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 8, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
+ }
+ break;
+ default:
+ assert(vert_filter_index == 4 || vert_filter_index == 5);
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<4, 2, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<4, 4, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 8, is_compound>(
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
+ }
+ }
+}
+
+inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+ uint8_t* LIBGAV1_RESTRICT dst) {
+ const __m128i left = LoadUnaligned16(src);
+ const __m128i right = LoadUnaligned16(src + 1);
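+ // _mm_avg_epu8 computes (left + right + 1) >> 1 per byte, which is exactly
+ // the rounded half-pel horizontal average.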
+ StoreUnaligned16(dst, _mm_avg_epu8(left, right));
+}
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+
+ int y = height;
+ do {
+ HalfAddHorizontal(src, dst);
+ if (width >= 32) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width >= 64) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width == 128) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyHorizontal_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const __m128i left = LoadLo8(src);
+ const __m128i right = LoadLo8(src + 1);
+ StoreLo8(dest, _mm_avg_epu8(left, right));
+
+ src += reference_stride;
+ dest += pred_stride;
+ } while (--y != 0);
+ } else if (width == 4) {
+ int y = height;
+ do {
+ __m128i left = Load4(src);
+ __m128i right = Load4(src + 1);
+ src += reference_stride;
+ left = _mm_unpacklo_epi32(left, Load4(src));
+ right = _mm_unpacklo_epi32(right, Load4(src + 1));
+ src += reference_stride;
+
+ const __m128i result = _mm_avg_epu8(left, right);
+
+ Store4(dest, result);
+ dest += pred_stride;
+ Store4(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+ y -= 2;
+ } while (y != 0);
+ } else {
+ assert(width == 2);
+ __m128i left = _mm_setzero_si128();
+ __m128i right = _mm_setzero_si128();
+ int y = height;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<1>(src, left);
+ right = Load2<1>(src + 1, right);
+ src += reference_stride;
+
+ const __m128i result = _mm_avg_epu8(left, right);
+
+ Store2(dest, result);
+ dest += pred_stride;
+ Store2(dest, _mm_srli_si128(result, 2));
+ dest += pred_stride;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride, const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+ __m128i row[8], below[8];
+
+ row[0] = LoadUnaligned16(src);
+ if (width >= 32) {
+ src += 16;
+ row[1] = LoadUnaligned16(src);
+ if (width >= 64) {
+ src += 16;
+ row[2] = LoadUnaligned16(src);
+ src += 16;
+ row[3] = LoadUnaligned16(src);
+ if (width == 128) {
+ src += 16;
+ row[4] = LoadUnaligned16(src);
+ src += 16;
+ row[5] = LoadUnaligned16(src);
+ src += 16;
+ row[6] = LoadUnaligned16(src);
+ src += 16;
+ row[7] = LoadUnaligned16(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = height;
+ do {
+ below[0] = LoadUnaligned16(src);
+ if (width >= 32) {
+ src += 16;
+ below[1] = LoadUnaligned16(src);
+ if (width >= 64) {
+ src += 16;
+ below[2] = LoadUnaligned16(src);
+ src += 16;
+ below[3] = LoadUnaligned16(src);
+ if (width == 128) {
+ src += 16;
+ below[4] = LoadUnaligned16(src);
+ src += 16;
+ below[5] = LoadUnaligned16(src);
+ src += 16;
+ below[6] = LoadUnaligned16(src);
+ src += 16;
+ below[7] = LoadUnaligned16(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ StoreUnaligned16(dst, _mm_avg_epu8(row[0], below[0]));
+ row[0] = below[0];
+ if (width >= 32) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[1], below[1]));
+ row[1] = below[1];
+ if (width >= 64) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[2], below[2]));
+ row[2] = below[2];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[3], below[3]));
+ row[3] = below[3];
+ if (width == 128) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[4], below[4]));
+ row[4] = below[4];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[5], below[5]));
+ row[5] = below[5];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[6], below[6]));
+ row[6] = below[6];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[7], below[7]));
+ row[7] = below[7];
+ }
+ }
+ }
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyVertical_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ __m128i row, below;
+ row = LoadLo8(src);
+ src += reference_stride;
+
+ int y = height;
+ do {
+ below = LoadLo8(src);
+ src += reference_stride;
+
+ StoreLo8(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ } else if (width == 4) {
+ __m128i row = Load4(src);
+ src += reference_stride;
+
+ int y = height;
+ do {
+ __m128i below = Load4(src);
+ src += reference_stride;
+
+ Store4(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ } else {
+ assert(width == 2);
+ __m128i row = Load2(src);
+ __m128i below = _mm_setzero_si128();
+ src += reference_stride;
+
+ int y = height;
+ do {
+ below = Load2<0>(src, below);
+ src += reference_stride;
+
+ Store2(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ }
+}
+
+// Load then add two uint8_t vectors. Return the uint16_t vector result.
+inline __m128i LoadU8AndAddLong(const uint8_t* LIBGAV1_RESTRICT src,
+ const uint8_t* LIBGAV1_RESTRICT src1) {
+ const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1));
+ return _mm_add_epi16(a, b);
+}
+
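+// Scalar equivalent per 16-bit lane, for clarity:
+//   out = static_cast<uint8_t>((v0 + v1 + 2) >> 2);
+// i.e. the rounded average of the four pixels accumulated into |v0| and |v1|.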
+inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) {
+ const __m128i a = _mm_add_epi16(v0, v1);
+ const __m128i b = _mm_srli_epi16(a, 1);
+ // Use avg here to shift right by 1 with round.
+ const __m128i c = _mm_avg_epu16(b, _mm_setzero_si128());
+ return _mm_packus_epi16(c, c);
+}
+
+template <int width>
+inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride, const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
+ __m128i row[16];
+ row[0] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 16) {
+ src += 8;
+ row[1] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 32) {
+ src += 8;
+ row[2] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[3] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 64) {
+ src += 8;
+ row[4] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[5] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[6] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[7] = LoadU8AndAddLong(src, src + 1);
+ if (width == 128) {
+ src += 8;
+ row[8] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[9] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[10] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[11] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[12] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[13] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[14] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[15] = LoadU8AndAddLong(src, src + 1);
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = height;
+ do {
+ const __m128i below_0 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[0], below_0));
+ row[0] = below_0;
+ if (width >= 16) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_1 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[1], below_1));
+ row[1] = below_1;
+ if (width >= 32) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_2 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[2], below_2));
+ row[2] = below_2;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_3 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[3], below_3));
+ row[3] = below_3;
+ if (width >= 64) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_4 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[4], below_4));
+ row[4] = below_4;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_5 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[5], below_5));
+ row[5] = below_5;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_6 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[6], below_6));
+ row[6] = below_6;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_7 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[7], below_7));
+ row[7] = below_7;
+ if (width == 128) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_8 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[8], below_8));
+ row[8] = below_8;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_9 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[9], below_9));
+ row[9] = below_9;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_10 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[10], below_10));
+ row[10] = below_10;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_11 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[11], below_11));
+ row[11] = below_11;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_12 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[12], below_12));
+ row[12] = below_12;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_13 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[13], below_13));
+ row[13] = below_13;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_14 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[14], below_14));
+ row[14] = below_14;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_15 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[15], below_15));
+ row[15] = below_15;
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopy2D_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+ // Note: vertical access of |height| + 1 rows is allowed. Because this
+ // function is only used for the u/v planes of intra block copy, such
+ // access is guaranteed to be within the prediction block.
+
+ if (width == 128) {
+ IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 8) {
+ IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 4) {
+ __m128i left = _mm_cvtepu8_epi16(Load4(src));
+ __m128i right = _mm_cvtepu8_epi16(Load4(src + 1));
+ src += reference_stride;
+
+ __m128i row = _mm_add_epi16(left, right);
+
+ int y = height;
+ do {
+ left = Load4(src);
+ right = Load4(src + 1);
+ src += reference_stride;
+ left = _mm_unpacklo_epi32(left, Load4(src));
+ right = _mm_unpacklo_epi32(right, Load4(src + 1));
+ src += reference_stride;
+
+ const __m128i below =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+ const __m128i result =
+ AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
+
+ Store4(dest, result);
+ dest += pred_stride;
+ Store4(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+
+ row = _mm_srli_si128(below, 8);
+ y -= 2;
+ } while (y != 0);
+ } else {
+ __m128i left = Load2(src);
+ __m128i right = Load2(src + 1);
+ src += reference_stride;
+
+ __m128i row =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+
+ int y = height;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<2>(src, left);
+ right = Load2<2>(src + 1, right);
+ src += reference_stride;
+
+ const __m128i below =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+ const __m128i result =
+ AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
+
+ Store2(dest, result);
+ dest += pred_stride;
+ Store2(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+
+ row = _mm_srli_si128(below, 8);
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
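+ // Dimension order, as used by the assignments below: [is_intra_block_copy]
+ // [is_compound][has_vertical_filter][has_horizontal_filter] (see the table
+ // declaration in src/dsp/dsp.h).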
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_SSE4_1;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
+ dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4_1;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
+
+ dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_SSE4_1;
+ dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_SSE4_1;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_SSE4_1;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_SSE4_1<false>;
+ dsp->convolve_scale[1] = ConvolveScale2D_SSE4_1<true>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void ConvolveInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve, see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Common 128-bit functions used by the sse4/avx2 convolve implementations.
+// This file is included inside an anonymous namespace in the files where
+// these functions are needed.
+
+#include "src/dsp/convolve.inc"
+
+// This version checks for the special cases when filter_index == 1.
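+// Example values, derived from the branches below:
+//   GetNumTapsInFilter(2, /*filter_id=*/4) == 8
+//   GetNumTapsInFilter(1, /*filter_id=*/8) == 6
+//   GetNumTapsInFilter(1, /*filter_id=*/4) == 4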
+int GetNumTapsInFilter(const int filter_index, const int filter_id) {
+ if (filter_index == 0) {
+ // Despite the name, kInterpolationFilterEightTap (filter index 0) uses
+ // only 6 taps.
+ return 6;
+ }
+
+ if (filter_index == 1) {
+ // Despite the name, kInterpolationFilterEightTapSmooth (filter index 1)
+ // uses 6 taps only for the |filter_id| values listed below.
+ if (((filter_id == 1) | (filter_id == 15) | (filter_id == 7) |
+ (filter_id == 8) | (filter_id == 9)) != 0) {
+ return 6;
+ }
+ // When |filter_index| == 1, the |filter_id| values not listed above map to
+ // 4 tap filters.
+ return 4;
+ }
+
+ if (filter_index == 2) {
+ // kInterpolationFilterEightTapSharp
+ return 8;
+ }
+
+ if (filter_index == 3) {
+ // kInterpolationFilterBilinear
+ return 2;
+ }
+
+ assert(filter_index > 3);
+ // For small sizes (width/height <= 4) the large filters are replaced with 4
+ // tap options.
+ // If the original filters were |kInterpolationFilterEightTap| or
+ // |kInterpolationFilterEightTapSharp| then it becomes
+ // |kInterpolationFilterSwitchable|.
+ // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+ // tap filter.
+ return 4;
+}
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from overflowing int16_t.
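+// For example, every AV1 filter sums to 1 << kFilterBits == 128; the sharpest
+// filters have tap magnitudes summing to roughly 256, so with halved taps the
+// worst case is near 255 * 128 = 32640, which still fits in int16_t.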
+template <int num_taps>
+__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
+ __m128i sum;
+ if (num_taps == 6) {
+ // 6 taps.
+ const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1
+ const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3
+ const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5
+ sum = _mm_add_epi16(v_madd_21, v_madd_43);
+ sum = _mm_add_epi16(sum, v_madd_65);
+ } else if (num_taps == 8) {
+ // 8 taps.
+ const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0
+ const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]); // k5k4
+ const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]); // k7k6
+ const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
+ const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
+ sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
+ } else if (num_taps == 2) {
+ // 2 taps.
+ sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3
+ } else {
+ // 4 taps.
+ const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_32, v_madd_54);
+ }
+ return sum;
+}
+
+template <int num_taps>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
+
+ if (num_taps == 2) {
+ // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
+ const __m128i v_src_43 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
+ const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ return v_sum_43;
+ }
+
+ // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
+ const __m128i v_src_32 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302));
+ // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx
+ const __m128i v_src_54 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(static_cast<int>(0x800f0f0e), 0x0e0d0d0c,
+ static_cast<int>(0x80070706), 0x06050504));
+ const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+ return v_sum_5432;
+}
+
+template <int num_taps>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
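+ // With the 8-bpp constants kInterRoundBitsHorizontal == 3 and
+ // kFilterBits == 7, this adds 1 << 1 = 2 and then performs a single
+ // rounded shift by 6.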
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm_packus_epi16(sum, sum);
+}
+
+template <int num_taps>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ const __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+ __m128i* v_tap) {
+ if (num_taps == 8) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+ }
+ } else if (num_taps == 6) {
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1
+ v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ }
+ } else if (num_taps == 4) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ }
+ } else { // num_taps == 2
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ }
+ }
+}
+
+template <int num_taps, bool is_compound>
+__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
+ const __m128i* const taps) {
+ __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
+ __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
+ if (num_taps >= 4) {
+ __m128i madd_lo =
+ _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
+ __m128i madd_hi =
+ _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ if (num_taps >= 6) {
+ madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
+ madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ if (num_taps == 8) {
+ madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
+ madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ }
+ }
+ }
+
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m128i* const taps) {
+ assert(width >= 8);
+ constexpr int next_row = num_taps - 1;
+ // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+ const ptrdiff_t src_stride = width;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int x = 0;
+ do {
+ __m128i srcs[8];
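+ // |srcs| is a sliding window over the intermediate rows: each iteration of
+ // the y loop loads one new row into srcs[next_row] and shifts the older
+ // rows down afterwards.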
+ const uint16_t* src_x = src + x;
+ srcs[0] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadAligned16(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ auto* dst8_x = dst8 + x;
+ auto* dst16_x = dst16 + x;
+ int y = height;
+ do {
+ srcs[next_row] = LoadAligned16(src_x);
+ src_x += src_stride;
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned16(dst16_x, sum);
+ dst16_x += dst_stride;
+ } else {
+ StoreLo8(dst8_x, _mm_packus_epi16(sum, sum));
+ dst8_x += dst_stride;
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const __m128i* const taps) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ __m128i srcs[9];
+ srcs[0] = LoadAligned16(src);
+ src += 8;
+ if (num_taps >= 4) {
+ srcs[2] = LoadAligned16(src);
+ src += 8;
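+ // srcs[0] holds rows 0-1 and srcs[2] holds rows 2-3, so srcs[1] pairs
+ // row 1 (the high half of srcs[0]) with row 2 (the low half of srcs[2]).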
+ srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
+ if (num_taps >= 6) {
+ srcs[4] = LoadAligned16(src);
+ src += 8;
+ srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
+ if (num_taps == 8) {
+ srcs[6] = LoadAligned16(src);
+ src += 8;
+ srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[num_taps] = LoadAligned16(src);
+ src += 8;
+ srcs[num_taps - 1] = _mm_unpacklo_epi64(
+ _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned16(dst16, sum);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results = _mm_packus_epi16(sum, sum);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+}
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const __m128i* const taps) {
+ constexpr int next_row = (num_taps < 6) ? 4 : 8;
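+ // Each aligned 16-byte load covers four 2-wide rows (4 bytes per row) of
+ // the intermediate buffer; _mm_alignr_epi8 by 4 * k bytes below forms the
+ // window starting k rows later.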
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ __m128i srcs[9];
+ srcs[0] = LoadAligned16(src);
+ src += 8;
+ if (num_taps >= 6) {
+ srcs[4] = LoadAligned16(src);
+ src += 8;
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ if (num_taps == 8) {
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[next_row] = LoadAligned16(src);
+ src += 8;
+ if (num_taps == 2) {
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ } else if (num_taps == 4) {
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ } else if (num_taps == 6) {
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+ } else if (num_taps == 8) {
+ srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+ srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
+ srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
+ }
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+ const __m128i results = _mm_packus_epi16(sum, sum);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ // When |height| <= 4 the taps are restricted to the 2- and 4-tap variants,
+ // so this check is unnecessary when |height| > 4.
+ if (num_taps <= 4 && height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ if (num_taps == 6) {
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ } else if (num_taps == 8) {
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ }
+
+ y -= 4;
+ } while (y != 0);
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
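+// With the 8-bpp constant kInterRoundBitsHorizontal == 3 and halved taps,
+// this is a rounded shift by 2.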
+__m128i Compound1DShift(const __m128i sum) {
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps>
+__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
+ __m128i v_src[4];
+
+ if (num_taps == 6) {
+ // 6 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+ } else if (num_taps == 8) {
+ // 8 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+ v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
+ } else if (num_taps == 2) {
+ // 2 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ } else {
+ // 4 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ }
+ const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
+ return sum;
+}
+
+template <int num_taps, bool is_compound = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const __m128i* const v_tap) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ __m128i srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+
+ int y = height;
+ do {
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ y -= 2;
+ } while (y != 0);
+ } else if (num_taps == 4) {
+ srcs[4] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+ int y = height;
+ do {
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ y -= 2;
+ } while (y != 0);
+ } else if (num_taps == 6) {
+ srcs[6] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+ int y = height;
+ do {
+ // 50 51 52 53
+ const __m128i c = Load4(src);
+ // 40 41 42 43 50 51 52 53
+ srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+ src += src_stride;
+ // 60 61 62 63
+ srcs[6] = Load4(src);
+ src += src_stride;
+ // 50 51 52 53 60 61 62 63
+ srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ y -= 2;
+ } while (y != 0);
+ } else if (num_taps == 8) {
+ srcs[8] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+ // 50 51 52 53
+ const __m128i c = Load4(src);
+ // 40 41 42 43 50 51 52 53
+ srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+ src += src_stride;
+ // 60 61 62 63
+ srcs[6] = Load4(src);
+ src += src_stride;
+ // 50 51 52 53 60 61 62 63
+ srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+ int y = height;
+ do {
+ // 70 71 72 73
+ const __m128i d = Load4(src);
+ // 60 61 62 63 70 71 72 73
+ srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
+ src += src_stride;
+ // 80 81 82 83
+ srcs[8] = Load4(src);
+ src += src_stride;
+ // 70 71 72 73 80 81 82 83
+ srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
+
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+template <int num_taps, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const __m128i* const v_tap) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ __m128i srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = _mm_setzero_si128();
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+
+ int y = height;
+ do {
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[2] = Load2<0>(src, srcs[2]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41
+ const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_2, 2);
+ // This uses srcs[0]..srcs[1].
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[2];
+ y -= 4;
+ } while (y != 0);
+ } else if (num_taps == 4) {
+ srcs[4] = _mm_setzero_si128();
+
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+
+ int y = height;
+ do {
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2<0>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+ // This uses srcs[0]..srcs[3].
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ y -= 4;
+ } while (y != 0);
+ } else if (num_taps == 6) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = _mm_setzero_si128();
+
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41
+ const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
+
+ int y = height;
+ do {
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61 70 71
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ // 80 81
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
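+ // 40 41 50 51 60 61 70 71 80 81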
+ const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+ // 50 51 60 61 70 71 80 81
+ srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+
+ // This uses srcs[0]..srcs[5].
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ y -= 4;
+ } while (y != 0);
+ } else if (num_taps == 8) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = _mm_setzero_si128();
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2(src);
+ src += src_stride;
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+ int y = height;
+ do {
+ // 40 41 50 51 60 61 70 71
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ // 80 81
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ // 80 81 90 91
+ srcs[8] = Load2<1>(src, srcs[8]);
+ src += src_stride;
+ // 80 81 90 91 a0 a1
+ srcs[8] = Load2<2>(src, srcs[8]);
+ src += src_stride;
+
+ // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
+ const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+ // 50 51 60 61 70 71 80 81
+ srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+ // 60 61 70 71 80 81 90 91
+ srcs[6] = _mm_srli_si128(srcs_4_8, 4);
+ // 70 71 80 81 90 91 a0 a1
+ srcs[7] = _mm_srli_si128(srcs_4_8, 6);
+
+ // This uses srcs[0]..srcs[7].
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ y -= 4;
+ } while (y != 0);
+ }
+}
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+constexpr int kInterPostRoundBit = 4;
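+// kInterPostRhsAdjust == 1 << 11: used with _mm_mulhrs_epi16, it implements a
+// rounded right shift by kInterPostRoundBit, i.e. (x + 8) >> 4.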
+constexpr int kInterPostRhsAdjust = 1 << (16 - kInterPostRoundBit - 1);
+
+inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
+ const __m128i& pred1,
+ const __m128i& weight) {
+ // Given: p0, p1 in range [-5132,9212], with w0 + w1 = 16.
+ // Output: (p0 * w0 + p1 * w1 + 128(=rounding bit)) >>
+ // 8(=kInterPostRoundBit + 4)
+ // The formula is manipulated to avoid lengthening to 32 bits.
+ // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1
+ // = (p0 - p1) * w0 + 16 * p1
+ // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808.
+ const __m128i diff = _mm_slli_epi16(_mm_sub_epi16(pred0, pred1), 1);
+ // ((p0 - p1) * (w0 << 12)) >> 16 == (p0 - p1) * w0 >> 4
+ const __m128i weighted_diff = _mm_mulhi_epi16(diff, weight);
+ // ((p0 - p1) * w0 >> 4) + p1
+ const __m128i upscaled_average = _mm_add_epi16(weighted_diff, pred1);
+ // (x << 11) >> 15 == x >> 4
+ const __m128i right_shift_prep = _mm_set1_epi16(kInterPostRhsAdjust);
+ // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4
+ return _mm_mulhrs_epi16(upscaled_average, right_shift_prep);
+}
+
+template <int height>
+inline void DistanceWeightedBlend4xH_SSE4_1(
+ const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ // Upscale the weight for mulhi.
+ const __m128i weights = _mm_set1_epi16(weight << 11);
+
+ for (int y = 0; y < height; y += 4) {
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights);
+
+ const __m128i result_pixels = _mm_packus_epi16(res0, res1);
+ Store4(dst, result_pixels);
+ dst += dest_stride;
+ const int result_1 = _mm_extract_epi32(result_pixels, 1);
+ memcpy(dst, &result_1, sizeof(result_1));
+ dst += dest_stride;
+ const int result_2 = _mm_extract_epi32(result_pixels, 2);
+ memcpy(dst, &result_2, sizeof(result_2));
+ dst += dest_stride;
+ const int result_3 = _mm_extract_epi32(result_pixels, 3);
+ memcpy(dst, &result_3, sizeof(result_3));
+ dst += dest_stride;
+ }
+}
+
+template <int height>
+inline void DistanceWeightedBlend8xH_SSE4_1(
+ const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ // Upscale the weight for mulhi.
+ const __m128i weights = _mm_set1_epi16(weight << 11);
+
+ for (int y = 0; y < height; y += 2) {
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights);
+
+ const __m128i result_pixels = _mm_packus_epi16(res0, res1);
+ StoreLo8(dst, result_pixels);
+ dst += dest_stride;
+ StoreHi8(dst, result_pixels);
+ dst += dest_stride;
+ }
+}
+
+inline void DistanceWeightedBlendLarge_SSE4_1(
+ const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+ const int width, const int height, void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ // Upscale the weight for mulhi.
+ const __m128i weights = _mm_set1_epi16(weight << 11);
+
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i src_0_lo = LoadAligned16(pred_0 + x);
+ const __m128i src_1_lo = LoadAligned16(pred_1 + x);
+ const __m128i res_lo =
+ ComputeWeightedAverage8(src_0_lo, src_1_lo, weights);
+
+ const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8);
+ const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8);
+ const __m128i res_hi =
+ ComputeWeightedAverage8(src_0_hi, src_1_hi, weights);
+
+ StoreUnaligned16(dst + x, _mm_packus_epi16(res_lo, res_hi));
+ x += 16;
+ } while (x < width);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+}
+
+void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const uint8_t weight_0,
+ const uint8_t /*weight_1*/, const int width,
+ const int height,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ const uint8_t weight = weight_0;
+ if (width == 4) {
+ if (height == 4) {
+ DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight, dest,
+ dest_stride);
+ } else if (height == 8) {
+ DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight, dest,
+ dest_stride);
+ } else {
+ assert(height == 16);
+ DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight, dest,
+ dest_stride);
+ }
+ return;
+ }
+
+ if (width == 8) {
+ switch (height) {
+ case 4:
+ DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight, dest,
+ dest_stride);
+ return;
+ case 8:
+ DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight, dest,
+ dest_stride);
+ return;
+ case 16:
+ DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight, dest,
+ dest_stride);
+ return;
+ default:
+ assert(height == 32);
+ DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight, dest,
+ dest_stride);
+ return;
+ }
+ }
+
+ DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight, width, height, dest,
+ dest_stride);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(DistanceWeightedBlend)
+ dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kMax10bppSample = (1 << 10) - 1;
+constexpr int kInterPostRoundBit = 4;
+
+inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
+ const __m128i& pred1,
+ const __m128i& weight0,
+ const __m128i& weight1) {
+ // This offset is a combination of round_factor and round_offset
+ // which are to be added and subtracted respectively.
+ // Here kInterPostRoundBit + 4 is considering bitdepth=10.
+ constexpr int offset =
+ (1 << ((kInterPostRoundBit + 4) - 1)) - (kCompoundOffset << 4);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bias = _mm_set1_epi32(offset);
+ const __m128i clip_high = _mm_set1_epi16(kMax10bppSample);
+
+ __m128i prediction0 = _mm_cvtepu16_epi32(pred0);
+ __m128i mult0 = _mm_mullo_epi32(prediction0, weight0);
+ __m128i prediction1 = _mm_cvtepu16_epi32(pred1);
+ __m128i mult1 = _mm_mullo_epi32(prediction1, weight1);
+ __m128i sum = _mm_add_epi32(mult0, mult1);
+ sum = _mm_add_epi32(sum, bias);
+ const __m128i result0 = _mm_srai_epi32(sum, kInterPostRoundBit + 4);
+
+ prediction0 = _mm_unpackhi_epi16(pred0, zero);
+ mult0 = _mm_mullo_epi32(prediction0, weight0);
+ prediction1 = _mm_unpackhi_epi16(pred1, zero);
+ mult1 = _mm_mullo_epi32(prediction1, weight1);
+ sum = _mm_add_epi32(mult0, mult1);
+ sum = _mm_add_epi32(sum, bias);
+ const __m128i result1 = _mm_srai_epi32(sum, kInterPostRoundBit + 4);
+ const __m128i pack = _mm_packus_epi32(result0, result1);
+
+ return _mm_min_epi16(pack, clip_high);
+}
+
+template <int height>
+inline void DistanceWeightedBlend4xH_SSE4_1(
+ const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const __m128i weight0 = _mm_set1_epi32(weight_0);
+ const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+ int y = height;
+ do {
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res0 =
+ ComputeWeightedAverage8(src_00, src_10, weight0, weight1);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res1 =
+ ComputeWeightedAverage8(src_01, src_11, weight0, weight1);
+
+ StoreLo8(dst, res0);
+ dst += dest_stride;
+ StoreHi8(dst, res0);
+ dst += dest_stride;
+ StoreLo8(dst, res1);
+ dst += dest_stride;
+ StoreHi8(dst, res1);
+ dst += dest_stride;
+ y -= 4;
+ } while (y != 0);
+}
+
+template <int height>
+inline void DistanceWeightedBlend8xH_SSE4_1(
+ const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const __m128i weight0 = _mm_set1_epi32(weight_0);
+ const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+ int y = height;
+ do {
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res0 =
+ ComputeWeightedAverage8(src_00, src_10, weight0, weight1);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res1 =
+ ComputeWeightedAverage8(src_01, src_11, weight0, weight1);
+
+ StoreUnaligned16(dst, res0);
+ dst += dest_stride;
+ StoreUnaligned16(dst, res1);
+ dst += dest_stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+inline void DistanceWeightedBlendLarge_SSE4_1(
+ const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, const int width, const int height,
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const __m128i weight0 = _mm_set1_epi32(weight_0);
+ const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i src_0_lo = LoadAligned16(pred_0 + x);
+ const __m128i src_1_lo = LoadAligned16(pred_1 + x);
+ const __m128i res_lo =
+ ComputeWeightedAverage8(src_0_lo, src_1_lo, weight0, weight1);
+
+ const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8);
+ const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8);
+ const __m128i res_hi =
+ ComputeWeightedAverage8(src_0_hi, src_1_hi, weight0, weight1);
+
+ StoreUnaligned16(dst + x, res_lo);
+ x += 8;
+ StoreUnaligned16(dst + x, res_hi);
+ x += 8;
+ } while (x < width);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+}
+
+void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const uint8_t weight_0,
+ const uint8_t weight_1, const int width,
+ const int height,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(*pred_0);
+ if (width == 4) {
+ if (height == 4) {
+ DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ } else if (height == 8) {
+ DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ } else {
+ assert(height == 16);
+ DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ }
+ return;
+ }
+
+ if (width == 8) {
+ switch (height) {
+ case 4:
+ DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ return;
+ case 8:
+ DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ return;
+ case 16:
+ DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ return;
+ default:
+ assert(height == 32);
+ DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ return;
+ }
+ }
+
+ DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width,
+ height, dest, dst_stride);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(DistanceWeightedBlend)
+ dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void DistanceWeightedBlendInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void DistanceWeightedBlendInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::distance_weighted_blend. This function is not thread-safe.
+void DistanceWeightedBlendInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend
+#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DistanceWeightedBlend
+#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// Load 8 values from source, widening to int16_t intermediate value size.
+// The function is overloaded for each type and bitdepth for simplicity.
+inline __m128i LoadSource(const int8_t* src) {
+ return _mm_cvtepi8_epi16(LoadLo8(src));
+}
+
+// Load 8 values from source, widening to int16_t intermediate value size.
+inline __m128i LoadSource(const uint8_t* src) {
+ return _mm_cvtepu8_epi16(LoadLo8(src));
+}
+
+inline __m128i LoadSourceMsan(const uint8_t* src, const int valid_range) {
+ return _mm_cvtepu8_epi16(LoadLo8Msan(src, 8 - valid_range));
+}
+
+// Store 8 values to dest, narrowing to uint8_t from int16_t intermediate value.
+inline void StoreUnsigned(uint8_t* dest, const __m128i data) {
+ StoreLo8(dest, _mm_packus_epi16(data, data));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// Load 8 values from source.
+inline __m128i LoadSource(const int16_t* src) { return LoadUnaligned16(src); }
+
+// Load 8 values from source.
+inline __m128i LoadSource(const uint16_t* src) { return LoadUnaligned16(src); }
+
+// Store 8 values to dest.
+inline void StoreUnsigned(uint16_t* dest, const __m128i data) {
+ StoreUnaligned16(dest, data);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline __m128i GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
+ if (subsampling_x != 0) {
+ const __m128i src = LoadUnaligned16(luma);
+
+ return RightShiftWithRounding_U16(
+ _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
+ _mm_unpackhi_epi8(src, _mm_setzero_si128())),
+ 1);
+ }
+ return _mm_cvtepu8_epi16(LoadLo8(luma));
+}
+
+inline __m128i GetAverageLumaMsan(const uint8_t* const luma, int subsampling_x,
+ int valid_range) {
+ if (subsampling_x != 0) {
+ const __m128i src = LoadUnaligned16Msan(luma, 16 - valid_range);
+
+ return RightShiftWithRounding_U16(
+ _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
+ _mm_unpackhi_epi8(src, _mm_setzero_si128())),
+ 1);
+ }
+ return _mm_cvtepu8_epi16(LoadLo8Msan(luma, 8 - valid_range));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline __m128i GetAverageLuma(const uint16_t* const luma, int subsampling_x) {
+ if (subsampling_x != 0) {
+ return RightShiftWithRounding_U16(
+ _mm_hadd_epi16(LoadUnaligned16(luma), LoadUnaligned16(luma + 8)), 1);
+ }
+ return LoadUnaligned16(luma);
+}
+
+inline __m128i GetAverageLumaMsan(const uint16_t* const luma, int subsampling_x,
+ int valid_range) {
+ if (subsampling_x != 0) {
+ return RightShiftWithRounding_U16(
+ _mm_hadd_epi16(
+ LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)),
+ LoadUnaligned16Msan(luma + 8, 32 - valid_range * sizeof(*luma))),
+ 1);
+ }
+ return LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma));
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+inline __m128i Clip3(const __m128i value, const __m128i low,
+ const __m128i high) {
+ const __m128i clipped_to_ceiling = _mm_min_epi16(high, value);
+ return _mm_max_epi16(low, clipped_to_ceiling);
+}
+
+template <int bitdepth, typename Pixel>
+inline __m128i GetScalingFactors(const int16_t* scaling_lut,
+ const Pixel* source) {
+ alignas(16) int16_t start_vals[8];
+ static_assert(bitdepth <= kBitdepth10,
+ "SSE4 Film Grain is not yet implemented for 12bpp.");
+ for (int i = 0; i < 8; ++i) {
+ assert(source[i] < kScalingLookupTableSize << (bitdepth - 2));
+ start_vals[i] = scaling_lut[source[i]];
+ }
+ return LoadAligned16(start_vals);
+}
+
+// |scaling_shift| is in range [8,11].
+template <int bitdepth>
+inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling,
+ const __m128i scaling_shift) {
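+ // The caller passes 15 - shift in |scaling_shift|, so shifting |scaling|
+ // left by it and taking _mm_mulhrs_epi16 computes
+ // RightShiftWithRounding(noise * scaling, shift).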
+ const __m128i shifted_scale_factors = _mm_sll_epi16(scaling, scaling_shift);
+ return _mm_mulhrs_epi16(noise, shifted_scale_factors);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_SSE4_1(
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
+ int scaling_shift, int width, int height, int start_height,
+ const int16_t* scaling_lut_y, const void* source_plane_y,
+ ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
+ dest_stride_y /= sizeof(Pixel);
+ const __m128i floor = _mm_set1_epi16(min_value);
+ const __m128i ceiling = _mm_set1_epi16(max_luma);
+ const int safe_width = width & ~7;
+ const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+ int y = 0;
+ do {
+ int x = 0;
+ for (; x + 8 <= safe_width; x += 8) {
+ const __m128i orig = LoadSource(&in_y_row[x]);
+ const __m128i scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+ __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
+
+ noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
+ const __m128i combined = _mm_add_epi16(orig, noise);
+ StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
+ }
+
+ if (x < width) {
+ Pixel luma_buffer[8];
+ // Prevent arbitrary indices from entering GetScalingFactors.
+ memset(luma_buffer, 0, sizeof(luma_buffer));
+ const int valid_range = width - x;
+ assert(valid_range < 8);
+ memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0]));
+ luma_buffer[valid_range] = in_y_row[width - 1];
+ const __m128i orig = LoadSource(&in_y_row[x]);
+ const __m128i scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, luma_buffer);
+ __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
+
+ noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
+ const __m128i combined = _mm_add_epi16(orig, noise);
+ StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
+ }
+ in_y_row += source_stride_y;
+ out_y_row += dest_stride_y;
+ } while (++y < height);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+inline __m128i BlendChromaValsWithCfl(
+ const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
+ const int16_t* scaling_lut, const Pixel* LIBGAV1_RESTRICT chroma_cursor,
+ const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
+ const __m128i scaling_shift) {
+ const __m128i scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
+ const __m128i orig = LoadSource(chroma_cursor);
+ __m128i noise = LoadSource(noise_image_cursor);
+ noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift);
+ return _mm_add_epi16(orig, noise);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
+ const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, int scaling_shift, const int16_t* scaling_lut,
+ const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+ const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma,
+ Pixel* out_chroma_row, ptrdiff_t dest_stride) {
+ const __m128i floor = _mm_set1_epi16(min_value);
+ const __m128i ceiling = _mm_set1_epi16(max_chroma);
+ alignas(16) Pixel luma_buffer[16];
+
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ // |chroma_width| is rounded up. If |width| is odd, then the final pixel will
+ // need to be guarded from overread, even if |chroma_width| is divisible by 8.
+ const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
+
+ // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
+ // in GetScalingFactors.
+ Pixel average_luma_buffer[8];
+ assert(start_height % 2 == 0);
+ start_height >>= subsampling_y;
+ const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+ int y = 0;
+ do {
+ int x = 0;
+ for (; x + 8 <= safe_chroma_width; x += 8) {
+ const int luma_x = x << subsampling_x;
+ const __m128i average_luma =
+ GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+ StoreUnsigned(average_luma_buffer, average_luma);
+
+ const __m128i blended =
+ BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+ average_luma_buffer, scaling_lut, &in_chroma_row[x],
+ &(noise_image[y + start_height][x]), derived_scaling_shift);
+ StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+ }
+
+ if (x < chroma_width) {
+ // Prevent huge indices from entering GetScalingFactors due to
+ // uninitialized values. This is not a problem in 8bpp because the table
+ // is made larger than 255 values.
+ if (bitdepth > kBitdepth8) {
+ memset(luma_buffer, 0, sizeof(luma_buffer));
+ }
+ const int luma_x = x << subsampling_x;
+ const int valid_range = width - luma_x;
+ assert(valid_range < 16);
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
+ luma_buffer[valid_range] = in_y_row[width - 1];
+ const __m128i average_luma =
+ GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
+ StoreUnsigned(average_luma_buffer, average_luma);
+
+ const __m128i blended =
+ BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+ average_luma_buffer, scaling_lut, &in_chroma_row[x],
+ &(noise_image[y + start_height][x]), derived_scaling_shift);
+ StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+ }
+
+ in_y_row += source_stride_y << subsampling_y;
+ in_chroma_row += source_stride_chroma;
+ out_chroma_row += dest_stride;
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_SSE4_1(
+ Plane plane, const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, const int16_t* scaling_lut,
+ const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+
+ const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+ source_stride_uv /= sizeof(Pixel);
+ auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+ dest_stride_uv /= sizeof(Pixel);
+ BlendChromaPlaneWithCfl_SSE4_1<bitdepth, GrainType, Pixel>(
+ noise_image[plane], min_value, max_chroma, width, height, start_height,
+ subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
+ source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
+}
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+// |offset| holds packed 16-bit values, added after the madd results are
+// packed back down to 16 bits.
+inline __m128i BlendChromaValsNoCfl8bpp(
+ const int16_t* scaling_lut, const __m128i& orig,
+ const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
+ const __m128i& average_luma, const __m128i& scaling_shift,
+ const __m128i& offset, const __m128i& weights) {
+ uint8_t merged_buffer[8];
+ const __m128i combined_lo =
+ _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights);
+ const __m128i combined_hi =
+ _mm_madd_epi16(_mm_unpackhi_epi16(average_luma, orig), weights);
+ const __m128i merged_base = _mm_packs_epi32(_mm_srai_epi32((combined_lo), 6),
+ _mm_srai_epi32((combined_hi), 6));
+
+ const __m128i merged = _mm_add_epi16(merged_base, offset);
+
+ StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged));
+ const __m128i scaling =
+ GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
+ __m128i noise = LoadSource(noise_image_cursor);
+ noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift);
+ return _mm_add_epi16(orig, noise);
+}
+
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
+ const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, int scaling_shift, int chroma_offset,
+ int chroma_multiplier, int luma_multiplier, const int16_t* scaling_lut,
+ const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+ const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma,
+ uint8_t* out_chroma_row, ptrdiff_t dest_stride) {
+ const __m128i floor = _mm_set1_epi16(min_value);
+ const __m128i ceiling = _mm_set1_epi16(max_chroma);
+
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ // |chroma_width| is rounded up. If |width| is odd, then the final luma pixel
+ // will need to be guarded from overread, even if |chroma_width| is a
+ // multiple of 8.
+ const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
+ alignas(16) uint8_t luma_buffer[16];
+ const __m128i offset = _mm_set1_epi16(chroma_offset);
+ const __m128i multipliers = _mm_set1_epi32(LeftShift(chroma_multiplier, 16) |
+ (luma_multiplier & 0xFFFF));
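+ // With (luma, chroma) pairs interleaved by _mm_unpacklo_epi16, a single
+ // _mm_madd_epi16 against these packed multipliers computes
+ // luma * luma_multiplier + chroma * chroma_multiplier for each pixel.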
+ const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ for (; x + 8 <= safe_chroma_width; x += 8) {
+ const int luma_x = x << subsampling_x;
+ const __m128i average_luma =
+ GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+ const __m128i orig_chroma = LoadSource(&in_chroma_row[x]);
+ const __m128i blended = BlendChromaValsNoCfl8bpp(
+ scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+ average_luma, derived_scaling_shift, offset, multipliers);
+ StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+ }
+
+ if (x < chroma_width) {
+ // Begin right edge iteration. Same as the normal iterations, but the
+ // |average_luma| computation requires a duplicated luma value at the
+ // end.
+ const int luma_x = x << subsampling_x;
+ const int valid_range = width - luma_x;
+ assert(valid_range < 16);
+ // There is no need to pre-initialize this buffer: in the 8bpp case the
+ // merged values used as indices are saturated, and results computed from
+ // uninitialized lanes land in the padding outside the visible frame.
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
+ luma_buffer[valid_range] = in_y_row[width - 1];
+ const int valid_range_chroma = chroma_width - x;
+ uint8_t chroma_buffer[8];
+ memcpy(chroma_buffer, &in_chroma_row[x],
+ valid_range_chroma * sizeof(in_chroma_row[0]));
+
+ const __m128i average_luma =
+ GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
+ const __m128i orig_chroma =
+ LoadSourceMsan(chroma_buffer, valid_range_chroma);
+ const __m128i blended = BlendChromaValsNoCfl8bpp(
+ scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+ average_luma, derived_scaling_shift, offset, multipliers);
+ StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+ // End of right edge iteration.
+ }
+
+ in_y_row += source_stride_y << subsampling_y;
+ in_chroma_row += source_stride_chroma;
+ out_chroma_row += dest_stride;
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+void BlendNoiseWithImageChroma8bpp_SSE4_1(
+ Plane plane, const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, const int16_t* scaling_lut,
+ const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ assert(plane == kPlaneU || plane == kPlaneV);
+ const auto* noise_image =
+ static_cast<const Array2D<int8_t>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
+ const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
+ auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
+
+ const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+ const int luma_multiplier =
+ (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+ const int multiplier =
+ (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+ BlendChromaPlane8bpp_SSE4_1(
+ noise_image[plane], min_value, max_chroma, width, height, start_height,
+ subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
+ luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
+ source_stride_uv, out_uv, dest_stride_uv);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_SSE4_1<kBitdepth8, int8_t, uint8_t>;
+ dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth8, int8_t, uint8_t>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_SSE4_1<kBitdepth10, int16_t, uint16_t>;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth10, int16_t, uint16_t>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace film_grain
+
+void FilmGrainInit_SSE4_1() {
+ film_grain::low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ film_grain::high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void FilmGrainInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initialize members of Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_CPU_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kKernelTaps = 5;
+constexpr int kKernels[3][kKernelTaps] = {
+ {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
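+// Each kernel's taps sum to 16, so filter outputs are normalized with a
+// rounded right shift by 4.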
+constexpr int kMaxEdgeBufferSize = 129;
+
+// This function applies the kernel [0, 4, 8, 4, 0] to 12 values.
+// Assumes |source| has 16 packed byte values. Produces 12 valid filter
+// outputs; the last 4 bytes of the 16-byte store are overwritten by a later
+// iteration or discarded.
+inline void ComputeKernel1Store12(uint8_t* LIBGAV1_RESTRICT dest,
+ const uint8_t* LIBGAV1_RESTRICT source) {
+ const __m128i edge_lo = LoadUnaligned16(source);
+ const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
+ // Samples matched with the '4' tap, expanded to 16-bit.
+ const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
+ const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
+ // Samples matched with the '8' tap, expanded to 16-bit.
+ const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
+ const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);
+
+ // Apply the taps by shifting.
+ const __m128i outers4_lo = _mm_slli_epi16(outers_lo, 2);
+ const __m128i outers4_hi = _mm_slli_epi16(outers_hi, 2);
+ const __m128i centers8_lo = _mm_slli_epi16(centers_lo, 3);
+ const __m128i centers8_hi = _mm_slli_epi16(centers_hi, 3);
+ // Move latter 4x values down to add with first 4x values for each output.
+ const __m128i partial_sums_lo =
+ _mm_add_epi16(outers4_lo, _mm_srli_si128(outers4_lo, 4));
+ const __m128i partial_sums_hi =
+ _mm_add_epi16(outers4_hi, _mm_srli_si128(outers4_hi, 4));
+ // Add the 8x center values (already aligned) for the final kernel sum.
+ const __m128i sums_lo = RightShiftWithRounding_U16(
+ _mm_add_epi16(partial_sums_lo, centers8_lo), 4);
+ const __m128i sums_hi = RightShiftWithRounding_U16(
+ _mm_add_epi16(partial_sums_hi, centers8_hi), 4);
+
+ const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo);
+ const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi);
+ const __m128i result =
+ _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10);
+ StoreUnaligned16(dest, result);
+}
+
+// This function applies the kernel [0, 5, 6, 5, 0] to 12 values.
+// Assumes |source| has 16 packed byte values. Produces 12 valid filter
+// outputs; the last 4 bytes of the 16-byte store are overwritten by a later
+// iteration or discarded.
+inline void ComputeKernel2Store12(uint8_t* LIBGAV1_RESTRICT dest,
+ const uint8_t* LIBGAV1_RESTRICT source) {
+ const __m128i edge_lo = LoadUnaligned16(source);
+ const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
+ const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
+ const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
+ const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
+ const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);
+ // Samples matched with the '5' tap, expanded to 16-bit. Add x + 4x.
+ const __m128i outers5_lo =
+ _mm_add_epi16(outers_lo, _mm_slli_epi16(outers_lo, 2));
+ const __m128i outers5_hi =
+ _mm_add_epi16(outers_hi, _mm_slli_epi16(outers_hi, 2));
+ // Samples matched with the '6' tap, expanded to 16-bit. Add 2x + 4x.
+ const __m128i centers6_lo = _mm_add_epi16(_mm_slli_epi16(centers_lo, 1),
+ _mm_slli_epi16(centers_lo, 2));
+ const __m128i centers6_hi = _mm_add_epi16(_mm_slli_epi16(centers_hi, 1),
+ _mm_slli_epi16(centers_hi, 2));
+ // Move latter 5x values down to add with first 5x values for each output.
+ const __m128i partial_sums_lo =
+ _mm_add_epi16(outers5_lo, _mm_srli_si128(outers5_lo, 4));
+ // Move 6x values down to add for the final kernel sum for each output.
+ const __m128i sums_lo = RightShiftWithRounding_U16(
+ _mm_add_epi16(centers6_lo, partial_sums_lo), 4);
+ // Shift latter 5x values to add with first 5x values for each output.
+ const __m128i partial_sums_hi =
+ _mm_add_epi16(outers5_hi, _mm_srli_si128(outers5_hi, 4));
+ // Move 6x values down to add for the final kernel sum for each output.
+ const __m128i sums_hi = RightShiftWithRounding_U16(
+ _mm_add_epi16(centers6_hi, partial_sums_hi), 4);
+ // First 6 values are valid outputs.
+ const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo);
+ const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi);
+ const __m128i result =
+ _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10);
+ StoreUnaligned16(dest, result);
+}
+
+// This function applies the kernel [2, 4, 4, 4, 2] to 8 values.
+inline void ComputeKernel3Store8(uint8_t* LIBGAV1_RESTRICT dest,
+ const uint8_t* LIBGAV1_RESTRICT source) {
+ const __m128i edge_lo = LoadUnaligned16(source);
+ const __m128i edge_hi = _mm_srli_si128(edge_lo, 4);
+ // Finish |edge_lo| life cycle quickly.
+ // Multiply for 2x.
+ const __m128i source2_lo = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_lo), 1);
+ // Multiply 2x by 2 and align.
+ const __m128i source4_lo = _mm_srli_si128(_mm_slli_epi16(source2_lo, 1), 2);
+ // Finish |source2| life cycle quickly.
+ // Move latter 2x values down to add with first 2x values for each output.
+ __m128i sum = _mm_add_epi16(source2_lo, _mm_srli_si128(source2_lo, 8));
+ // First 4x values already aligned to add with running total.
+ sum = _mm_add_epi16(sum, source4_lo);
+ // Move second 4x values down to add with running total.
+ sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 2));
+ // Move third 4x values down to add with running total.
+ sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 4));
+ // Multiply for 2x.
+ const __m128i source2_hi = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_hi), 1);
+ // Multiply 2x by 2 and align.
+ const __m128i source4_hi = _mm_srli_si128(_mm_slli_epi16(source2_hi, 1), 2);
+ // Move latter 2x values down to add with first 2x values for each output.
+ __m128i sum_hi = _mm_add_epi16(source2_hi, _mm_srli_si128(source2_hi, 8));
+ // First 4x values already aligned to add with running total.
+ sum_hi = _mm_add_epi16(sum_hi, source4_hi);
+ // Move second 4x values down to add with running total.
+ sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 2));
+ // Move third 4x values down to add with running total.
+ sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 4));
+
+ // Because we have only 8 values here, it is safe to align before packing down
+ // to 8-bit without losing data.
+ sum = _mm_alignr_epi8(sum_hi, _mm_slli_si128(sum, 8), 8);
+ sum = RightShiftWithRounding_U16(sum, 4);
+ StoreLo8(dest, _mm_packus_epi16(sum, sum));
+}
+
+void IntraEdgeFilter_SSE4_1(void* buffer, int size, int strength) {
+ uint8_t edge[kMaxEdgeBufferSize + 4];
+ memcpy(edge, buffer, size);
+ auto* dst_buffer = static_cast<uint8_t*>(buffer);
+
+ // Only |size| - 1 elements are filtered, so there is nothing to do when
+ // |size| == 1.
+ if (size == 1) return;
+
+ int i = 0;
+ switch (strength) {
+ case 1:
+ // To avoid overwriting, we stop short from the total write size plus the
+ // initial offset. In this case 12 valid values are written in two blocks
+ // of 8 bytes each.
+ for (; i < size - 17; i += 12) {
+ ComputeKernel1Store12(dst_buffer + i + 1, edge + i);
+ }
+ break;
+ case 2:
+ // See the comment for case 1.
+ for (; i < size - 17; i += 12) {
+ ComputeKernel2Store12(dst_buffer + i + 1, edge + i);
+ }
+ break;
+ default:
+ assert(strength == 3);
+ // The first filter input is repeated for taps of value 2 and 4.
+ dst_buffer[1] = RightShiftWithRounding(
+ (6 * edge[0] + 4 * edge[1] + 4 * edge[2] + 2 * edge[3]), 4);
+ // In this case, one block of 8 bytes is written in each iteration, with
+ // an offset of 2.
+ for (; i < size - 10; i += 8) {
+ ComputeKernel3Store8(dst_buffer + i + 2, edge + i);
+ }
+ }
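+ // Finish the remaining elements with a scalar loop; Clip3 on the indices
+ // clamps the kernel taps at both ends of the buffer.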
+ const int kernel_index = strength - 1;
+ for (int final_index = Clip3(i, 1, size - 2); final_index < size;
+ ++final_index) {
+ int sum = 0;
+ for (int j = 0; j < kKernelTaps; ++j) {
+ const int k = Clip3(final_index + j - 2, 0, size - 1);
+ sum += kKernels[kernel_index][j] * edge[k];
+ }
+ dst_buffer[final_index] = RightShiftWithRounding(sum, 4);
+ }
+}
+
+constexpr int kMaxUpsampleSize = 16;
+
+// Applies the upsampling kernel [-1, 9, 9, -1] to alternating pixels, and
+// interleaves the results with the original values. This implementation assumes
+// that it is safe to write the maximum number of upsampled pixels (32) to the
+// edge buffer, even when |size| is small.
+void IntraEdgeUpsampler_SSE4_1(void* buffer, int size) {
+ assert(size % 4 == 0 && size <= kMaxUpsampleSize);
+ auto* const pixel_buffer = static_cast<uint8_t*>(buffer);
+ uint8_t temp[kMaxUpsampleSize + 8];
+ temp[0] = temp[1] = pixel_buffer[-1];
+ memcpy(temp + 2, pixel_buffer, sizeof(temp[0]) * size);
+ temp[size + 2] = pixel_buffer[size - 1];
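+ // The upsampling kernel [-1, 9, 9, -1] sums to 16; each interpolated value
+ // is normalized with a rounded right shift by 4, then saturated by packing.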
+
+ pixel_buffer[-2] = temp[0];
+ const __m128i data = LoadUnaligned16(temp);
+ const __m128i src_lo = _mm_cvtepu8_epi16(data);
+ const __m128i src_hi = _mm_unpackhi_epi8(data, _mm_setzero_si128());
+ const __m128i src9_hi = _mm_add_epi16(src_hi, _mm_slli_epi16(src_hi, 3));
+ const __m128i src9_lo = _mm_add_epi16(src_lo, _mm_slli_epi16(src_lo, 3));
+ __m128i sum_lo = _mm_sub_epi16(_mm_alignr_epi8(src9_hi, src9_lo, 2), src_lo);
+ sum_lo = _mm_add_epi16(sum_lo, _mm_alignr_epi8(src9_hi, src9_lo, 4));
+ sum_lo = _mm_sub_epi16(sum_lo, _mm_alignr_epi8(src_hi, src_lo, 6));
+ sum_lo = RightShiftWithRounding_S16(sum_lo, 4);
+ const __m128i result_lo = _mm_unpacklo_epi8(_mm_packus_epi16(sum_lo, sum_lo),
+ _mm_srli_si128(data, 2));
+ StoreUnaligned16(pixel_buffer - 1, result_lo);
+ if (size > 8) {
+ const __m128i src_hi_extra = _mm_cvtepu8_epi16(LoadLo8(temp + 16));
+ const __m128i src9_hi_extra =
+ _mm_add_epi16(src_hi_extra, _mm_slli_epi16(src_hi_extra, 3));
+ __m128i sum_hi =
+ _mm_sub_epi16(_mm_alignr_epi8(src9_hi_extra, src9_hi, 2), src_hi);
+ sum_hi = _mm_add_epi16(sum_hi, _mm_alignr_epi8(src9_hi_extra, src9_hi, 4));
+ sum_hi = _mm_sub_epi16(sum_hi, _mm_alignr_epi8(src_hi_extra, src_hi, 6));
+ sum_hi = RightShiftWithRounding_S16(sum_hi, 4);
+ const __m128i result_hi =
+ _mm_unpacklo_epi8(_mm_packus_epi16(sum_hi, sum_hi), LoadLo8(temp + 10));
+ StoreUnaligned16(pixel_buffer + 15, result_hi);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeFilter)
+ dsp->intra_edge_filter = IntraEdgeFilter_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeUpsampler)
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void IntraEdgeInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraEdgeInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter
+#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeUpsampler
+#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_cfl.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// This duplicates the last two 16-bit values in |row|.
+inline __m128i LastRowSamples(const __m128i row) {
+ return _mm_shuffle_epi32(row, 0xFF);
+}
+
+// This duplicates the last 16-bit value in |row|.
+inline __m128i LastRowResult(const __m128i row) {
+ const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF);
+ return _mm_shuffle_epi32(dup_row, 0xFF);
+}
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
+inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
+ const __m128i vertical_sum1,
+ int16_t* luma_ptr) {
+ __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
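+ // The horizontal add yields the sum of four pixels (average << 2); one more
+ // left shift produces the q3 value expected by the CFL predictor.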
+ result = _mm_slli_epi16(result, 1);
+ StoreLo8(luma_ptr, result);
+ StoreHi8(luma_ptr + kCflLumaBufferStride, result);
+ return result;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
+ const __m128i vertical_sum1,
+ int16_t* luma_ptr) {
+ __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+ result = _mm_slli_epi16(result, 1);
+ StoreUnaligned16(luma_ptr, result);
+ return result;
+}
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_SSE4_1
+
+inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
+ __m128i alpha_sign, __m128i dc_q0) {
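+ // |ac_q3| holds luma AC values in q3 and |alpha_q12| holds alpha in q12, so
+ // the mulhrs product is q0. The multiply operates on magnitudes; the sign
+ // of alpha * ac is restored afterwards.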
+ const __m128i ac_q3 = LoadUnaligned16(input);
+ const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+ __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+ scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+ return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+template <int width, int height>
+void CflIntraPredictor_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i alpha_sign = _mm_set1_epi16(alpha);
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+ auto* row = reinterpret_cast<const __m128i*>(luma);
+ const int kCflLumaBufferStrideLog2_16i = 5;
+ const int kCflLumaBufferStrideLog2_128i = kCflLumaBufferStrideLog2_16i - 3;
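+ // Each luma row spans 32 int16 values, i.e. four __m128i registers.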
+ const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
+ const __m128i dc_val = _mm_set1_epi16(dst[0]);
+ do {
+ __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
+ if (width < 16) {
+ res = _mm_packus_epi16(res, res);
+ if (width == 4) {
+ Store4(dst, res);
+ } else {
+ StoreLo8(dst, res);
+ }
+ } else {
+ __m128i next =
+ CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+ res = _mm_packus_epi16(res, next);
+ StoreUnaligned16(dst, res);
+ if (width == 32) {
+ res = CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
+ next = CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
+ res = _mm_packus_epi16(res, next);
+ StoreUnaligned16(dst + 16, res);
+ }
+ }
+ dst += stride;
+ } while ((row += (1 << kCflLumaBufferStrideLog2_128i)) < row_end);
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint8_t*>(source);
+ __m128i sum = _mm_setzero_si128();
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i samples;
+ int y = 0;
+ do {
+ samples = Load4(src);
+ src += stride;
+ int src_bytes;
+ memcpy(&src_bytes, src, 4);
+ samples = _mm_insert_epi32(samples, src_bytes, 1);
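+    // |samples| now holds two 4-byte rows in its low two 32-bit lanes.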
+ src += stride;
+ samples = _mm_slli_epi16(_mm_cvtepu8_epi16(samples), 3);
+ StoreLo8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
+ StoreHi8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
+
+    // The maximum value here is bounded by 2**bd * H * 2**shift. Since the
+    // maximum H for 4xH is 16 = 2**4, the sum stays below
+    // 2**(8 + 4 + 3) = 2**15 and fits in 16 bits.
+ sum = _mm_add_epi16(sum, samples);
+ y += 2;
+ } while (y < visible_height);
+
+ if (!is_inside) {
+ // Replicate the 2 high lanes.
+ samples = _mm_shuffle_epi32(samples, 0xee);
+ do {
+ StoreLo8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
+ StoreHi8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
+ sum = _mm_add_epi16(sum, samples);
+ y += 2;
+ } while (y < block_height);
+ }
+
+ __m128i sum_tmp = _mm_unpackhi_epi16(sum, zero);
+ sum = _mm_cvtepu16_epi32(sum);
+ sum = _mm_add_epi32(sum, sum_tmp);
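+  // Fold the four 32-bit lanes down to a single total in the low lane.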
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ sum, block_height_log2 + 2 /* log2 of width 4 */);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadLo8(luma_ptr);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ static_cast<void>(max_luma_width);
+ constexpr int block_height = 1 << block_height_log2;
+
+ if (block_height <= max_luma_height) {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_height_log2, bool inside>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 5, "");
+ const int block_height = 1 << block_height_log2, block_width = 8;
+ const int visible_height = max_luma_height;
+ const int invisible_width = inside ? 0 : block_width - max_luma_width;
+ const int visible_width = max_luma_width;
+ const __m128i blend_mask =
+ inside ? _mm_setzero_si128() : MaskHighNBytes(8 + invisible_width);
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
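+  // Shuffle control that, with _mm_shuffle_epi8, broadcasts the low 16-bit
+  // word to all eight lanes (every byte pair selects bytes {0, 1}).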
+ const auto* src = static_cast<const uint8_t*>(source);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+  // Since the maximum height is 32, splitting the accumulation by row parity
+  // means each accumulator sums at most 16 rows. As in the 4xH calculation,
+  // the sums therefore fit in 16 bits without widening to 32 bits.
+ __m128i sum_even = _mm_setzero_si128(), sum_odd = _mm_setzero_si128();
+ __m128i sum;
+ __m128i samples1;
+
+ int y = 0;
+ do {
+ __m128i samples0 = LoadLo8(src);
+ if (!inside) {
+ const __m128i border0 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width - 1]));
+ samples0 = _mm_blendv_epi8(samples0, border0, blend_mask);
+ }
+ src += stride;
+ samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples0), 3);
+ StoreUnaligned16(luma_ptr, samples0);
+ luma_ptr += kCflLumaBufferStride;
+
+ sum_even = _mm_add_epi16(sum_even, samples0);
+
+ samples1 = LoadLo8(src);
+ if (!inside) {
+ const __m128i border1 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width - 1]));
+ samples1 = _mm_blendv_epi8(samples1, border1, blend_mask);
+ }
+ src += stride;
+ samples1 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples1), 3);
+ StoreUnaligned16(luma_ptr, samples1);
+ luma_ptr += kCflLumaBufferStride;
+
+ sum_odd = _mm_add_epi16(sum_odd, samples1);
+ y += 2;
+ } while (y < visible_height);
+
+ if (!inside) {
+ for (int y = visible_height; y < block_height; y += 2) {
+ sum_even = _mm_add_epi16(sum_even, samples1);
+ StoreUnaligned16(luma_ptr, samples1);
+ luma_ptr += kCflLumaBufferStride;
+
+ sum_odd = _mm_add_epi16(sum_odd, samples1);
+ StoreUnaligned16(luma_ptr, samples1);
+ luma_ptr += kCflLumaBufferStride;
+ }
+ }
+
+ sum = _mm_add_epi32(_mm_unpackhi_epi16(sum_even, zero),
+ _mm_cvtepu16_epi32(sum_even));
+ sum = _mm_add_epi32(sum, _mm_unpackhi_epi16(sum_odd, zero));
+ sum = _mm_add_epi32(sum, _mm_cvtepu16_epi32(sum_odd));
+
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ sum, block_height_log2 + 3 /* log2 of width 8 */);
+ averages = _mm_shuffle_epi8(averages, dup16);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 8;
+
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+// This function will only work for block_width 16 and 32.
+template <int block_width_log2, int block_height_log2, bool inside>
+void CflSubsampler444_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 1 << block_width_log2;
+
+ const int visible_height = max_luma_height;
+ const int visible_width_16 = inside ? 16 : std::min(16, max_luma_width);
+ const int invisible_width_16 = 16 - visible_width_16;
+ const __m128i blend_mask_16 = MaskHighNBytes(invisible_width_16);
+ const int visible_width_32 = inside ? 32 : max_luma_width;
+ const int invisible_width_32 = 32 - visible_width_32;
+ const __m128i blend_mask_32 =
+ MaskHighNBytes(std::min(16, invisible_width_32));
+
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const __m128i zero = _mm_setzero_si128();
+ const auto* src = static_cast<const uint8_t*>(source);
+ int16_t* luma_ptr = luma[0];
+ __m128i sum = _mm_setzero_si128();
+
+ __m128i samples0, samples1;
+ __m128i samples2, samples3;
+ __m128i inner_sum_lo, inner_sum_hi;
+ int y = 0;
+ do {
+    // This load may read uninitialized bytes beyond the visible width. They
+    // are masked off by the blendv below, but MSAN does not model that, so
+    // use the annotated load.
+ __m128i samples01 = LoadUnaligned16Msan(src, invisible_width_16);
+
+ if (!inside) {
+ const __m128i border16 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width_16 - 1]));
+ samples01 = _mm_blendv_epi8(samples01, border16, blend_mask_16);
+ }
+ samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3);
+ samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3);
+
+ StoreUnaligned16(luma_ptr, samples0);
+ StoreUnaligned16(luma_ptr + 8, samples1);
+ __m128i inner_sum = _mm_add_epi16(samples0, samples1);
+
+ if (block_width == 32) {
+      // This load may read uninitialized bytes beyond the visible width. They
+      // are masked off by the blendv below, but MSAN does not model that, so
+      // use the annotated load.
+ __m128i samples23 = LoadUnaligned16Msan(src + 16, invisible_width_32);
+ if (!inside) {
+ const __m128i border32 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width_32 - 1]));
+ samples23 = _mm_blendv_epi8(samples23, border32, blend_mask_32);
+ }
+ samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3);
+ samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3);
+
+ StoreUnaligned16(luma_ptr + 16, samples2);
+ StoreUnaligned16(luma_ptr + 24, samples3);
+ inner_sum = _mm_add_epi16(samples2, inner_sum);
+ inner_sum = _mm_add_epi16(samples3, inner_sum);
+ }
+
+ inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+ inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ luma_ptr += kCflLumaBufferStride;
+ src += stride;
+ } while (++y < visible_height);
+
+ if (!inside) {
+ for (int y = visible_height; y < block_height;
+ luma_ptr += kCflLumaBufferStride, ++y) {
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ StoreUnaligned16(luma_ptr, samples0);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ StoreUnaligned16(luma_ptr + 8, samples1);
+ if (block_width == 32) {
+ StoreUnaligned16(luma_ptr + 16, samples2);
+ StoreUnaligned16(luma_ptr + 24, samples3);
+ }
+ }
+ }
+
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ __m128i averages =
+ RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2);
+ averages = _mm_shuffle_epi8(averages, dup16);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ for (int x = 0; x < block_width; x += 8) {
+ __m128i samples = LoadUnaligned16(&luma_ptr[x]);
+ StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples, averages));
+ }
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 1 << block_width_log2;
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_SSE4_1<block_width_log2, block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_SSE4_1<block_width_log2, block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint8_t*>(source);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = 0;
+ do {
+    // With 2x horizontal subsampling, this 4-wide block reads 8 luma samples
+    // per row; widening them to 16 bits fills the vector.
+ const __m128i samples_row0 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row1 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);
+
+ const __m128i samples_row2 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row3 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
+ __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ const __m128i samples_row4 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row5 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);
+
+ const __m128i samples_row6 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row7 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y += 4;
+ } while (y < luma_height);
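+  // If max_luma_height < block_height, replicate the last subsampled row and
+  // keep adding it to the running sum.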
+ const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
+ for (; y < block_height; ++y) {
+ StoreLo8(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ final_sum, block_height_log2 + 2 /*log2 of width 4*/);
+
+ averages = _mm_shufflelo_epi16(averages, 0);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadLo8(luma_ptr);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint8_t*>(source);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = 0;
+
+ do {
+ const __m128i samples_row00 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row01 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row00);
+ src += stride;
+ const __m128i samples_row10 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row11 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row10);
+ src += stride;
+ const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
+ __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row20 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row21 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row20);
+ src += stride;
+ const __m128i samples_row30 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row31 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row30);
+ src += stride;
+ const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
+ const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row40 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row41 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row40);
+ src += stride;
+ const __m128i samples_row50 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row51 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row50);
+ src += stride;
+ const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
+ const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row60 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row61 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row60);
+ src += stride;
+ const __m128i samples_row70 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row71 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row70);
+ src += stride;
+ const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
+ const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y += 4;
+ } while (y < luma_height);
+  // Duplicate the final row to fill the rows beyond max_luma_height.
+ const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
+ const __m128i final_fill_to_sum1 =
+ _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
+ for (; y < block_height; ++y) {
+ StoreUnaligned16(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_height_log2 + 3 /*log2 of width 8*/);
+
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ if (max_luma_width == 8) {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ const auto* src = static_cast<const uint8_t*>(source);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int block_height = 1 << block_height_log2;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ static_assert(max_luma_width <= 32, "");
+
+ int16_t* luma_ptr = luma[0];
+ __m128i final_row_result;
+ // Begin first y section, covering width up to 32.
+ int y = 0;
+ do {
+ const uint8_t* src_next = src + stride;
+ const __m128i samples_row0_lo = LoadUnaligned16(src);
+ const __m128i samples_row00 = _mm_cvtepu8_epi16(samples_row0_lo);
+ const __m128i samples_row01 = (max_luma_width >= 16)
+ ? _mm_unpackhi_epi8(samples_row0_lo, zero)
+ : LastRowSamples(samples_row00);
+ const __m128i samples_row0_hi = LoadUnaligned16(src + 16);
+ const __m128i samples_row02 = (max_luma_width >= 24)
+ ? _mm_cvtepu8_epi16(samples_row0_hi)
+ : LastRowSamples(samples_row01);
+ const __m128i samples_row03 = (max_luma_width == 32)
+ ? _mm_unpackhi_epi8(samples_row0_hi, zero)
+ : LastRowSamples(samples_row02);
+ const __m128i samples_row1_lo = LoadUnaligned16(src_next);
+ const __m128i samples_row10 = _mm_cvtepu8_epi16(samples_row1_lo);
+ const __m128i samples_row11 = (max_luma_width >= 16)
+ ? _mm_unpackhi_epi8(samples_row1_lo, zero)
+ : LastRowSamples(samples_row10);
+ const __m128i samples_row1_hi = LoadUnaligned16(src_next + 16);
+ const __m128i samples_row12 = (max_luma_width >= 24)
+ ? _mm_cvtepu8_epi16(samples_row1_hi)
+ : LastRowSamples(samples_row11);
+ const __m128i samples_row13 = (max_luma_width == 32)
+ ? _mm_unpackhi_epi8(samples_row1_hi, zero)
+ : LastRowSamples(samples_row12);
+ const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
+ const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
+ const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
+ __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+ final_row_result =
+ StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+ sum = _mm_add_epi16(sum, final_row_result);
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
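+      // |wide_fill| broadcasts the last computed value to all eight lanes;
+      // adding it twice accounts for the 16 duplicated columns at x = 16..31.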
+ sum = _mm_add_epi16(sum, wide_fill);
+ sum = _mm_add_epi16(sum, wide_fill);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ src += stride << 1;
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < luma_height);
+
+ // Begin second y section.
+ if (y < block_height) {
+ const __m128i final_fill0 =
+ LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill1 =
+ LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+ __m128i wide_fill;
+
+ if (block_width_log2 == 5) {
+      // Columns 16..31 hold 16 copies of the last computed value. After
+      // widening to 32-bit only four lanes survive, so shift left by 2 (x4)
+      // so that the final horizontal reduction counts all 16 copies.
+ wide_fill =
+ _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
+ }
+
+ const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
+ const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
+ const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_inner_sum0, final_inner_sum1);
+
+ do {
+ StoreUnaligned16(luma_ptr, final_fill0);
+ StoreUnaligned16(luma_ptr + 8, final_fill1);
+ if (block_width_log2 == 5) {
+ final_sum = _mm_add_epi32(final_sum, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ } while (++y < block_height);
+ } // End second y section.
+
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_width_log2 + block_height_log2);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples0 = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
+ const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
+ final_row_result = _mm_sub_epi16(samples1, averages);
+ StoreUnaligned16(luma_ptr + 8, final_row_result);
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ StoreUnaligned16(luma_ptr + 16, wide_fill);
+ StoreUnaligned16(luma_ptr + 24, wide_fill);
+ }
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ switch (max_luma_width) {
+ case 8:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ case 16:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ case 24:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ default:
+ assert(max_luma_width == 32);
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 5>;
+#endif
+
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<5, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor_SSE4_1<4, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_SSE4_1<4, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor_SSE4_1<8, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor_SSE4_1<8, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_SSE4_1<8, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_SSE4_1<8, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_SSE4_1<16, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_SSE4_1<16, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_SSE4_1<16, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_SSE4_1<16, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_SSE4_1<32, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_SSE4_1<32, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_SSE4_1<32, 32>;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_10bpp_SSE4_1
+
+inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
+ __m128i alpha_sign, __m128i dc_q0) {
+ const __m128i ac_q3 = LoadUnaligned16(input);
+ const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+ __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+ scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+ return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) {
+ return _mm_max_epi16(_mm_min_epi16(x, max), min);
+}
+
+template <int width, int height>
+void CflIntraPredictor_10bpp_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ constexpr int kCflLumaBufferStrideLog2_16i = 5;
+ constexpr int kCflLumaBufferStrideLog2_128i =
+ kCflLumaBufferStrideLog2_16i - 3;
+ constexpr int kRowIncr = 1 << kCflLumaBufferStrideLog2_128i;
+ auto* dst = static_cast<uint16_t*>(dest);
+ const __m128i alpha_sign = _mm_set1_epi16(alpha);
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+ auto* row = reinterpret_cast<const __m128i*>(luma);
+ const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
+ const __m128i dc_val = _mm_set1_epi16(dst[0]);
+ const __m128i min = _mm_setzero_si128();
+ const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1);
+
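+  // |stride| is given in bytes, but |dst| is uint16_t, so convert it to
+  // element units.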
+ stride >>= 1;
+
+ do {
+ __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
+ res = ClipEpi16(res, min, max);
+ if (width == 4) {
+ StoreLo8(dst, res);
+ } else if (width == 8) {
+ StoreUnaligned16(dst, res);
+ } else if (width == 16) {
+ StoreUnaligned16(dst, res);
+ const __m128i res_1 =
+ CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
+ } else {
+ StoreUnaligned16(dst, res);
+ const __m128i res_1 =
+ CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
+ const __m128i res_2 =
+ CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 16, ClipEpi16(res_2, min, max));
+ const __m128i res_3 =
+ CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 24, ClipEpi16(res_3, min, max));
+ }
+
+ dst += stride;
+ } while ((row += kRowIncr) < row_end);
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ __m128i zero = _mm_setzero_si128();
+ __m128i sum = zero;
+ __m128i samples;
+ int y = visible_height;
+
+ do {
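+    // Pack two 4-wide uint16_t rows into one vector: row y in the low half
+    // and row y + 1 in the high half.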
+ samples = LoadHi8(LoadLo8(src), src + src_stride);
+ src += src_stride << 1;
+ sum = _mm_add_epi16(sum, samples);
+ y -= 2;
+ } while (y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
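+    // Duplicate the last visible row into both 64-bit halves so that one add
+    // per iteration accounts for two fill rows.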
+ samples = _mm_unpackhi_epi64(samples, samples);
+ do {
+ sum = _mm_add_epi16(sum, samples);
+ y += 2;
+ } while (y < block_height);
+ }
+
+ sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  // The average must match the << 3 precision of the stored luma, so the
+  // << 3 folds into the right shift: (2 + block_height_log2) - 3, i.e.
+  // block_height_log2 - 1, where 2 is the log2 of width 4.
+ __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2 - 1);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ src = static_cast<const uint16_t*>(source);
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ samples = LoadLo8(src);
+ samples = _mm_slli_epi16(samples, 3);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ src += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 4, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+
+ if (block_height <= max_luma_height) {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum = zero;
+ __m128i samples;
+ int y = visible_height;
+
+ do {
+ samples = LoadUnaligned16(src);
+ src += src_stride;
+ sum = _mm_add_epi16(sum, samples);
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ do {
+ sum = _mm_add_epi16(sum, samples);
+ } while (++y < block_height);
+ }
+
+ sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  // The average must match the << 3 precision of the stored luma; the << 3
+  // exactly cancels the log2 of width 8 in the right shift, leaving
+  // block_height_log2.
+ __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2);
+ averages = _mm_shuffle_epi8(averages, dup16);
+
+ src = static_cast<const uint16_t*>(source);
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ samples = LoadUnaligned16(src);
+ samples = _mm_slli_epi16(samples, 3);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ src += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 8;
+
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, bool is_inside>
+void CflSubsampler444_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const int block_width = 1 << block_width_log2;
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const __m128i zero = _mm_setzero_si128();
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ __m128i sum = zero;
+ __m128i inner_sum_lo, inner_sum_hi;
+ __m128i samples[4];
+ int y = visible_height;
+
+ do {
+ samples[0] = LoadUnaligned16(src);
+ samples[1] = (max_luma_width >= 16) ? LoadUnaligned16(src + 8)
+ : LastRowResult(samples[0]);
+ __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
+ if (block_width == 32) {
+ samples[2] = (max_luma_width >= 24) ? LoadUnaligned16(src + 16)
+ : LastRowResult(samples[1]);
+ samples[3] = (max_luma_width == 32) ? LoadUnaligned16(src + 24)
+ : LastRowResult(samples[2]);
+
+ inner_sum = _mm_add_epi16(samples[2], inner_sum);
+ inner_sum = _mm_add_epi16(samples[3], inner_sum);
+ }
+ inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+ inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ src += src_stride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
+ if (block_width == 32) {
+ inner_sum = _mm_add_epi16(samples[2], inner_sum);
+ inner_sum = _mm_add_epi16(samples[3], inner_sum);
+ }
+ inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+ inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+ do {
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ } while (++y < block_height);
+ }
+
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  // The average must match the << 3 precision of the stored luma; the << 3 is
+  // folded into the right shift, giving a shift of
+  // block_width_log2 + block_height_log2 - 3.
+ __m128i averages =
+ RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2 - 3);
+ averages = _mm_shuffle_epi8(averages, dup16);
+
+ src = static_cast<const uint16_t*>(source);
+ __m128i samples_ext = zero;
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ if (max_luma_width > x) {
+ samples[idx] = LoadUnaligned16(&src[x]);
+ samples[idx] = _mm_slli_epi16(samples[idx], 3);
+ samples_ext = samples[idx];
+ } else {
+ samples[idx] = LastRowResult(samples_ext);
+ }
+ StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
+ }
+ src += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5,
+ "This function will only work for block_width 16 and 32.");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+
+ const int block_height = 1 << block_height_log2;
+ const int vert_inside = block_height <= max_luma_height;
+ if (vert_inside) {
+ CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ do {
+ const __m128i samples_row0 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row1 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);
+
+ const __m128i samples_row2 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row3 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
+ __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ const __m128i samples_row4 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row5 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);
+
+ const __m128i samples_row6 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row7 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y -= 4;
+ } while (y != 0);
+
+ const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
+ for (y = luma_height; y < block_height; ++y) {
+ StoreLo8(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ final_sum, block_height_log2 + 2 /*log2 of width 4*/);
+
+ averages = _mm_shufflelo_epi16(averages, 0);
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const __m128i samples = LoadLo8(luma_ptr);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ do {
+ const __m128i samples_row00 = LoadUnaligned16(src);
+ const __m128i samples_row01 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row00);
+ src += src_stride;
+ const __m128i samples_row10 = LoadUnaligned16(src);
+ const __m128i samples_row11 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row10);
+ src += src_stride;
+ const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
+ __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row20 = LoadUnaligned16(src);
+ const __m128i samples_row21 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row20);
+ src += src_stride;
+ const __m128i samples_row30 = LoadUnaligned16(src);
+ const __m128i samples_row31 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row30);
+ src += src_stride;
+ const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
+ const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row40 = LoadUnaligned16(src);
+ const __m128i samples_row41 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row40);
+ src += src_stride;
+ const __m128i samples_row50 = LoadUnaligned16(src);
+ const __m128i samples_row51 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row50);
+ src += src_stride;
+ const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
+ const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row60 = LoadUnaligned16(src);
+ const __m128i samples_row61 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row60);
+ src += src_stride;
+ const __m128i samples_row70 = LoadUnaligned16(src);
+ const __m128i samples_row71 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row70);
+ src += src_stride;
+ const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
+ const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y -= 4;
+ } while (y != 0);
+
+  // Duplicate the final row to fill the rows beyond max_luma_height.
+ const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
+ const __m128i final_fill_to_sum1 =
+ _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
+ for (y = luma_height; y < block_height; ++y) {
+ StoreUnaligned16(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_height_log2 + 3 /*log2 of width 8*/);
+
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const __m128i samples = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
+ if (max_luma_width == 8) {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
+ luma, max_luma_height, source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int block_height = 1 << block_height_log2;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int16_t* luma_ptr = luma[0];
+ __m128i final_row_result;
+ // Begin first y section, covering width up to 32.
+ int y = luma_height;
+
+ do {
+ const uint16_t* src_next = src + src_stride;
+ const __m128i samples_row00 = LoadUnaligned16(src);
+ const __m128i samples_row01 = (max_luma_width >= 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row00);
+ const __m128i samples_row02 = (max_luma_width >= 24)
+ ? LoadUnaligned16(src + 16)
+ : LastRowSamples(samples_row01);
+ const __m128i samples_row03 = (max_luma_width == 32)
+ ? LoadUnaligned16(src + 24)
+ : LastRowSamples(samples_row02);
+ const __m128i samples_row10 = LoadUnaligned16(src_next);
+ const __m128i samples_row11 = (max_luma_width >= 16)
+ ? LoadUnaligned16(src_next + 8)
+ : LastRowSamples(samples_row10);
+ const __m128i samples_row12 = (max_luma_width >= 24)
+ ? LoadUnaligned16(src_next + 16)
+ : LastRowSamples(samples_row11);
+ const __m128i samples_row13 = (max_luma_width == 32)
+ ? LoadUnaligned16(src_next + 24)
+ : LastRowSamples(samples_row12);
+ const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
+ const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
+ const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
+ __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+ final_row_result =
+ StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+ sum = _mm_add_epi16(sum, final_row_result);
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+
+    // Because max_luma_width is at most 32, 420 subsampling yields at most 16
+    // outputs per row, so for 32-wide blocks the values at x >= 16 are
+    // necessarily duplicates.
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+      // Columns 16..31 hold 16 copies of the last computed value. After
+      // widening to 32-bit only four lanes survive, so shift left by 2 (x4)
+      // so that the final horizontal reduction counts all 16 copies.
+ final_sum = _mm_add_epi32(
+ final_sum, _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), 2));
+ }
+ src += src_stride << 1;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ // Begin second y section.
+ y = luma_height;
+ if (y < block_height) {
+ const __m128i final_fill0 =
+ LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill1 =
+ LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+ __m128i wide_fill;
+ if (block_width_log2 == 5) {
+      // Columns 16..31 hold 16 copies of the last computed value. After
+      // widening to 32-bit only four lanes survive, so shift left by 2 (x4)
+      // so that the final horizontal reduction counts all 16 copies.
+ wide_fill =
+ _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
+ }
+ const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
+ const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
+ const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_inner_sum0, final_inner_sum1);
+
+ do {
+ StoreUnaligned16(luma_ptr, final_fill0);
+ StoreUnaligned16(luma_ptr + 8, final_fill1);
+ if (block_width_log2 == 5) {
+ final_sum = _mm_add_epi32(final_sum, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ } while (++y < block_height);
+ } // End second y section.
+
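+ // Reduce the four 32-bit lanes of |final_sum| to a single total in lane 0.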
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
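+ // Divide the total by the block area (1 << (block_width_log2 +
+ // block_height_log2)) with rounding, then broadcast the average to all
+ // eight 16-bit lanes.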
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_width_log2 + block_height_log2);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+
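+ // Subtract the average from every stored luma value so the buffer holds
+ // the zero-mean contribution used by the CfL predictor.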
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const __m128i samples0 = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
+ const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
+ final_row_result = _mm_sub_epi16(samples1, averages);
+ StoreUnaligned16(luma_ptr + 8, final_row_result);
+
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ StoreUnaligned16(luma_ptr + 16, wide_fill);
+ StoreUnaligned16(luma_ptr + 24, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
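+ // Dispatch to a fixed |max_luma_width| so the per-row width comparisons in
+ // the implementation resolve at compile time.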
+ switch (max_luma_width) {
+ case 8:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 16:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 24:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
+ luma, max_luma_height, source, stride);
+ return;
+ default:
+ assert(max_luma_width == 32);
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
+ luma, max_luma_height, source, stride);
+ return;
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_10bpp_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_10bpp_SSE4_1<4, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_10bpp_SSE4_1<4, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_10bpp_SSE4_1<8, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_10bpp_SSE4_1<8, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_10bpp_SSE4_1<8, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_10bpp_SSE4_1<8, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_10bpp_SSE4_1<16, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_10bpp_SSE4_1<16, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_10bpp_SSE4_1<16, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_10bpp_SSE4_1<16, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_10bpp_SSE4_1<32, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_10bpp_SSE4_1<32, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_10bpp_SSE4_1<32, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 5>;
+#endif
+
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<5, 5>;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredCflInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredCflInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the
+// defines below for specifics. These functions are not thread-safe.
+void IntraPredCflInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+//------------------------------------------------------------------------------
+// 10bpp
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the formula
+// val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
+// val = top[top_base_x] << 5. Because |top_base_x| evaluates to y + 1 for
+// output row y, only pixels from top[1] onward appear in the output. Hence
+// |top| is offset by 1.
+inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
+ const uint8_t* const top, const int width,
+ const int height) {
+ ptrdiff_t offset = 1;
+ if (height == 4) {
+ memcpy(dst, top + offset, width);
+ dst += stride;
+ memcpy(dst, top + offset + 1, width);
+ dst += stride;
+ memcpy(dst, top + offset + 2, width);
+ dst += stride;
+ memcpy(dst, top + offset + 3, width);
+ return;
+ }
+ int y = 0;
+ do {
+ memcpy(dst, top + offset, width);
+ dst += stride;
+ memcpy(dst, top + offset + 1, width);
+ dst += stride;
+ memcpy(dst, top + offset + 2, width);
+ dst += stride;
+ memcpy(dst, top + offset + 3, width);
+ dst += stride;
+ memcpy(dst, top + offset + 4, width);
+ dst += stride;
+ memcpy(dst, top + offset + 5, width);
+ dst += stride;
+ memcpy(dst, top + offset + 6, width);
+ dst += stride;
+ memcpy(dst, top + offset + 7, width);
+ dst += stride;
+
+ offset += 8;
+ y += 8;
+ } while (y < height);
+}
+
+inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
+ const uint8_t* const top, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+ const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+ const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+ const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
+ : _mm_set_epi64x(0, 0x0403030202010100);
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ // All rows from |min_corner_only_y| down will simply use memset.
+ // |max_base_x| is always greater than |height|, so clipping |xstep_units|
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ // Rows up to |min_corner_only_y| are computed below with per-pixel masking
+ // against |max_base_x|; the remaining rows are pure corner fill.
+ int y = 0;
+ int top_x = xstep;
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> scale_bits;
+
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+
+ // Load 8 values because we will select the sampled values based on
+ // |upsampled|.
+ const __m128i values = LoadLo8(top + top_base_x);
+ const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
+ prod = RightShiftWithRounding_U16(prod, rounding_bits);
+ // Replace pixels from invalid range with top-right corner.
+ prod = _mm_blendv_epi8(prod, final_top_val, past_max);
+ Store4(dst, _mm_packus_epi16(prod, prod));
+ }
+
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ memset(dst, top[max_base_x], /* width */ 4);
+ dst += stride;
+ }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const __m128i sampler =
+ upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const int scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
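+ // Each vector iteration consumes 8 output pixels, i.e. 8 base steps.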
+
+ // All rows from |min_corner_only_y| down will simply use memset.
+ // |max_base_x| is always greater than |height|, so clipping |xstep_units|
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
+ height);
+ // No need to check for exceeding |max_base_x| in the first loop.
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> scale_bits;
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+ for (; x < min_corner_only_x;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+ // reading out of bounds. If all indices are past max and we don't need to
+ // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+ // reset for the next |y|.
+ top_base_x &= ~_mm_cvtsi128_si32(past_max);
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ }
+ // Corner-only section of the row.
+ memset(dest + x, top_row[max_base_x], width - x);
+ }
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (xstep == 64) {
+ DirectionalZone1_Step64(dest, stride, top_row, width, height);
+ return;
+ }
+ if (width == 4) {
+ DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
+ return;
+ }
+ if (width >= 32) {
+ DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+ upsampled);
+ return;
+ }
+ const __m128i sampler =
+ upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const int scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+ // No need to check for exceeding |max_base_x| in the loops.
+ if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ dest += stride;
+ top_x += xstep;
+ } while (++y < height);
+ return;
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+ return;
+ }
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ for (; x < width - 8;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+ // reading out of bounds. If all indices are past max and we don't need to
+ // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+ // reset for the next |y|.
+ top_base_x &= ~_mm_cvtsi128_si32(past_max);
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ }
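+ // Handle the final 8-pixel batch of the row, which may extend past
+ // |max_base_x| and so always blends against the corner value.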
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadUnaligned16(top_row + top_base_x);
+ } else {
+ const __m128i top_vals = LoadLo8(top_row + top_base_x);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ dest += stride;
+ top_x += xstep;
+ } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
+ upsampled_top);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const left_column,
+ const int base_left_y, const int ystep) {
+ // For use in the non-upsampled case.
+ const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+
+ __m128i result_block[4];
+ for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
+ const int left_base_y = left_y >> scale_bits;
+ const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadLo8(left_column + left_base_y);
+ } else {
+ const __m128i top_vals = LoadLo8(left_column + left_base_y);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ result_block[x] = _mm_packus_epi16(vals, vals);
+ }
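+ // Zone 3 predicts along columns, so transpose the 4 column vectors into
+ // rows before storing.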
+ const __m128i result = Transpose4x4_U8(result_block);
+ // This is result_row0.
+ Store4(dest, result);
+ dest += stride;
+ const int result_row1 = _mm_extract_epi32(result, 1);
+ memcpy(dest, &result_row1, sizeof(result_row1));
+ dest += stride;
+ const int result_row2 = _mm_extract_epi32(result, 2);
+ memcpy(dest, &result_row2, sizeof(result_row2));
+ dest += stride;
+ const int result_row3 = _mm_extract_epi32(result, 3);
+ memcpy(dest, &result_row3, sizeof(result_row3));
+}
+
+template <bool upsampled, int height>
+inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const left_column,
+ const int base_left_y, const int ystep) {
+ // For use in the non-upsampled case.
+ const __m128i sampler =
+ _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+
+ __m128i result_block[8];
+ for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
+ const int left_base_y = left_y >> scale_bits;
+ const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadUnaligned16(left_column + left_base_y);
+ } else {
+ const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
+ }
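+ // As in the 4x4 case, the results were computed along columns; transpose
+ // to rows for storing.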
+ Transpose8x8_U16(result_block, result_block);
+ for (int y = 0; y < height; ++y) {
+ StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (9) angle > 180
+void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (width == 4 || height == 4) {
+ const ptrdiff_t stride4 = stride << 2;
+ if (upsampled) {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<true>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride4;
+ y += 4;
+ } while (y < height);
+ left_y += ystep << 2;
+ x += 4;
+ } while (x < width);
+ } else {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
+ ystep);
+ dst_x += stride4;
+ y += 4;
+ } while (y < height);
+ left_y += ystep << 2;
+ x += 4;
+ } while (x < width);
+ }
+ return;
+ }
+
+ const ptrdiff_t stride8 = stride << 3;
+ if (upsampled) {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_8xH<true, 8>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride8;
+ y += 8;
+ } while (y < height);
+ left_y += ystep << 3;
+ x += 8;
+ } while (x < width);
+ } else {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_8xH<false, 8>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride8;
+ y += 8;
+ } while (y < height);
+ left_y += ystep << 3;
+ x += 8;
+ } while (x < width);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Directional Zone 2 Functions
+// 7.11.2.4 (8)
+
+// DirectionalBlend* selectively overwrites the values written by
+// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
+// row.
+template <int y_selector>
+inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
+ const __m128i& dest_index_vect,
+ const __m128i& vals,
+ const __m128i& zone_bounds) {
+ const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
+ const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+ const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
+ const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+ Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
+ const __m128i& dest_index_vect,
+ const __m128i& vals,
+ const __m128i& zone_bounds,
+ const __m128i& bounds_selector) {
+ const __m128i max_dest_x_vect =
+ _mm_shuffle_epi8(zone_bounds, bounds_selector);
+ const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+ const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
+ const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+ StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+constexpr int kDirectionalWeightBits = 5;
+// |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
+// |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
+// shift) and shift. Shift is guaranteed to be between 0 and 32.
+inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
+ const __m128i& shifts,
+ const __m128i& sampler) {
+ const __m128i src_vals = LoadUnaligned16(source);
+ __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
+ uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
+ __m128i left_y) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shifts = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
+ const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
+ // |left_column| and |sampler| are both offset by 15 so the indices are
+ // always nonnegative.
+ const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
+ for (int y = 0; y < 4; dst += stride, ++y) {
+ __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+ offset_y = _mm_packs_epi16(offset_y, offset_y);
+
+ const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+ __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+ // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
+ // can work as shuffle indices. Some values may be out of bounds, but their
+ // pred results will be masked over by top prediction.
+ sampler = _mm_add_epi8(sampler, positive_offset);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ left_column + (y << upsample_shift), shifts, sampler);
+ Store4(dst, _mm_packus_epi16(vals, vals));
+ }
+}
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
+ uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
+ __m128i left_y) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shifts = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i index_increment = _mm_set1_epi8(1);
+ const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
+ for (int y = 0; y < 8; dst += stride, ++y) {
+ __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+ offset_y = _mm_packs_epi16(offset_y, offset_y);
+ const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+
+ // Offset the relative index because ystep is negative in Zone 2 and shuffle
+ // indices must be nonnegative.
+ __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+ sampler = _mm_add_epi8(sampler, denegation);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+
+ // The specification adds (y << 6) to left_y, which is subject to
+ // upsampling, but this puts sampler indices out of the 0-15 range. It is
+ // equivalent to offsetting the source address by (y << upsample_shift)
+ // instead.
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
+ sampler);
+ StoreLo8(dst, _mm_packus_epi16(vals, vals));
+ }
+}
+
+// |zone_bounds| is an epi16 of the relative x index at which base >= -(1 <<
+// upsampled_top), for each row. When there are 4 values, they can be duplicated
+// with a non-register shuffle mask.
+// |shifts| is one pair of weights that applies throughout a given row.
+template <bool upsampled_top>
+inline void DirectionalZone1Blend_4x4(
+ uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+ __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+ const __m128i& dest_index_x, int top_x, const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int scale_bits_x = 6 - upsample_shift;
+ top_x -= xstep;
+
+ int top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
+ DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
+ DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
+ DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
+ DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
+}
+
+template <bool upsampled_top, int height>
+inline void DirectionalZone1Blend_8xH(
+ uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+ __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+ const __m128i& dest_index_x, int top_x, const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int scale_bits_x = 6 - upsample_shift;
+
+ __m128i y_selector = _mm_set1_epi32(0x01000100);
+ const __m128i index_increment = _mm_set1_epi32(0x02020202);
+ for (int y = 0; y < height; ++y,
+ y_selector = _mm_add_epi8(y_selector, index_increment),
+ dest += stride) {
+ top_x -= xstep;
+ const int top_base_x = top_x >> scale_bits_x;
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
+ DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector);
+ }
+}
+
+template <bool shuffle_left_column, bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_8xH(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint8_t* LIBGAV1_RESTRICT const top_row,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep, const int x, const int left_offset,
+ const __m128i& xstep_for_shift, const __m128i& xstep_bounds_base,
+ const __m128i& left_y) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+ // Loop incrementers for moving by block (8x8). This function handles blocks
+ // with height 4 as well; those are computed in a single pass, in which case
+ // these variables go unused.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+ const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
+
+ // Cover 8x4 case.
+ const int min_height = (height == 4) ? 4 : 8;
+
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, the blending y-loop, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min(((x + 1) << 6) / xstep, height) & ~(min_height - 1);
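+ // The 8-wide strip is handled as two 4-wide Zone 1 strips for the rows
+ // that derive entirely from |top_row|.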
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ DirectionalZone1_4xH(dst_x + 4, stride,
+ top_row + ((x + 4) << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ if (max_top_only_y == height) return;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i dest_index_x =
+ _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+ // All rows from |min_left_only_y| down, for this set of columns, need only
+ // |left_column| to compute.
+ const int min_left_only_y =
+ Align(std::min(((x + 8) << 6) / xstep, height), 8);
+
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+ top_x -= xstep8) {
+ // Pick up from the last y-value, using either the fast shuffle-based
+ // method or the roughly 10% slower but bounds-safe Zone 3 method for left
+ // prediction.
+ if (shuffle_left_column) {
+ DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+ } else {
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep);
+ }
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_8xH<upsampled_top, 8>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride, left_column + ((left_offset + y) << upsample_left_shift),
+ base_left_y, -ystep);
+ }
+}
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for this function is to know how many blocks can be processed
+// with just pixels from |top_ptr|, then handle mixed blocks, then handle only
+// blocks that take from |left_ptr|. Additionally, a fast index-shuffle
+// approach is used for pred values from |left_column| in sections that permit
+// it.
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ // All columns from |min_top_only_x| to the right will only need |top_row|
+ // to compute. This assumes minimum |xstep| is 3.
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
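+ // For example (illustrative values): with height = 16 and xstep = 128,
+ // min_top_only_x = (16 * 128) >> 6 = 32, so every column from x = 32 on is
+ // predicted entirely from |top_row|.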
+
+ // Accumulate xstep across 8 rows.
+ const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+ const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+ // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
+ const __m128i scaled_one = _mm_set1_epi16(-64);
+ __m128i xstep_bounds_base =
+ (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+ : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+ const int ystep8 = ystep << 3;
+ const int left_base_increment8 = ystep8 >> 6;
+ const int ystep_remainder8 = ystep8 & 0x3F;
+ const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+ // offset. Following values need the full ystep as a relative offset.
+ const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+ const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ const __m128i dest_index_x =
+ _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+ __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+ left_y = _mm_add_epi16(ystep_init, left_y);
+
+ // Analysis finds that, for most angles (ystep < 132), all segments that use
+ // both top_row and left_column can compute from left_column using byte
+ // shuffles from a single vector. For steeper angles, the shuffle is only
+ // reliable once x >= 32.
+ const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+ const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
+ const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
+ int x = 0;
+
+ for (int left_offset = -left_base_increment; x < min_shuffle_x;
+ x += 8,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+ // Watch left_y because it can still get big.
+ left_y = _mm_add_epi16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<false, upsampled_left, upsampled_top>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_for_shift, xstep_bounds_base, left_y);
+ }
+ for (int left_offset = -left_base_increment; x < min_top_only_x;
+ x += 8,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+ // Watch left_y because it can still get big.
+ left_y = _mm_add_epi16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<true, upsampled_left, upsampled_top>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_for_shift, xstep_bounds_base, left_y);
+ }
+ for (; x < width; x += 4) {
+ DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+ height, -xstep, upsampled_top);
+ }
+}
+
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const ptrdiff_t stride4 = stride << 2;
+ const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute.
+ assert(xstep >= 3);
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+ const int xstep4 = xstep << 2;
+ const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
+ const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+ const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
+ __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+ const __m128i scaled_one = _mm_set1_epi16(-64);
+ // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
+ __m128i xstep_bounds_base =
+ (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+ : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+ const int ystep4 = ystep << 2;
+ const int left_base_increment4 = ystep4 >> 6;
+ // This is guaranteed to be less than 64, but accumulation may bring it past
+ // 64 for higher x values.
+ const int ystep_remainder4 = ystep4 & 0x3F;
+ const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
+ const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which will go into the left_column offset.
+ // Following values need the full ystep as a relative offset.
+ const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+ const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+ left_y = _mm_add_epi16(ystep_init, left_y);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+
+ int x = 0;
+ // Loop over x for columns with a mixture of sources.
+ for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
+ left_y = _mm_add_epi16(left_y, increment_left4),
+ left_offset -= left_base_increment4) {
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 4.
+ const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3;
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+ // All rows from |min_left_only_y| down, for this set of columns, need only
+ // |left_column| to compute; the y-loop below effectively rounds it up to the
+ // nearest multiple of 4.
+ const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
+
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ // Loop over y for mixed rows.
+ for (; y < min_left_only_y;
+ y += 4, dst_x += stride4,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
+ top_x -= xstep4) {
+ DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) * (1 << upsample_left_shift)),
+ left_y);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_4x4<upsampled_top>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left-only rows, if any.
+ for (; y < height; y += 4, dst_x += stride4) {
+ DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+ }
+ }
+ // Loop over top-only columns, if any.
+ for (; x < width; x += 4) {
+ DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+ height, -xstep, upsampled_top);
+ }
+}
+
+void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep,
+ const bool upsampled_top,
+ const bool upsampled_left) {
+ // Increasing the negative buffer for this function allows more rows to be
+ // processed at a time without branching in an inner loop to check the base.
+ uint8_t top_buffer[288];
+ uint8_t left_buffer[288];
+ memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+ memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
+#if LIBGAV1_MSAN
+ memset(top_buffer, 0x33, 128);
+ memset(left_buffer, 0x44, 128);
+#endif
+ const uint8_t* top_ptr = top_buffer + 144;
+ const uint8_t* left_ptr = left_buffer + 144;
+ if (width == 4 || height == 4) {
+ if (upsampled_left) {
+ if (upsampled_top) {
+ DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ } else {
+ if (upsampled_top) {
+ DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ }
+ return;
+ }
+ if (upsampled_left) {
+ if (upsampled_top) {
+ DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ } else {
+ if (upsampled_top) {
+ DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the formula
+// val = top[top_base_x]*shift + top[top_base_x+1]*(32-shift) reduces to
+// val = top[top_base_x+1] << 5, meaning only the second set of pixels is
+// involved in the output. Hence |top| is offset by 1.
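+// In effect, row 0 is a copy of top[1..width], row 1 of top[2..width+1], and
+// so on: each row is a plain memcpy shifted one pixel further into |top|.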
+inline void DirectionalZone1_Step64(uint16_t* dst, ptrdiff_t stride,
+ const uint16_t* const top, const int width,
+ const int height) {
+ ptrdiff_t offset = 1;
+ if (height == 4) {
+ memcpy(dst, top + offset, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
+ return;
+ }
+ int y = height;
+ do {
+ memcpy(dst, top + offset, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 4, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 5, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 6, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 7, width * sizeof(dst[0]));
+ dst += stride;
+
+ offset += 8;
+ y -= 8;
+ } while (y != 0);
+}
+
+// Produce a weighted average whose weights sum to 32.
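+// In scalar terms (non-upsampled case, restating the code below):
+//   result[i] = (top[i] * (32 - shift) + top[i + 1] * shift + 16) >> 5
+// with lanes whose index passes |border_index| replaced by |final_top_val|.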
+inline __m128i CombineTopVals4(const __m128i& top_vals, const __m128i& sampler,
+ const __m128i& shifts,
+ const __m128i& top_indices,
+ const __m128i& final_top_val,
+ const __m128i& border_index) {
+ const __m128i sampled_values = _mm_shuffle_epi8(top_vals, sampler);
+ __m128i prod = _mm_mullo_epi16(sampled_values, shifts);
+ prod = _mm_hadd_epi16(prod, prod);
+ const __m128i result = RightShiftWithRounding_U16(prod, 5 /*log2(32)*/);
+
+ const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
+ // Replace pixels from invalid range with top-right corner.
+ return _mm_blendv_epi8(result, final_top_val, past_max);
+}
+
+// When width is 4, only one load operation is needed per iteration. We also
+// skip the extra loop precomputations, whose overhead outweighs their benefit
+// at this size.
+inline void DirectionalZone1_4xH(uint16_t* dst, ptrdiff_t stride,
+ const uint16_t* const top, const int height,
+ const int xstep, const bool upsampled,
+ const __m128i& sampler) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+ const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" because
+ // only cmpgt is available.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
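+ // For example, lane i holds i + 1, so cmpgt(top_base_x + i + 1, max_base_x)
+ // is equivalent to the unavailable cmpge(top_base_x + i, max_base_x).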
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
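+ // Rationale: row y reads from top_base_x = ((y + 1) * xstep) >>
+ // index_scale_bits, which first reaches |max_base_x| at roughly
+ // y = max_base_x / xstep_units.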
+
+ int y = 0;
+ int top_x = xstep;
+ const __m128i max_shift = _mm_set1_epi16(32);
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> index_scale_bits;
+
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ // Load 8 values because we will select the sampled values based on
+ // |upsampled|.
+ const __m128i values = LoadUnaligned16(top + top_base_x);
+ const __m128i pred =
+ CombineTopVals4(values, sampler, shifts, top_index_vect, final_top_val,
+ max_base_x_vect);
+ StoreLo8(dst, pred);
+ }
+
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_x], /* width */ 4);
+ dst += stride;
+ }
+}
+
+// General purpose combine function.
+// |check_border| means the final source value has to be duplicated into the
+// result. This simplifies the loop structures that use precomputed boundaries
+// to identify sections where it is safe to compute without checking for the
+// right border.
+template <bool check_border>
+inline __m128i CombineTopVals(
+ const __m128i& top_vals_0, const __m128i& top_vals_1,
+ const __m128i& sampler, const __m128i& shifts,
+ const __m128i& top_indices = _mm_setzero_si128(),
+ const __m128i& final_top_val = _mm_setzero_si128(),
+ const __m128i& border_index = _mm_setzero_si128()) {
+ constexpr int scale_int_bits = 5;
+ const __m128i sampled_values_0 = _mm_shuffle_epi8(top_vals_0, sampler);
+ const __m128i sampled_values_1 = _mm_shuffle_epi8(top_vals_1, sampler);
+ const __m128i prod_0 = _mm_mullo_epi16(sampled_values_0, shifts);
+ const __m128i prod_1 = _mm_mullo_epi16(sampled_values_1, shifts);
+ const __m128i combined = _mm_hadd_epi16(prod_0, prod_1);
+ const __m128i result = RightShiftWithRounding_U16(combined, scale_int_bits);
+ if (check_border) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
+ // Replace pixels from invalid range with top-right corner.
+ return _mm_blendv_epi8(result, final_top_val, past_max);
+ }
+ return result;
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint16_t* dest, ptrdiff_t stride,
+ const uint16_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled,
+ const __m128i& sampler) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi16(32);
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ LeftShift((max_base_x - (base_step * width)), index_scale_bits) / xstep,
+ height);
+ // No need to check for exceeding |max_base_x| in the first loop.
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+ const __m128i top_vals_1 =
+ LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+
+ const __m128i pred =
+ CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
+
+ StoreUnaligned16(dest + x, pred);
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to |top_base_x|, it is used to mask values
+ // that pass the end of the |top| buffer. Starting from 1 to simulate "cmpge"
+ // which is not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+ for (; x < min_corner_only_x;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+ const __m128i top_vals_1 =
+ LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+ const __m128i pred =
+ CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
+ top_index_vect, final_top_val, max_base_x_vect);
+ StoreUnaligned16(dest + x, pred);
+ }
+ // Corner-only section of the row.
+ Memset(dest + x, top_row[max_base_x], width - x);
+ }
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ Memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalIntraPredictorZone1_SSE4_1(
+ void* dest_ptr, ptrdiff_t stride, const void* const top_ptr,
+ const int width, const int height, const int xstep, const bool upsampled) {
+ const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+ auto* dest = static_cast<uint16_t*>(dest_ptr);
+ stride /= sizeof(uint16_t);
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (xstep == 64) {
+ DirectionalZone1_Step64(dest, stride, top_row, width, height);
+ return;
+ }
+ // Each base pixel paired with its following pixel, for hadd purposes.
+ const __m128i adjacency_shuffler = _mm_set_epi16(
+ 0x0908, 0x0706, 0x0706, 0x0504, 0x0504, 0x0302, 0x0302, 0x0100);
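+ // For example, given 16-bit pixels [t0 t1 t2 t3 ...], this shuffle yields
+ // [t0 t1 t1 t2 t2 t3 t3 t4], so the mullo/hadd in CombineTopVals computes
+ // t[i] * (32 - shift) + t[i + 1] * shift per output.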
+ // This is equivalent to not shuffling at all.
+ const __m128i identity_shuffler = _mm_set_epi16(
+ 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
+ // This represents a trade-off between code size and speed. When |upsampled|
+ // is true, no shuffle is necessary, but skipping it without in-loop
+ // branching would require 2 copies of the main function body.
+ const __m128i sampler = upsampled ? identity_shuffler : adjacency_shuffler;
+ if (width == 4) {
+ DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled,
+ sampler);
+ return;
+ }
+ if (width >= 32) {
+ DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+ upsampled, sampler);
+ return;
+ }
+ const int index_scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi16(32);
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+ // No need to check for exceeding |max_base_x| in the loops.
+ if (((xstep * height) >> index_scale_bits) + base_step * width < max_base_x) {
+ int top_x = xstep;
+ int y = height;
+ do {
+ int top_base_x = top_x >> index_scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+ const __m128i top_vals_1 =
+ LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+ const __m128i pred =
+ CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
+ StoreUnaligned16(dest + x, pred);
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ dest += stride;
+ top_x += xstep;
+ } while (--y != 0);
+ return;
+ }
+
+ // General case. Blocks with width less than 32 do not benefit from x-wise
+ // loop splitting, but do benefit from using Memset on appropriate rows.
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ for (int x = 0; x < width; x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+ const __m128i top_vals_1 =
+ LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+ const __m128i pred =
+ CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
+ top_index_vect, final_top_val, max_base_x_vect);
+ StoreUnaligned16(dest + x, pred);
+ }
+ }
+
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ Memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(DirectionalIntraPredictorZone1)
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredDirectionalInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*, see the defines below
+// for specifics. This function is not thread-safe.
+void IntraPredDirectionalInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_SSE4_1
+// Section 7.11.2.3. Recursive intra prediction process
+// This filter applies recursively to 4x2 sub-blocks within the transform block,
+// meaning that the predicted pixels in each sub-block are used as inputs to
+// sub-blocks below and to the right, if present.
+//
+// Each output value in the sub-block is predicted by a different filter applied
+// to the same array of top-left, top, and left values. If fn refers to the
+// output of the nth filter, given this block:
+// TL T0 T1 T2 T3
+// L0 f0 f1 f2 f3
+// L1 f4 f5 f6 f7
+// The filter input order is p0, p1, p2, p3, p4, p5, p6:
+// p0 p1 p2 p3 p4
+// p5 f0 f1 f2 f3
+// p6 f4 f5 f6 f7
+// Filters usually apply to 8 values for convenience, so in this case we fix
+// the 8th filter tap to 0 and disregard the value of the 8th input.
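+// Restated as scalar arithmetic (a paraphrase of the SIMD code below, with
+// tap 7 fixed at 0):
+//   f[n] = Clip3(RightShiftWithRounding(sum_{k=0..6} p[k] * taps[n][k], 4),
+//                0, 255)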
+
+// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
+// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
+constexpr int kDuplicateFirstHalf = 0x44;
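+// For example, _mm_shuffle_epi32(v, 0x44) maps [d0 d1 d2 d3] to
+// [d0 d1 d0 d1], since 0x44 encodes the dword selectors 0, 1, 0, 1 from low
+// to high.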
+
+// Apply all filter taps to the given 7 packed 8-bit values, keeping the 8th
+// at zero to preserve the sum.
+// |pixels| contains p0-p7 in order as shown above.
+// |taps_0_1| contains the filter kernels used to predict f0 and f1, and so on.
+inline void Filter4x2_SSE4_1(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride, const __m128i& pixels,
+ const __m128i& taps_0_1, const __m128i& taps_2_3,
+ const __m128i& taps_4_5, const __m128i& taps_6_7) {
+ const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
+ const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
+ // |output_half| contains 8 partial sums for f0-f7.
+ __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
+ __m128i output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row0 =
+ _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+ /* unused half */ output);
+ Store4(dst, output_row0);
+ const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
+ const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
+ output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
+ output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row1 =
+ _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+ /* arbitrary pack arg */ output);
+ Store4(dst + stride, output_row1);
+}
+
+// 4xH transform sizes are given special treatment because LoadLo8 goes out
+// of bounds and every block involves the left column. The top-left pixel, p0,
+// is stored in the top buffer for the first 4x2, but comes from the left buffer
+// for successive blocks. This implementation takes advantage of the fact
+// that the p5 and p6 for each sub-block come solely from the |left_ptr| buffer,
+// using shifts to arrange things to fit reusable shuffle vectors.
+inline void Filter4xH(uint8_t* LIBGAV1_RESTRICT dest, ptrdiff_t stride,
+ const uint8_t* LIBGAV1_RESTRICT const top_ptr,
+ const uint8_t* LIBGAV1_RESTRICT const left_ptr,
+ FilterIntraPredictor pred, const int height) {
+ // Two filter kernels per vector.
+ const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
+ const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
+ const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
+ const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
+ __m128i top = Load4(top_ptr - 1);
+ __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
+ __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
+ left = _mm_slli_si128(left, 5);
+
+ // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
+ // left[2], left[3], left[4], left[5], left[6], left[7]
+ // Let rn represent a pixel usable as pn for the 4x2 after this one. We get:
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p0 p1 p2 p3 p4 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+
+ // Two sets of the same input pixels to apply two filters at once.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 1.
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
+ // left[0], left[1], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+
+ // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
+ // byte is an unused value, which shall be multiplied by 0 when we apply the
+ // filter.
+ constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
+
+ // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 2.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 3.
+
+ // Compute the middle 8 rows before using common code for the final 4 rows, in
+ // order to fit the assumption that |left| has the next TL at position 8.
+ if (height == 16) {
+ // This shift allows us to use pixel_order2 twice after shifting by 2 later.
+ left = _mm_slli_si128(left, 1);
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
+ // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+
+ // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The
+ // last byte is an unused value, as above. The top-left was shifted to
+ // position nine to keep two empty spaces after the top pixels.
+ constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
+
+ // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
+ // the end.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 4.
+
+ // First 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // Clear all but final pixel in the first 8 of left column.
+ __m128i keep_top_left = _mm_srli_si128(left, 13);
+ dest += stride; // Move to y = 5.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
+ // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ left = LoadLo8(left_ptr + 8);
+
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 6.
+
+ // Second 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // Position TL value so we can use pixel_order1.
+ keep_top_left = _mm_slli_si128(keep_top_left, 6);
+ dest += stride; // Move to y = 7.
+ pixels = Load4(dest);
+ left = _mm_slli_si128(left, 7);
+ left = _mm_or_si128(left, keep_top_left);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 8.
+
+ // Third 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 9.
+
+ // Prepare final inputs.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 10.
+
+ // Fourth 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 11.
+ }
+
+ // In both the 8 and 16 case at this point, we can assume that |left| has the
+ // next TL at position 8.
+ if (height > 4) {
+ // Erase prior left pixels by shifting TL to position 0.
+ left = _mm_srli_si128(left, 8);
+ left = _mm_slli_si128(left, 6);
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 12 or 4.
+
+ // First of final two 4x2 blocks.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 13 or 5.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 14 or 6.
+
+ // Last of final two 4x2 blocks.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ }
+}
+
+void FilterIntraPredictor_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column,
+ FilterIntraPredictor pred, const int width,
+ const int height) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (width == 4) {
+ Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
+ return;
+ }
+
+ // There is one set of 7 taps for each of the 4x2 output pixels.
+ const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
+ const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
+ const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
+ const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
+
+ // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
+ // the end is an unused value, which shall be multiplied by 0 when we apply
+ // the filter.
+ constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
+
+ // Takes the "left section" and puts it right after p0-p4.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
+
+ // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
+ // byte is unused as above.
+ constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
+
+ // Shuffles the "top left" from the left section, to the front. Used when
+ // grabbing data from left_column and not top_row.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
+
+ // This first pass takes care of the cases where the top left pixel comes from
+ // top_row.
+ __m128i pixels = LoadLo8(top_ptr - 1);
+ __m128i left = _mm_slli_si128(Load4(left_column), 8);
+ pixels = _mm_or_si128(pixels, left);
+
+ // Two sets of the same pixels to multiply with two sets of taps.
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7);
+ left = _mm_srli_si128(left, 1);
+
+ // Load the output of row y = 1, which is the top row for the 4x2 block at
+ // y = 2.
+ pixels = Load4(dst + stride);
+
+ // Because of the above shift, this OR 'invades' the final byte of the first
+ // 8 bytes of |pixels|. This is acceptable because the 8th filter tap is
+ // always a padded 0.
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ const ptrdiff_t stride2 = stride << 1;
+ const ptrdiff_t stride4 = stride << 2;
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dst += 4;
+ for (int x = 3; x < width - 4; x += 4) {
+ pixels = Load4(top_ptr + x);
+ pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
+ pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ pixels = Load4(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+ dst += 4;
+ }
+
+ // Now we handle heights that reference previous blocks rather than top_row.
+ for (int y = 4; y < height; y += 4) {
+ // Leftmost 4x4 block for this height.
+ dst -= width;
+ dst += stride4;
+
+ // Top Left is not available by offset in these leftmost blocks.
+ pixels = Load4(dst - stride);
+ left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
+ left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
+ left = _mm_srli_si128(left, 2);
+ pixels = Load4(dst + stride);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+
+ dst += 4;
+
+ // Remaining 4x4 blocks for this height.
+ for (int x = 4; x < width; x += 4) {
+ pixels = Load4(dst - stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ pixels = Load4(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+ dst += 4;
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+// These guards check if this version of the function was not superseded by
+// a higher optimization level, such as AVX. The corresponding #define also
+// prevents the C version from being added to the table.
+#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
+ dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void IntraPredFilterInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredFilterInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor, see the defines below for
+// specifics. This function is not thread-safe.
+void IntraPredFilterInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_smooth.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Note these constants are duplicated from intrapred.cc to allow the compiler
+// to have visibility of the values. This helps reduce loads and aids in the
+// creation of the inverse weights.
+constexpr uint8_t kSmoothWeights[] = {
+#include "src/dsp/smooth_weights.inc"
+};
+
+template <int y_mask>
+inline void WriteSmoothHorizontalSum4(void* LIBGAV1_RESTRICT const dest,
+ const __m128i& left,
+ const __m128i& weights,
+ const __m128i& scaled_top_right,
+ const __m128i& round) {
+ const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
+ const __m128i weighted_left_y = _mm_mullo_epi16(left_y, weights);
+ const __m128i pred_sum = _mm_add_epi32(scaled_top_right, weighted_left_y);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
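+ // 0x0C080400 gathers bytes 0, 4, 8 and 12, i.e. the low byte of each 32-bit
+ // lane, so the four 8-bit results land in the low 4 bytes for Store4.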
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
+
+// For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
+// |pixels| is a segment of the top row or the whole top row, and |weights| is
+// repeated.
+inline __m128i SmoothDirectionalSum8(const __m128i& pixels,
+ const __m128i& weights,
+ const __m128i& scaled_corner) {
+ const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
+ return _mm_add_epi16(scaled_corner, weighted_px);
+}
+
+inline void WriteSmoothDirectionalSum8(uint8_t* LIBGAV1_RESTRICT dest,
+ const __m128i& pixels,
+ const __m128i& weights,
+ const __m128i& scaled_corner,
+ const __m128i& round) {
+ const __m128i pred_sum =
+ SmoothDirectionalSum8(pixels, weights, scaled_corner);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, round), 8);
+ StoreLo8(dest, _mm_packus_epi16(pred, pred));
+}
+
+// For Horizontal, pixels1 and pixels2 are the same repeated value. For
+// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
+// scaled_corner2 are the same.
+inline void WriteSmoothDirectionalSum16(
+ uint8_t* LIBGAV1_RESTRICT dest, const __m128i& pixels1,
+ const __m128i& pixels2, const __m128i& weights1, const __m128i& weights2,
+ const __m128i& scaled_corner1, const __m128i& scaled_corner2,
+ const __m128i& round) {
+ const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
+ const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
+ const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
+ const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
+ const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
+ StoreUnaligned16(dest, _mm_packus_epi16(pred1, pred2));
+}
+
+template <int y_mask>
+inline void WriteSmoothPredSum4(uint8_t* LIBGAV1_RESTRICT const dest,
+ const __m128i& top, const __m128i& left,
+ const __m128i& weights_x,
+ const __m128i& weights_y,
+ const __m128i& scaled_bottom_left,
+ const __m128i& scaled_top_right,
+ const __m128i& round) {
+ const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
+ const __m128i weighted_left_y = _mm_mullo_epi32(left_y, weights_x);
+ const __m128i weight_y = _mm_shuffle_epi32(weights_y, y_mask);
+ const __m128i weighted_top = _mm_mullo_epi32(weight_y, top);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi32(scaled_bottom_left, y_mask);
+ const __m128i col_pred = _mm_add_epi32(scaled_bottom_left_y, weighted_left_y);
+ const __m128i row_pred = _mm_add_epi32(scaled_top_right, weighted_top);
+ const __m128i pred_sum = _mm_add_epi32(row_pred, col_pred);
+
+ // Equivalent to RightShiftWithRounding(pred[x][y], 9).
+ const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 9);
+
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
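+
+// For the 4x4 caller below, the arithmetic above works out to (per pixel,
+// with 8-bit weights w and scale 256):
+//   pred[y][x] = (w[y] * top[x] + (256 - w[y]) * bottom_left +
+//                 w[x] * left[y] + (256 - w[x]) * top_right + 256) >> 9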
+
+// pixels[0]: above and below_pred interleave vector
+// pixels[1]: left vector
+// pixels[2]: right_pred vector
+inline void LoadSmoothPixels4(const uint8_t* LIBGAV1_RESTRICT above,
+ const uint8_t* LIBGAV1_RESTRICT left,
+ const int height, __m128i* pixels) {
+ if (height == 4) {
+ pixels[1] = Load4(left);
+ } else if (height == 8) {
+ pixels[1] = LoadLo8(left);
+ } else {
+ pixels[1] = LoadUnaligned16(left);
+ }
+
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ const __m128i top = _mm_cvtepu8_epi16(Load4(above));
+ pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
+ pixels[2] = _mm_set1_epi16(above[3]);
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
+inline void LoadSmoothWeights4(const uint8_t* LIBGAV1_RESTRICT weight_array,
+ const int height, __m128i* weight_h,
+ __m128i* weight_w) {
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i x_weights = Load4(weight_array);
+ weight_h[0] = _mm_cvtepu8_epi16(x_weights);
+ weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+
+ if (height == 8) {
+ const __m128i y_weights = LoadLo8(weight_array + 4);
+ weight_h[0] = _mm_cvtepu8_epi16(y_weights);
+ weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+ } else if (height == 16) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i y_weights = LoadUnaligned16(weight_array + 12);
+ weight_h[0] = _mm_cvtepu8_epi16(y_weights);
+ weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(y_weights, zero);
+ weight_h[3] = _mm_sub_epi16(scale, weight_h[2]);
+ }
+}
+
+inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y,
+ const __m128i* weight_x,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride,
+ const bool use_second_half) {
+ const __m128i round = _mm_set1_epi32(256);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixel[1], zero)
+ : _mm_unpacklo_epi8(pixel[1], zero);
+ __m128i y_select = _mm_set1_epi16(0x0100);
+
+ for (int i = 0; i < 8; ++i) {
+ const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
+ const __m128i interleaved_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ __m128i vertical_pred = _mm_madd_epi16(pixel[0], interleaved_weights);
+
+ __m128i horizontal_vect = _mm_shuffle_epi8(left, y_select);
+ horizontal_vect = _mm_unpacklo_epi16(horizontal_vect, pixel[2]);
+ __m128i sum = _mm_madd_epi16(horizontal_vect, weight_x[0]);
+
+ sum = _mm_add_epi32(vertical_pred, sum);
+ sum = _mm_add_epi32(sum, round);
+ sum = _mm_srai_epi32(sum, 9);
+
+ sum = _mm_shuffle_epi8(sum, cvtepi32_epi8);
+ Store4(dst, sum);
+ dst += stride;
+
+ y_select = _mm_add_epi16(y_select, mask_increment);
+ }
+}
+
+// The interleaving approach has some overhead that causes it to underperform in
+// the 4x4 case.
+void Smooth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+ const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ const __m128i scale = _mm_set1_epi32(256);
+ // The fourth 32-bit lane is top_row[3].
+ const __m128i top_right = _mm_shuffle_epi32(top, 0xFF);
+ // The fourth 32-bit lane is left_column[3].
+ const __m128i bottom_left = _mm_shuffle_epi32(left, 0xFF);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ auto* dst = static_cast<uint8_t*>(dest);
+ // AV1 spec 7.11.2.6 (3) describes the sum:
+ //   smoothPred[y][x:x+3] = weighted_top + scaled_right + weighted_left[y] +
+ //                          scaled_bottom[y]
+ // This could be a loop, but the shuffle immediates must be compile-time
+ // constants.
+ WriteSmoothPredSum4<0>(dst, top, left, weights, weights, scaled_bottom_left,
+ scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothPredSum4<0x55>(dst, top, left, weights, weights,
+ scaled_bottom_left, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothPredSum4<0xAA>(dst, top, left, weights, weights,
+ scaled_bottom_left, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothPredSum4<0xFF>(dst, top, left, weights, weights,
+ scaled_bottom_left, scaled_top_right, scale);
+}
+
+void Smooth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i weights_x[1];
+ __m128i weights_y[2];
+ LoadSmoothWeights4(kSmoothWeights, 8, weights_y, weights_x);
+ __m128i pixels[3];
+ LoadSmoothPixels4(top_ptr, left_ptr, 8, pixels);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
+}
+
+void Smooth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i weights_x[1];
+ __m128i weights_y[4];
+ LoadSmoothWeights4(kSmoothWeights, 16, weights_y, weights_x);
+ __m128i pixels[3];
+ LoadSmoothPixels4(top_ptr, left_ptr, 16, pixels);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
+ dst += stride << 3;
+ WriteSmoothPred4x8(pixels, &weights_y[2], weights_x, dst, stride, true);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+// pixels[2]: left vector
+// pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half
+// pixels[5]: above and below_pred interleave vector, second half
+// pixels[6]: left vector + 16
+// pixels[7]: right_pred vector
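+// Entries [4] and [5] simply duplicate [0] and [1] so that the height == 32
+// case can address its second half uniformly as &pixels[4] (see Smooth8x32).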
+inline void LoadSmoothPixels8(const uint8_t* LIBGAV1_RESTRICT above,
+ const uint8_t* LIBGAV1_RESTRICT left,
+ const int height, __m128i* pixels) {
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ __m128i top_row = _mm_cvtepu8_epi16(LoadLo8(above));
+ pixels[0] = _mm_unpacklo_epi16(top_row, bottom_left);
+ pixels[1] = _mm_unpackhi_epi16(top_row, bottom_left);
+
+ pixels[3] = _mm_set1_epi16(above[7]);
+
+ if (height == 4) {
+ pixels[2] = Load4(left);
+ } else if (height == 8) {
+ pixels[2] = LoadLo8(left);
+ } else if (height == 16) {
+ pixels[2] = LoadUnaligned16(left);
+ } else {
+ pixels[2] = LoadUnaligned16(left);
+ pixels[4] = pixels[0];
+ pixels[5] = pixels[1];
+ pixels[6] = LoadUnaligned16(left + 16);
+ pixels[7] = pixels[3];
+ }
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+inline void LoadSmoothWeights8(const uint8_t* LIBGAV1_RESTRICT weight_array,
+ const int height, __m128i* weight_w,
+ __m128i* weight_h) {
+ const int offset = (height < 8) ? 0 : 4;
+ __m128i loaded_weights = LoadUnaligned16(&weight_array[offset]);
+ weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
+ const __m128i inverter = _mm_set1_epi16(256);
+ weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+
+ if (height == 4) {
+ loaded_weights = _mm_srli_si128(loaded_weights, 4);
+ __m128i weights_x = _mm_cvtepu8_epi16(loaded_weights);
+ __m128i inverted_weights_x = _mm_sub_epi16(inverter, weights_x);
+ weight_w[0] = _mm_unpacklo_epi16(weights_x, inverted_weights_x);
+ weight_w[1] = _mm_unpackhi_epi16(weights_x, inverted_weights_x);
+ } else {
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+ weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
+ }
+
+ if (height == 16) {
+ const __m128i zero = _mm_setzero_si128();
+ loaded_weights = LoadUnaligned16(weight_array + 12);
+ weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
+ weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(loaded_weights, zero);
+ weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
+ } else if (height == 32) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i weight_lo = LoadUnaligned16(weight_array + 28);
+ weight_h[0] = _mm_cvtepu8_epi16(weight_lo);
+ weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+ weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
+ const __m128i weight_hi = LoadUnaligned16(weight_array + 44);
+ weight_h[4] = _mm_cvtepu8_epi16(weight_hi);
+ weight_h[5] = _mm_sub_epi16(inverter, weight_h[4]);
+ weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+ weight_h[7] = _mm_sub_epi16(inverter, weight_h[6]);
+ }
+}
+
+inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x,
+ const __m128i* weights_y, const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride,
+ const bool use_second_half) {
+ const __m128i round = _mm_set1_epi32(256);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
+  const __m128i cvt_epu16_epi8 = _mm_set_epi32(0, 0, 0x0E0C0A08, 0x06040200);
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixels[2], zero)
+ : _mm_unpacklo_epi8(pixels[2], zero);
+  __m128i y_select = _mm_set1_epi16(0x0100);
+
+ for (int i = 0; i < height; ++i) {
+ const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
+ const __m128i interleaved_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ const __m128i vertical_sum0 =
+ _mm_madd_epi16(pixels[0], interleaved_weights);
+ const __m128i vertical_sum1 =
+ _mm_madd_epi16(pixels[1], interleaved_weights);
+
+ __m128i horizontal_pixels = _mm_shuffle_epi8(left, y_select);
+ horizontal_pixels = _mm_unpacklo_epi16(horizontal_pixels, pixels[3]);
+ const __m128i horizontal_sum0 =
+ _mm_madd_epi16(horizontal_pixels, weights_x[0]);
+ const __m128i horizontal_sum1 =
+ _mm_madd_epi16(horizontal_pixels, weights_x[1]);
+
+ __m128i sum0 = _mm_add_epi32(vertical_sum0, horizontal_sum0);
+ sum0 = _mm_add_epi32(sum0, round);
+ sum0 = _mm_srai_epi32(sum0, 9);
+
+ __m128i sum1 = _mm_add_epi32(vertical_sum1, horizontal_sum1);
+ sum1 = _mm_add_epi32(sum1, round);
+ sum1 = _mm_srai_epi32(sum1, 9);
+
+ sum0 = _mm_packus_epi16(sum0, sum1);
+ sum0 = _mm_shuffle_epi8(sum0, cvt_epu16_epi8);
+ StoreLo8(dst, sum0);
+ dst += stride;
+
+ y_select = _mm_add_epi16(y_select, mask_increment);
+ }
+}
+
+void Smooth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i pixels[4];
+ LoadSmoothPixels8(top_ptr, left_ptr, 4, pixels);
+
+ __m128i weights_x[2], weights_y[2];
+ LoadSmoothWeights8(kSmoothWeights, 4, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 4, dst, stride, false);
+}
+
+void Smooth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+
+ __m128i pixels[4];
+ LoadSmoothPixels8(top_ptr, left_ptr, 8, pixels);
+
+ __m128i weights_x[2], weights_y[2];
+ LoadSmoothWeights8(kSmoothWeights, 8, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+}
+
+void Smooth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i pixels[4];
+ LoadSmoothPixels8(top_ptr, left_ptr, 16, pixels);
+
+ __m128i weights_x[2], weights_y[4];
+ LoadSmoothWeights8(kSmoothWeights, 16, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+ dst += stride << 3;
+ WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
+}
+
+void Smooth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i pixels[8];
+ LoadSmoothPixels8(top_ptr, left_ptr, 32, pixels);
+
+ __m128i weights_x[2], weights_y[8];
+ LoadSmoothWeights8(kSmoothWeights, 32, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+ dst += stride << 3;
+ WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
+ dst += stride << 3;
+ WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[4], 8, dst, stride,
+ false);
+ dst += stride << 3;
+ WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[6], 8, dst, stride,
+ true);
+}
+
+template <int width, int height>
+void SmoothWxH(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const sm_weights_h = kSmoothWeights + height - 4;
+ const uint8_t* const sm_weights_w = kSmoothWeights + width - 4;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale_value = _mm_set1_epi16(256);
+ const __m128i bottom_left = _mm_cvtsi32_si128(left_ptr[height - 1]);
+ const __m128i top_right = _mm_set1_epi16(top_ptr[width - 1]);
+ const __m128i round = _mm_set1_epi32(256);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < height; ++y) {
+ const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
+ const __m128i left_y = _mm_cvtsi32_si128(left_ptr[y]);
+ const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+ __m128i scaled_bottom_left =
+ _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+ const __m128i weight_left_y =
+ _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
+ scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
+ scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
+ for (int x = 0; x < width; x += 8) {
+ const __m128i top_x = LoadLo8(top_ptr + x);
+ const __m128i weights_x = LoadLo8(sm_weights_w + x);
+ const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
+ const __m128i top_weights_x_lo = _mm_cvtepu8_epi16(top_weights_x);
+ const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
+
+      // Opposite weights and pixels are multiplied here: the interleaving
+      // above pairs each pixel with the weight for the other direction, so
+      // each 32-bit madd lane yields top[x] * weight_y + weight_x * left[y].
+ __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
+ __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
+
+      // |scaled_bottom_left| uses the same weight for every pixel in the row
+      // and was computed in the outer loop, so only the |scaled_top_right|
+      // values are derived here.
+ const __m128i inverted_weights_x =
+ _mm_sub_epi16(scale_value, _mm_cvtepu8_epi16(weights_x));
+ const __m128i scaled_top_right =
+ _mm_mullo_epi16(inverted_weights_x, top_right);
+ const __m128i scaled_top_right_lo = _mm_cvtepu16_epi32(scaled_top_right);
+ const __m128i scaled_top_right_hi =
+ _mm_unpackhi_epi16(scaled_top_right, zero);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
+
+      // The round value for RightShiftWithRounding was already added in with
+      // |scaled_bottom_left|, so a plain shift suffices here.
+ pred_lo = _mm_srli_epi32(pred_lo, 9);
+ pred_hi = _mm_srli_epi32(pred_hi, 9);
+ const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+ StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
+ }
+ dst += stride;
+ }
+}
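+
+// A one-lane scalar sketch of the interleaved madd used in SmoothWxH
+// (illustrative helper, not part of the library):
+inline int SmoothMaddLane(const int top_x, const int weight_x,
+                          const int weight_y, const int left_y) {
+  // _mm_madd_epi16 multiplies the 16-bit pairs (top_x, weight_x) and
+  // (weight_y, left_y) elementwise and sums each pair into a 32-bit lane.
+  return top_x * weight_y + weight_x * left_y;
+}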
+
+void SmoothHorizontal4x4_SSE4_1(void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi32(top_ptr[3]);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left = _mm_cvtepu8_epi32(Load4(left_ptr));
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi32(256);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi32(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
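+
+// Each row above therefore reduces to a single blend per pixel, with
+// top_right = top_row[3]:
+//   dst[y][x] = (w[x] * left[y] + (256 - w[x]) * top_right + 128) >> 8
+// i.e. RightShiftWithRounding of the spec's horizontal-only smooth sum.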
+
+void SmoothHorizontal4x8_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi32(top[3]);
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi32(256);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi32(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal4x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi32(top[3]);
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi32(256);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi32(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 8));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 12));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal8x4_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal8x8_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+}
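+
+// The |y_mask| loops encode shuffle indices rather than row counters: each
+// 32-bit value holds the byte pair (2y + 1, 2y) twice, so broadcasting it
+// with _mm_set1_epi32 and passing it to _mm_shuffle_epi8 replicates the y-th
+// 16-bit lane of |left| across the register; adding 0x02020202 bumps every
+// byte index by 2, moving to row y + 1. A sketch of the mask for row y
+// (illustrative; the loops above use the unrolled constants directly):
+constexpr int SmoothYSelectMask(const int y) {
+  return ((2 * y + 1) << 24) | ((2 * y) << 16) | ((2 * y + 1) << 8) | (2 * y);
+}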
+
+void SmoothHorizontal8x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal8x32_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x4_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ __m128i y_mask = _mm_set1_epi32(0x01000100);
+ __m128i left_y = _mm_shuffle_epi8(left, y_mask);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x03020302);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x05040504);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x07060706);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+}
+
+void SmoothHorizontal16x8_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x32_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x64_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothHorizontal32x8_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal32x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ const __m128i left2 =
+ _mm_cvtepu8_epi16(LoadLo8(static_cast<const uint8_t*>(left_column) + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal32x32_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal32x64_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothHorizontal64x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+ const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal64x32_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+ const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const __m128i left3 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const __m128i left4 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal64x64_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+ const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ }
+}
+
+inline void LoadSmoothVerticalPixels4(const uint8_t* LIBGAV1_RESTRICT above,
+ const uint8_t* LIBGAV1_RESTRICT left,
+ const int height, __m128i* pixels) {
+ __m128i top = Load4(above);
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ top = _mm_cvtepu8_epi16(top);
+ pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
+}
+
+// |weight_array| alternates weight vectors from the table with their inverted
+// (256 - w) counterparts. The inverted values are precomputed by the compiler
+// when the weights table is visible to this module; removing that visibility
+// can cut speed by up to half in both the 4xH and 8xH predictors.
+inline void LoadSmoothVerticalWeights4(const uint8_t* LIBGAV1_RESTRICT
+ weight_array,
+ const int height, __m128i* weights) {
+ const __m128i inverter = _mm_set1_epi16(256);
+
+ if (height == 4) {
+ const __m128i weight = Load4(weight_array);
+ weights[0] = _mm_cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ } else if (height == 8) {
+ const __m128i weight = LoadLo8(weight_array + 4);
+ weights[0] = _mm_cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ } else {
+ const __m128i weight = LoadUnaligned16(weight_array + 12);
+ const __m128i zero = _mm_setzero_si128();
+ weights[0] = _mm_cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ weights[2] = _mm_unpackhi_epi8(weight, zero);
+ weights[3] = _mm_sub_epi16(inverter, weights[2]);
+ }
+}
+
+inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight,
+ const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride) {
+ const __m128i pred_round = _mm_set1_epi32(128);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
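+ // Byte indices {0x00, 0x04, 0x08, 0x0C}: gathers the low byte of each
+ // 32-bit sum, packing the four results into one dword for Store4.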
+ const __m128i cvtepu8_epi32 = _mm_set1_epi32(0x0C080400);
+ __m128i y_select = _mm_set1_epi16(0x0100);
+
+ for (int y = 0; y < height; ++y) {
+ const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
+ const __m128i alternate_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
+ // The madd instruction yields four results of the form:
+ // (top_row[x] * weight[y] + corner * inverted_weight[y])
+ __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
+ sum = _mm_add_epi32(sum, pred_round);
+ sum = _mm_srai_epi32(sum, 8);
+ sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
+ Store4(dst, sum);
+ dst += stride;
+ y_select = _mm_add_epi16(y_select, mask_increment);
+ }
+}
+
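+// Reference sketch (not part of the library) of what the SmoothVertical
+// kernels below compute, written out in scalar form:
+//
+//   for (int y = 0; y < height; ++y)
+//     for (int x = 0; x < width; ++x)
+//       dst[y * stride + x] = static_cast<uint8_t>(
+//           (weights[y] * top[x] + (256 - weights[y]) * bottom_left + 128) >>
+//           8);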
+void SmoothVertical4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const auto* const above = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i pixels;
+ LoadSmoothVerticalPixels4(above, left, 4, &pixels);
+
+ __m128i weights[2];
+ LoadSmoothVerticalWeights4(kSmoothWeights, 4, weights);
+
+ WriteSmoothVertical4xH(&pixels, weights, 4, dst, stride);
+}
+
+void SmoothVertical4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const auto* const above = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i pixels;
+ LoadSmoothVerticalPixels4(above, left, 8, &pixels);
+
+ __m128i weights[2];
+ LoadSmoothVerticalWeights4(kSmoothWeights, 8, weights);
+
+ WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
+}
+
+void SmoothVertical4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const auto* const above = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i pixels;
+ LoadSmoothVerticalPixels4(above, left, 16, &pixels);
+
+ __m128i weights[4];
+ LoadSmoothVerticalWeights4(kSmoothWeights, 16, weights);
+
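+ // 16 rows as two 8-row passes: weights[0]/[1] hold the first eight weights
+ // and their inversions, weights[2]/[3] the second eight for rows 8..15.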
+ WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
+ dst += stride << 3;
+ WriteSmoothVertical4xH(&pixels, &weights[2], 8, dst, stride);
+}
+
+void SmoothVertical8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
+ const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+
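+ // Only four rows, so the y_mask loop used by taller blocks is unrolled
+ // into explicit per-row select masks.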
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+}
+
+void SmoothVertical8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ scale = _mm_set1_epi16(128);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
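+ // Rows 8..15: switch to the high half of the 16-entry weight table.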
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
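+ // 32 rows in four 8-row passes, one per 8-entry weight group.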
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
+ const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
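+ // Four rows, unrolled as in SmoothVertical8x4 but 16 pixels wide.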
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+}
+
+void SmoothVertical16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x32_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x64_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i round = _mm_set1_epi16(128);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
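+ // The 64-entry weight table (kSmoothWeights + 60) is consumed 16 rows at a
+ // time, each chunk split into low/high 16-bit halves; the other 64-high
+ // kernels below follow the same pattern.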
+ const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothVertical32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical32x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical32x32_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical32x64_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i round = _mm_set1_epi16(128);
+ const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothVertical64x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_lolo = LoadUnaligned16(top_ptr);
+ const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+ const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical64x32_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i top_lolo = LoadUnaligned16(top_ptr);
+ const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+ const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+ const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical64x64_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+ const __m128i top_lolo = LoadUnaligned16(top_ptr);
+ const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+ const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+ const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i round = _mm_set1_epi16(128);
+ const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
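+// Wires the SSE4.1 predictors into the 8-bit Dsp table. Each assignment is
+// compiled in only when the matching DSP_ENABLED_8BPP_SSE4_1 guard is set.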
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ Smooth4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ Smooth4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ Smooth4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ Smooth8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ Smooth8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ Smooth8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ Smooth8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ SmoothWxH<16, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ SmoothWxH<16, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ SmoothWxH<16, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ SmoothWxH<16, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ SmoothWxH<16, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ SmoothWxH<32, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ SmoothWxH<32, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ SmoothWxH<32, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ SmoothWxH<32, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ SmoothWxH<64, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ SmoothWxH<64, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ SmoothWxH<64, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical64x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal64x64_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredSmoothInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredSmoothInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
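+// For example (illustrating the guard mechanics only): if a header for a
+// higher optimization level had already defined
+// LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth, the #ifndef below
+// would leave that definition in place rather than overriding it with
+// LIBGAV1_CPU_SSE4_1.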
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// Utility Functions
+
+// This is a fast way to divide by a number of the form 2^n + 2^k, n > k.
+// Divide by 2^k by right shifting by k, leaving the denominator 2^(n-k) + 1.
+// In the block size cases, n - k is 1 or 2 (the block is proportional to 1x2
+// or 1x4), so we use a multiplier that reflects division by 2+1=3 or 4+1=5 in
+// the high bits.
+constexpr int kThreeInverse = 0x5556;
+constexpr int kFiveInverse = 0x3334;
+template <int shiftk, int multiplier>
+inline __m128i DivideByMultiplyShift_U32(const __m128i dividend) {
+ const __m128i interm = _mm_srli_epi32(dividend, shiftk);
+ return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier));
+}
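+// A worked example (illustrative): a 4x8 DC sum covers 4 + 8 = 12 pixels, and
+// 12 = 2^3 + 2^2, so shiftk = 2 leaves a divide by 3, computed as
+// (x * 0x5556) >> 16 by _mm_mulhi_epi16 since 0x5556 / 65536 ~= 1/3. Only the
+// low lane of the result is meaningful; the DcStore functions broadcast it.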
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_SSE4_1
+
+using DcSumFunc = __m128i (*)(const void* ref);
+using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const __m128i dc);
+using WriteDuplicateFunc = void (*)(void* dest, ptrdiff_t stride,
+ const __m128i column);
+// For copying an entire column across a block.
+using ColumnStoreFunc = void (*)(void* dest, ptrdiff_t stride,
+ const void* column);
+
+// DC intra-predictors for all block shapes.
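+// DcTop averages only the top row, DcLeft only the left column, and Dc both
+// edges. When width == height the combined pixel count is a power of two and
+// a plain shift suffices; otherwise Dc uses DivideByMultiplyShift_U32.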
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+struct DcPredFuncs_SSE4_1 {
+ DcPredFuncs_SSE4_1() = delete;
+
+ static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+// Vertical and Horizontal directional intra-predictors, instantiated for all
+// block sizes via a per-size ColumnStoreFunc.
+template <ColumnStoreFunc col_storefn>
+struct DirectionalPredFuncs_SSE4_1 {
+ DirectionalPredFuncs_SSE4_1() = delete;
+
+ static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<
+ width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+ dc_mult>::DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* /*left_column*/) {
+ const __m128i rounder = _mm_set1_epi32(1 << (width_log2 - 1));
+ const __m128i sum = top_sumfn(top_row);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2);
+ storefn(dest, stride, dc);
+}
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<
+ width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+ dc_mult>::DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i rounder = _mm_set1_epi32(1 << (height_log2 - 1));
+ const __m128i sum = left_sumfn(left_column);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), height_log2);
+ storefn(dest, stride, dc);
+}
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<
+ width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+ dc_mult>::Dc(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i rounder =
+ _mm_set1_epi32((1 << (width_log2 - 1)) + (1 << (height_log2 - 1)));
+ const __m128i sum_top = top_sumfn(top_row);
+ const __m128i sum_left = left_sumfn(left_column);
+ const __m128i sum = _mm_add_epi32(sum_top, sum_left);
+ if (width_log2 == height_log2) {
+ const __m128i dc =
+ _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2 + 1);
+ storefn(dest, stride, dc);
+ } else {
+ const __m128i dc =
+ DivideByMultiplyShift_U32<shiftk, dc_mult>(_mm_add_epi32(sum, rounder));
+ storefn(dest, stride, dc);
+ }
+}
+
+//------------------------------------------------------------------------------
+// DirectionalPredFuncs_SSE4_1 directional predictors
+
+template <ColumnStoreFunc col_storefn>
+void DirectionalPredFuncs_SSE4_1<col_storefn>::Horizontal(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
+ col_storefn(dest, stride, left_column);
+}
+
+} // namespace
+
+//------------------------------------------------------------------------------
+namespace low_bitdepth {
+namespace {
+
+// |ref| points to 4 bytes containing 4 packed 8-bit pixel values.
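+// Each DcSum function sums pixels with _mm_sad_epu8 against zero, which
+// yields the sum of the unsigned bytes in each 64-bit half of the register.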
+inline __m128i DcSum4_SSE4_1(const void* const ref) {
+ const __m128i vals = Load4(ref);
+ const __m128i zero = _mm_setzero_si128();
+ return _mm_sad_epu8(vals, zero);
+}
+
+inline __m128i DcSum8_SSE4_1(const void* const ref) {
+ const __m128i vals = LoadLo8(ref);
+ const __m128i zero = _mm_setzero_si128();
+ return _mm_sad_epu8(vals, zero);
+}
+
+inline __m128i DcSum16_SSE4_1(const void* const ref) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i vals = LoadUnaligned16(ref);
+ const __m128i partial_sum = _mm_sad_epu8(vals, zero);
+ return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+inline __m128i DcSum32_SSE4_1(const void* const ref) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i vals1 = LoadUnaligned16(ref);
+ const __m128i vals2 = LoadUnaligned16(static_cast<const uint8_t*>(ref) + 16);
+ const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
+ const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
+ const __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
+ return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+inline __m128i DcSum64_SSE4_1(const void* const ref) {
+ const auto* const ref_ptr = static_cast<const uint8_t*>(ref);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i vals1 = LoadUnaligned16(ref_ptr);
+ const __m128i vals2 = LoadUnaligned16(ref_ptr + 16);
+ const __m128i vals3 = LoadUnaligned16(ref_ptr + 32);
+ const __m128i vals4 = LoadUnaligned16(ref_ptr + 48);
+ const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
+ const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
+ __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
+ const __m128i partial_sum3 = _mm_sad_epu8(vals3, zero);
+ partial_sum = _mm_add_epi16(partial_sum, partial_sum3);
+ const __m128i partial_sum4 = _mm_sad_epu8(vals4, zero);
+ partial_sum = _mm_add_epi16(partial_sum, partial_sum4);
+ return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+template <int height>
+inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ Store4(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ Store4(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore8xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreLo8(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreLo8(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore16xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreUnaligned16(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreUnaligned16(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore32xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+}
+
+template <int height>
+inline void DcStore64xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+ StoreUnaligned16(dst + 32, dc_dup);
+ StoreUnaligned16(dst + 48, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+ StoreUnaligned16(dst + 32, dc_dup);
+ StoreUnaligned16(dst + 48, dc_dup);
+}
+
+// WriteDuplicateN assumes |dup32| holds 4 sets of 4 identical bytes; each set
+// is copied across one row of width N in dest.
+inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ Store4(dst, dup32);
+ dst += stride;
+ const int row1 = _mm_extract_epi32(dup32, 1);
+ memcpy(dst, &row1, 4);
+ dst += stride;
+ const int row2 = _mm_extract_epi32(dup32, 2);
+ memcpy(dst, &row2, 4);
+ dst += stride;
+ const int row3 = _mm_extract_epi32(dup32, 3);
+ memcpy(dst, &row3, 4);
+}
+
+inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+ auto* dst = static_cast<uint8_t*>(dest);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
+ dst += stride;
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
+}
+
+inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+}
+
+inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+}
+
+inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
+}
+
+// ColStoreN<height> copies each of the |height| values in |column| across its
+// corresponding row in dest.
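+// The expansion works by repeated unpacking: Load4 yields [c0 c1 c2 c3],
+// _mm_unpacklo_epi8 with itself gives [c0 c0 c1 c1 c2 c2 c3 c3], and
+// _mm_unpacklo_epi16 with itself gives [c0 c0 c0 c0 c1 c1 c1 c1 ...], i.e.
+// the 4 sets of 4 identical bytes that WriteDuplicateN expects.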
+template <WriteDuplicateFunc writefn>
+inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
+ const __m128i col_data = Load4(column);
+ const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup32 = _mm_unpacklo_epi16(col_dup16, col_dup16);
+ writefn(dest, stride, col_dup32);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ const __m128i col_data = LoadLo8(column);
+ const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_dup16, col_dup16);
+ auto* dst = static_cast<uint8_t*>(dest);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_dup16, col_dup16);
+ writefn(dst, stride, col_dup32_hi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ const __m128i col_data = LoadUnaligned16(column);
+ const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+ const __m128i col_dup32_lolo = _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+ auto* dst = static_cast<uint8_t*>(dest);
+ writefn(dst, stride, col_dup32_lolo);
+ dst += stride4;
+ const __m128i col_dup32_lohi = _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lohi);
+ dst += stride4;
+ const __m128i col_dup32_hilo = _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hilo);
+ dst += stride4;
+ const __m128i col_dup32_hihi = _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hihi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 32; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+ const __m128i col_dup32_lolo =
+ _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lolo);
+ dst += stride4;
+ const __m128i col_dup32_lohi =
+ _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lohi);
+ dst += stride4;
+ const __m128i col_dup32_hilo =
+ _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hilo);
+ dst += stride4;
+ const __m128i col_dup32_hihi =
+ _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hihi);
+ dst += stride4;
+ }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 64; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+ const __m128i col_dup32_lolo =
+ _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lolo);
+ dst += stride4;
+ const __m128i col_dup32_lohi =
+ _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lohi);
+ dst += stride4;
+ const __m128i col_dup32_hilo =
+ _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hilo);
+ dst += stride4;
+ const __m128i col_dup32_hihi =
+ _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hihi);
+ dst += stride4;
+ }
+}
+
+struct DcDefs {
+ DcDefs() = delete;
+
+ using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
+ DcStore4xH_SSE4_1<4>, 0, 0>;
+ // shiftk is the smaller of width_log2 and height_log2.
+ // dc_mult corresponds to the ratio of the smaller block size to the larger.
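+ // For example, _16x64 covers 16 + 64 = 80 pixels and 80 = 2^6 + 2^4, so
+ // shiftk = 4 and dc_mult = kFiveInverse for the remaining divide by 5.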
+ using _4x8 = DcPredFuncs_SSE4_1<2, 3, DcSum4_SSE4_1, DcSum8_SSE4_1,
+ DcStore4xH_SSE4_1<8>, 2, kThreeInverse>;
+ using _4x16 = DcPredFuncs_SSE4_1<2, 4, DcSum4_SSE4_1, DcSum16_SSE4_1,
+ DcStore4xH_SSE4_1<16>, 2, kFiveInverse>;
+
+ using _8x4 = DcPredFuncs_SSE4_1<3, 2, DcSum8_SSE4_1, DcSum4_SSE4_1,
+ DcStore8xH_SSE4_1<4>, 2, kThreeInverse>;
+ using _8x8 = DcPredFuncs_SSE4_1<3, 3, DcSum8_SSE4_1, DcSum8_SSE4_1,
+ DcStore8xH_SSE4_1<8>, 0, 0>;
+ using _8x16 = DcPredFuncs_SSE4_1<3, 4, DcSum8_SSE4_1, DcSum16_SSE4_1,
+ DcStore8xH_SSE4_1<16>, 3, kThreeInverse>;
+ using _8x32 = DcPredFuncs_SSE4_1<3, 5, DcSum8_SSE4_1, DcSum32_SSE4_1,
+ DcStore8xH_SSE4_1<32>, 3, kFiveInverse>;
+
+ using _16x4 = DcPredFuncs_SSE4_1<4, 2, DcSum16_SSE4_1, DcSum4_SSE4_1,
+ DcStore16xH_SSE4_1<4>, 2, kFiveInverse>;
+ using _16x8 = DcPredFuncs_SSE4_1<4, 3, DcSum16_SSE4_1, DcSum8_SSE4_1,
+ DcStore16xH_SSE4_1<8>, 3, kThreeInverse>;
+ using _16x16 = DcPredFuncs_SSE4_1<4, 4, DcSum16_SSE4_1, DcSum16_SSE4_1,
+ DcStore16xH_SSE4_1<16>, 0, 0>;
+ using _16x32 = DcPredFuncs_SSE4_1<4, 5, DcSum16_SSE4_1, DcSum32_SSE4_1,
+ DcStore16xH_SSE4_1<32>, 4, kThreeInverse>;
+ using _16x64 = DcPredFuncs_SSE4_1<4, 6, DcSum16_SSE4_1, DcSum64_SSE4_1,
+ DcStore16xH_SSE4_1<64>, 4, kFiveInverse>;
+
+ using _32x8 = DcPredFuncs_SSE4_1<5, 3, DcSum32_SSE4_1, DcSum8_SSE4_1,
+ DcStore32xH_SSE4_1<8>, 3, kFiveInverse>;
+ using _32x16 = DcPredFuncs_SSE4_1<5, 4, DcSum32_SSE4_1, DcSum16_SSE4_1,
+ DcStore32xH_SSE4_1<16>, 4, kThreeInverse>;
+ using _32x32 = DcPredFuncs_SSE4_1<5, 5, DcSum32_SSE4_1, DcSum32_SSE4_1,
+ DcStore32xH_SSE4_1<32>, 0, 0>;
+ using _32x64 = DcPredFuncs_SSE4_1<5, 6, DcSum32_SSE4_1, DcSum64_SSE4_1,
+ DcStore32xH_SSE4_1<64>, 5, kThreeInverse>;
+
+ using _64x16 = DcPredFuncs_SSE4_1<6, 4, DcSum64_SSE4_1, DcSum16_SSE4_1,
+ DcStore64xH_SSE4_1<16>, 4, kFiveInverse>;
+ using _64x32 = DcPredFuncs_SSE4_1<6, 5, DcSum64_SSE4_1, DcSum32_SSE4_1,
+ DcStore64xH_SSE4_1<32>, 5, kThreeInverse>;
+ using _64x64 = DcPredFuncs_SSE4_1<6, 6, DcSum64_SSE4_1, DcSum64_SSE4_1,
+ DcStore64xH_SSE4_1<64>, 0, 0>;
+};
+
+struct DirDefs {
+ DirDefs() = delete;
+
+ using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
+ using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
+ using _4x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
+ using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
+ using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
+ using _8x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
+ using _8x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
+ using _16x4 =
+ DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
+ using _16x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
+ using _16x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
+ using _16x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
+ using _16x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
+ using _32x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
+ using _32x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
+ using _32x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
+ using _32x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
+ using _64x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
+ using _64x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
+ using _64x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
+};
+
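+// For reference, a scalar sketch of the selection logic (per the section
+// 7.11.2.2 comments below) that the WritePaethLine functions vectorize; this
+// is illustrative, not part of the build:
+//   base = top[x] + left[y] - top_left;
+//   pLeft = abs(base - left[y]);
+//   pTop = abs(base - top[x]);
+//   pTopLeft = abs(base - top_left);
+//   pred = (pLeft <= pTop && pLeft <= pTopLeft) ? left[y]
+//        : (pTop <= pTopLeft) ? top[x] : top_left;
+// In WritePaethLine4, |y_mask| values 0x00/0x55/0xAA/0xFF broadcast 32-bit
+// lane 0/1/2/3 of the left-column registers to all lanes.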
+template <int y_mask>
+inline void WritePaethLine4(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
+ const __m128i& left, const __m128i& top_lefts,
+ const __m128i& top_dists, const __m128i& left_dists,
+ const __m128i& top_left_diffs) {
+ const __m128i top_dists_y = _mm_shuffle_epi32(top_dists, y_mask);
+
+ const __m128i lefts_y = _mm_shuffle_epi32(left, y_mask);
+ const __m128i top_left_dists =
+ _mm_abs_epi32(_mm_add_epi32(lefts_y, top_left_diffs));
+
+ // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+ // operation is unavailable, so the logic for selecting top, left, or
+ // top_left is inverted.
+ __m128i not_select_left = _mm_cmpgt_epi32(left_dists, top_left_dists);
+ not_select_left =
+ _mm_or_si128(not_select_left, _mm_cmpgt_epi32(left_dists, top_dists_y));
+ const __m128i not_select_top = _mm_cmpgt_epi32(top_dists_y, top_left_dists);
+
+ const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
+
+ const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+ __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+ top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+ top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
+
+ // The sequence of 32-bit packed operations was found (see CL via blame) to
+ // outperform 16-bit operations, despite the availability of the packus
+ // function, when tested on a Xeon E7 v3.
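+ // The mask 0x0C080400 gathers byte 0 of each 32-bit lane (bytes 0, 4, 8,
+ // and 12) into the low 4 bytes, which Store4 then writes.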
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ const __m128i pred = _mm_shuffle_epi8(
+ _mm_or_si128(left_out, top_or_top_left_out), cvtepi32_epi8);
+ Store4(dst, pred);
+}
+
+// top_left_diffs is the only variable whose ints may exceed 8 bits. Otherwise
+// we would be able to do all of these operations as epi8 for a 16-pixel version
+// of this function. Still, since lefts_y is just a vector of duplicates, it
+// could pay off to accommodate top_left_dists for cmpgt, and repack into epi8
+// for the blends.
+template <int y_mask>
+inline void WritePaethLine8(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
+ const __m128i& left, const __m128i& top_lefts,
+ const __m128i& top_dists, const __m128i& left_dists,
+ const __m128i& top_left_diffs) {
+ const __m128i select_y = _mm_set1_epi32(y_mask);
+ const __m128i top_dists_y = _mm_shuffle_epi8(top_dists, select_y);
+
+ const __m128i lefts_y = _mm_shuffle_epi8(left, select_y);
+ const __m128i top_left_dists =
+ _mm_abs_epi16(_mm_add_epi16(lefts_y, top_left_diffs));
+
+ // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+ // operation is unavailable, so the logic for selecting top, left, or
+ // top_left is inverted.
+ __m128i not_select_left = _mm_cmpgt_epi16(left_dists, top_left_dists);
+ not_select_left =
+ _mm_or_si128(not_select_left, _mm_cmpgt_epi16(left_dists, top_dists_y));
+ const __m128i not_select_top = _mm_cmpgt_epi16(top_dists_y, top_left_dists);
+
+ const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
+
+ const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+ __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+ top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+ top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
+
+ const __m128i pred = _mm_packus_epi16(
+ _mm_or_si128(left_out, top_or_top_left_out), /* unused */ left_out);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), pred);
+}
+
+// |top| is an epi8 of length 16
+// |left| is epi8 of unknown length, as y_mask specifies access
+// |top_lefts| is an epi8 of 16 duplicates
+// |top_dists| is an epi8 of unknown length, as y_mask specifies access
+// |left_dists| is an epi8 of length 16
+// |left_dists_lo| is an epi16 of length 8
+// |left_dists_hi| is an epi16 of length 8
+// |top_left_diffs_lo| is an epi16 of length 8
+// |top_left_diffs_hi| is an epi16 of length 8
+// The latter two vectors are epi16 because their values may reach -510.
+// |left_dists| is provided alongside its spread out version because it doesn't
+// change between calls and interacts with both kinds of packing.
+template <int y_mask>
+inline void WritePaethLine16(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
+ const __m128i& left, const __m128i& top_lefts,
+ const __m128i& top_dists,
+ const __m128i& left_dists,
+ const __m128i& left_dists_lo,
+ const __m128i& left_dists_hi,
+ const __m128i& top_left_diffs_lo,
+ const __m128i& top_left_diffs_hi) {
+ const __m128i select_y = _mm_set1_epi32(y_mask);
+ const __m128i top_dists_y8 = _mm_shuffle_epi8(top_dists, select_y);
+ const __m128i top_dists_y16 = _mm_cvtepu8_epi16(top_dists_y8);
+ const __m128i lefts_y8 = _mm_shuffle_epi8(left, select_y);
+ const __m128i lefts_y16 = _mm_cvtepu8_epi16(lefts_y8);
+
+ const __m128i top_left_dists_lo =
+ _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_lo));
+ const __m128i top_left_dists_hi =
+ _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_hi));
+
+ const __m128i left_gt_top_left_lo = _mm_packs_epi16(
+ _mm_cmpgt_epi16(left_dists_lo, top_left_dists_lo), left_dists_lo);
+ const __m128i left_gt_top_left_hi =
+ _mm_packs_epi16(_mm_cmpgt_epi16(left_dists_hi, top_left_dists_hi),
+ /* unused second arg for pack */ left_dists_hi);
+ const __m128i left_gt_top_left = _mm_alignr_epi8(
+ left_gt_top_left_hi, _mm_slli_si128(left_gt_top_left_lo, 8), 8);
+
+ const __m128i not_select_top_lo =
+ _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_lo),
+ /* unused second arg for pack */ top_dists_y16);
+ const __m128i not_select_top_hi =
+ _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_hi),
+ /* unused second arg for pack */ top_dists_y16);
+ const __m128i not_select_top = _mm_alignr_epi8(
+ not_select_top_hi, _mm_slli_si128(not_select_top_lo, 8), 8);
+
+ const __m128i left_leq_top =
+ _mm_cmpeq_epi8(left_dists, _mm_min_epu8(top_dists_y8, left_dists));
+ const __m128i select_left = _mm_andnot_si128(left_gt_top_left, left_leq_top);
+
+ // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+ // operation is unavailable, so the logic for selecting top, left, or
+ // top_left is inverted.
+ const __m128i left_out = _mm_and_si128(select_left, lefts_y8);
+
+ const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+ __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+ top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+ top_or_top_left_out = _mm_andnot_si128(select_left, top_or_top_left_out);
+ const __m128i pred = _mm_or_si128(left_out, top_or_top_left_out);
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), pred);
+}
+
+void Paeth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+ const __m128i top_dists = _mm_abs_epi32(_mm_sub_epi32(left, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine4<0>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+}
+
+void Paeth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i left = LoadLo8(left_column);
+ const __m128i left_lo = _mm_cvtepu8_epi32(left);
+ const __m128i left_hi = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
+
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+ const __m128i top_dists_lo = _mm_abs_epi32(_mm_sub_epi32(left_lo, top_lefts));
+ const __m128i top_dists_hi = _mm_abs_epi32(_mm_sub_epi32(left_hi, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine4<0>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+}
+
+void Paeth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const __m128i left_0 = _mm_cvtepu8_epi32(left);
+ const __m128i left_1 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
+ const __m128i left_2 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 8));
+ const __m128i left_3 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 12));
+
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+ const __m128i top_dists_0 = _mm_abs_epi32(_mm_sub_epi32(left_0, top_lefts));
+ const __m128i top_dists_1 = _mm_abs_epi32(_mm_sub_epi32(left_1, top_lefts));
+ const __m128i top_dists_2 = _mm_abs_epi32(_mm_sub_epi32(left_2, top_lefts));
+ const __m128i top_dists_3 = _mm_abs_epi32(_mm_sub_epi32(left_3, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine4<0>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+}
+
+void Paeth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+ const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+}
+
+void Paeth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+ const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x09080908>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0B0A0B0A>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0D0C0D0C>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0F0E0F0E>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+}
+
+void Paeth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const __m128i left_lo = _mm_cvtepu8_epi16(left);
+ const __m128i left_hi = _mm_cvtepu8_epi16(_mm_srli_si128(left, 8));
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+ const __m128i top_dists_lo = _mm_abs_epi16(_mm_sub_epi16(left_lo, top_lefts));
+ const __m128i top_dists_hi = _mm_abs_epi16(_mm_sub_epi16(left_hi, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine8<0x01000100>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x09080908>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0B0A0B0A>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0D0C0D0C>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0F0E0F0E>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x01000100>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x09080908>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0B0A0B0A>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0D0C0D0C>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0F0E0F0E>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+}
+
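+// Handles 8x32 as two stacked 8x16 blocks. The top row is shared, so only the
+// left-column pointer and the destination offset differ between the calls.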
+void Paeth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ Paeth8x16_SSE4_1(dst, stride, top_row, left_column);
+ Paeth8x16_SSE4_1(dst + (stride << 4), stride, top_row, left_ptr + 16);
+}
+
+void Paeth16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i left = Load4(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts16 = _mm_set1_epi16(top_ptr[-1]);
+ const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_ptr[-1]));
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+
+ const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+ _mm_subs_epu8(top_lefts8, top));
+ const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+ const __m128i left_dists_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+ const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+ _mm_subs_epu8(top_lefts8, left));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+ const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+ const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+ left_dists_lo, left_dists_hi, top_left_diff_lo,
+ top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+}
+
+// Inlined so that larger transform sizes can call it with offsets while
+// reusing the same |top_left|.
+inline void WritePaeth16x8(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const uint8_t top_left, const __m128i top,
+ const __m128i left) {
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ const __m128i top_lefts16 = _mm_set1_epi16(top_left);
+ const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));
+
+ // Given that the spec defines "base" as top[x] + left[y] - top_left,
+ // pLeft = abs(base - left[y]) = abs(top[x] - top_left)
+ // pTop = abs(base - top[x]) = abs(left[y] - top_left)
+
+ const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+ _mm_subs_epu8(top_lefts8, top));
+ const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+ const __m128i left_dists_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+ const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+ _mm_subs_epu8(top_lefts8, left));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+ const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+ const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+ left_dists_lo, left_dists_hi, top_left_diff_lo,
+ top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+}
+
+void Paeth16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i left = LoadLo8(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ WritePaeth16x8(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
+}
+
+void WritePaeth16x16(void* const dest, ptrdiff_t stride, const uint8_t top_left,
+ const __m128i top, const __m128i left) {
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ const __m128i top_lefts16 = _mm_set1_epi16(top_left);
+ const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));
+
+ // Given that the spec defines "base" as top[x] + left[y] - top_left,
+ // pLeft = abs(base - left[y]) = abs(top[x] - top_left)
+ // pTop = abs(base - top[x]) = abs(left[y] - top_left)
+
+ const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+ _mm_subs_epu8(top_lefts8, top));
+ const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+ const __m128i left_dists_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+ const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+ _mm_subs_epu8(top_lefts8, left));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+ const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+ const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+ left_dists_lo, left_dists_hi, top_left_diff_lo,
+ top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x08080808>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x09090909>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0A0A0A0A>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0B0B0B0B>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0C0C0C0C>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0D0D0D0D>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0E0E0E0E>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0F0F0F0F>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+}
+
+void Paeth16x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ WritePaeth16x16(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
+}
+
+void Paeth16x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i left_0 = LoadUnaligned16(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* const dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top, left_0);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ WritePaeth16x16(dst + (stride << 4), stride, top_left, top, left_1);
+}
+
+void Paeth16x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const ptrdiff_t stride16 = stride << 4;
+ const __m128i left_0 = LoadUnaligned16(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top, left_0);
+ dst += stride16;
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ WritePaeth16x16(dst, stride, top_left, top, left_1);
+ dst += stride16;
+ const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+ WritePaeth16x16(dst, stride, top_left, top, left_2);
+ dst += stride16;
+ const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+ WritePaeth16x16(dst, stride, top_left, top, left_3);
+}
+
+void Paeth32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i left = LoadLo8(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* const dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x8(dst, stride, top_left, top_0, left);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ WritePaeth16x8(dst + 16, stride, top_left, top_1, left);
+}
+
+void Paeth32x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* const dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
+}
+
+void Paeth32x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+}
+
+void Paeth32x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+ const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_2);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_3);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
+}
+
+void Paeth64x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left);
+}
+
+void Paeth64x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
+}
+
+void Paeth64x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+ const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_2);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_2);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_2);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_3);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_3);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+// These guards check that this version of the function has not been
+// superseded by a higher optimization level, such as AVX. The corresponding
+// #define also prevents the C version from being added to the table.
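+// For example, if an AVX2 header has already defined the matching
+// LIBGAV1_Dsp8bpp_* symbol for a function, that function's guard evaluates
+// false here and the AVX2 entry is left in place.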
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DcDefs::_4x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DcDefs::_4x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DcDefs::_4x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DcDefs::_8x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DcDefs::_8x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DcDefs::_8x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DcDefs::_8x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DcDefs::_16x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DcDefs::_16x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DcDefs::_16x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DcDefs::_16x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DcDefs::_16x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DcDefs::_32x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DcDefs::_32x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DcDefs::_32x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DcDefs::_32x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DcDefs::_64x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DcDefs::_64x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DcDefs::_64x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DcDefs::_4x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DcDefs::_4x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DcDefs::_4x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DcDefs::_8x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DcDefs::_8x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DcDefs::_8x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DcDefs::_8x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DcDefs::_16x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DcDefs::_16x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DcDefs::_16x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DcDefs::_16x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DcDefs::_16x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DcDefs::_32x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DcDefs::_32x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DcDefs::_32x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DcDefs::_32x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DcDefs::_64x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DcDefs::_64x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DcDefs::_64x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DcDefs::_4x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DcDefs::_4x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DcDefs::_4x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DcDefs::_8x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DcDefs::_8x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DcDefs::_8x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DcDefs::_8x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DcDefs::_16x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DcDefs::_16x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DcDefs::_16x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DcDefs::_16x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DcDefs::_16x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DcDefs::_32x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DcDefs::_32x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DcDefs::_32x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DcDefs::_32x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DcDefs::_64x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DcDefs::_64x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DcDefs::_64x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ Paeth4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ Paeth4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ Paeth4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ Paeth8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ Paeth8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ Paeth8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ Paeth8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ Paeth16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ Paeth16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ Paeth16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ Paeth16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ Paeth16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ Paeth32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ Paeth32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ Paeth32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ Paeth32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ Paeth64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ Paeth64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ Paeth64x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ DirDefs::_4x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ DirDefs::_4x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ DirDefs::_4x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ DirDefs::_8x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ DirDefs::_8x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ DirDefs::_8x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ DirDefs::_8x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ DirDefs::_16x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ DirDefs::_16x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ DirDefs::_16x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ DirDefs::_16x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ DirDefs::_16x64::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ DirDefs::_32x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ DirDefs::_32x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ DirDefs::_32x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ DirDefs::_32x64::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ DirDefs::_64x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ DirDefs::_64x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ DirDefs::_64x64::Horizontal;
+#endif
+} // NOLINT(readability/fn_size)
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+template <int height>
+inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
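+ // At 10bpp each pixel occupies a uint16, so a 4-pixel row spans 8 bytes;
+ // broadcasting dc across the low four 16-bit lanes suffices for StoreLo8.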
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreLo8(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreLo8(dst, dc_dup);
+}
+
+// WriteDuplicateN assumes dup has 4 32-bit "units," each comprising 2
+// identical shorts; each row of dest receives N total copies of a unit's
+// short. The unpacking works the same as in the 8bpp case, except that each
+// 32-bit unit needs twice as many copies.
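+// As an illustration (letters denote 16-bit lanes), dup32 = [a a b b c c d d]
+// unpacks via _mm_unpacklo_epi32 to [a a a a b b b b] and via
+// _mm_unpackhi_epi32 to [c c c c d d d d]; the rows are then written with
+// 64- or 128-bit stores of these duplicated lanes.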
+inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ auto* dst = static_cast<uint8_t*>(dest);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
+ dst += stride;
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
+}
+
+inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+}
+
+inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+}
+
+inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
+}
+
+inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_0);
+ }
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_1);
+ }
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_2);
+ }
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_3);
+ }
+}
+
+// ColStoreN<height> copies each of the |height| values in |column| across its
+// corresponding row in dest.
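+// For example, ColStore4 below loads 4 uint16 values, widens each into a
+// duplicated 32-bit unit with _mm_unpacklo_epi16, and hands the result to
+// writefn to emit the 4 rows.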
+template <WriteDuplicateFunc writefn>
+inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
+ const __m128i col_data = LoadLo8(column);
+ const __m128i col_dup32 = _mm_unpacklo_epi16(col_data, col_data);
+ writefn(dest, stride, col_dup32);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
+ const __m128i col_data = LoadUnaligned16(column);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ auto* dst = static_cast<uint8_t*>(dest);
+ writefn(dst, stride, col_dup32_lo);
+ const ptrdiff_t stride4 = stride << 2;
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 32; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+ dst += stride4;
+ }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 64; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+ dst += stride4;
+ }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 128; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+ dst += stride4;
+ }
+}
+
+// |ref| points to 8 bytes containing 4 packed int16 values.
+inline __m128i DcSum4_SSE4_1(const void* ref) {
+ const __m128i vals = _mm_loadl_epi64(static_cast<const __m128i*>(ref));
+ const __m128i ones = _mm_set1_epi16(1);
+
+ // half_sum[31:0] = a1+a2
+ // half_sum[63:32] = a3+a4
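+ // For instance, vals = [3 5 7 9] gives half_sum = [8, 16]; adding the
+ // shifted copy recovers the full sum, 24, in the low 32 bits.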
+ const __m128i half_sum = _mm_madd_epi16(vals, ones);
+ // Place half_sum[63:32] in shift_sum[31:0].
+ const __m128i shift_sum = _mm_srli_si128(half_sum, 4);
+ return _mm_add_epi32(half_sum, shift_sum);
+}
+
+struct DcDefs {
+ DcDefs() = delete;
+
+ using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
+ DcStore4xH_SSE4_1<4>, 0, 0>;
+};
+
+struct DirDefs {
+ DirDefs() = delete;
+
+ using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
+ using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
+ using _4x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
+ using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
+ using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
+ using _8x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
+ using _8x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
+ using _16x4 =
+ DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
+ using _16x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
+ using _16x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
+ using _16x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
+ using _16x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
+ using _32x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
+ using _32x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
+ using _32x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
+ using _32x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
+ using _64x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
+ using _64x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
+ using _64x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
+};
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DcDefs::_4x4::DcTop;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DcDefs::_4x4::DcLeft;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DcDefs::_4x4::Dc;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ DirDefs::_4x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ DirDefs::_4x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ DirDefs::_4x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ DirDefs::_8x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ DirDefs::_8x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ DirDefs::_8x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ DirDefs::_8x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ DirDefs::_16x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ DirDefs::_16x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ DirDefs::_16x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ DirDefs::_16x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ DirDefs::_16x64::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ DirDefs::_32x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ DirDefs::_32x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ DirDefs::_32x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ DirDefs::_32x64::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ DirDefs::_64x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ DirDefs::_64x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ DirDefs::_64x64::Horizontal;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors. See the defines below for specifics.
+// These functions are not thread-safe.
+void IntraPredInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
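+// In practice, each #ifndef below claims a function for SSE4_1 only when no
+// higher optimization level has already defined its LIBGAV1_Dsp8bpp_* symbol.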
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
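+// Editorial note: each block above defines its macro only when no
+// higher-priority implementation (included earlier) has already claimed the
+// function. The value LIBGAV1_CPU_SSE4_1 records which CPU feature provides
+// the specialization, and the dsp initialization code checks these macros
+// when wiring up the function pointers.
+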
+//------------------------------------------------------------------------------
+// 10bpp
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+template <int store_width, int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* LIBGAV1_RESTRICT dst,
+ int32_t stride, int32_t idx,
+ const __m128i* s) {
+ // NOTE: It is expected that the compiler will unroll these loops.
+ if (store_width == 16) {
+ for (int i = 0; i < store_count; i += 4) {
+ StoreUnaligned16(&dst[i * stride + idx], s[i]);
+ StoreUnaligned16(&dst[(i + 1) * stride + idx], s[i + 1]);
+ StoreUnaligned16(&dst[(i + 2) * stride + idx], s[i + 2]);
+ StoreUnaligned16(&dst[(i + 3) * stride + idx], s[i + 3]);
+ }
+ }
+ if (store_width == 8) {
+ for (int i = 0; i < store_count; i += 4) {
+ StoreLo8(&dst[i * stride + idx], s[i]);
+ StoreLo8(&dst[(i + 1) * stride + idx], s[i + 1]);
+ StoreLo8(&dst[(i + 2) * stride + idx], s[i + 2]);
+ StoreLo8(&dst[(i + 3) * stride + idx], s[i + 3]);
+ }
+ }
+}
+
+template <int load_width, int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* LIBGAV1_RESTRICT src,
+ int32_t stride, int32_t idx, __m128i* x) {
+ // NOTE: It is expected that the compiler will unroll these loops.
+ if (load_width == 16) {
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = LoadUnaligned16(&src[i * stride + idx]);
+ x[i + 1] = LoadUnaligned16(&src[(i + 1) * stride + idx]);
+ x[i + 2] = LoadUnaligned16(&src[(i + 2) * stride + idx]);
+ x[i + 3] = LoadUnaligned16(&src[(i + 3) * stride + idx]);
+ }
+ }
+ if (load_width == 8) {
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = LoadLo8(&src[i * stride + idx]);
+ x[i + 1] = LoadLo8(&src[(i + 1) * stride + idx]);
+ x[i + 2] = LoadLo8(&src[(i + 2) * stride + idx]);
+ x[i + 3] = LoadLo8(&src[(i + 3) * stride + idx]);
+ }
+ }
+}
+
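+// Example: LoadSrc<16, 4>(src, stride, 0, x) reads four rows of eight int16
+// values (16 bytes) each, while LoadSrc<8, 4>(src, stride, 0, x) reads four
+// rows of four values into the low 8 bytes of each register. StoreDst
+// mirrors the same layout on the way out.
+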
+// Butterfly rotate 4 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i psin_pcos = _mm_set1_epi32(
+ static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
+ const __m128i ba = _mm_unpacklo_epi16(*a, *b);
+ const __m128i ab = _mm_unpacklo_epi16(*b, *a);
+ const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
+ // -sin cos, -sin cos, -sin cos, -sin cos
+ const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
+ const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
+ const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
+ const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
+ const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
+ const __m128i x = _mm_packs_epi32(x1, x1);
+ const __m128i y = _mm_packs_epi32(y1, y1);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
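+// A scalar sketch (for reference only, not part of the library) of the
+// rotation that ButterflyRotation_4 above and ButterflyRotation_8 below
+// vectorize, with Cos128/Sin128 returning cos/sin scaled by 2^12:
+//   x = RightShiftWithRounding(a * Cos128(angle) - b * Sin128(angle), 12);
+//   y = RightShiftWithRounding(a * Sin128(angle) + b * Cos128(angle), 12);
+//   *a = flip ? y : x;
+//   *b = flip ? x : y;
+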
+// Butterfly rotate 8 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i psin_pcos = _mm_set1_epi32(
+ static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
+ const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
+ // -sin cos, -sin cos, -sin cos, -sin cos
+ const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
+ const __m128i ba = _mm_unpacklo_epi16(*a, *b);
+ const __m128i ab = _mm_unpacklo_epi16(*b, *a);
+ const __m128i ba_hi = _mm_unpackhi_epi16(*a, *b);
+ const __m128i ab_hi = _mm_unpackhi_epi16(*b, *a);
+ const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
+ const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
+ const __m128i x0_hi = _mm_madd_epi16(ba_hi, msin_pcos);
+ const __m128i y0_hi = _mm_madd_epi16(ab_hi, psin_pcos);
+ const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
+ const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
+ const __m128i x1_hi = RightShiftWithRounding_S32(x0_hi, 12);
+ const __m128i y1_hi = RightShiftWithRounding_S32(y0_hi, 12);
+ const __m128i x = _mm_packs_epi32(x1, x1_hi);
+ const __m128i y = _mm_packs_epi32(y1, y1_hi);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(__m128i* a, __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
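+ // _mm_mulhrs_epi16 computes (v * c + (1 << 14)) >> 15. Pre-shifting the
+ // constants left by 3 below turns that into (v * cos128 + (1 << 11)) >> 12,
+ // i.e. RightShiftWithRounding(v * cos128, 12), matching the 12-bit
+ // fixed-point scale of Cos128/Sin128.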
+ const __m128i pcos = _mm_set1_epi16(cos128 << 3);
+ const __m128i psin = _mm_set1_epi16(-(sin128 << 3));
+ const __m128i x = _mm_mulhrs_epi16(*b, psin);
+ const __m128i y = _mm_mulhrs_epi16(*b, pcos);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(__m128i* a,
+ __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i pcos = _mm_set1_epi16(cos128 << 3);
+ const __m128i psin = _mm_set1_epi16(sin128 << 3);
+ const __m128i x = _mm_mulhrs_epi16(*a, pcos);
+ const __m128i y = _mm_mulhrs_epi16(*a, psin);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
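+// In-place add/sub butterfly: *a = a + b and *b = a - b (with the operands
+// swapped when |flip| is set), using saturating int16 arithmetic.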
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(__m128i* a, __m128i* b, bool flip) {
+ __m128i x, y;
+ if (flip) {
+ y = _mm_adds_epi16(*b, *a);
+ x = _mm_subs_epi16(*b, *a);
+ } else {
+ x = _mm_adds_epi16(*a, *b);
+ y = _mm_subs_epi16(*a, *b);
+ }
+ *a = x;
+ *b = y;
+}
+
+using ButterflyRotationFunc = void (*)(__m128i* a, __m128i* b, int angle,
+ bool flip);
+
+LIBGAV1_ALWAYS_INLINE __m128i ShiftResidual(const __m128i residual,
+ const __m128i v_row_shift_add,
+ const __m128i v_row_shift) {
+ const __m128i k7ffd = _mm_set1_epi16(0x7ffd);
+ // The max row_shift is 2, so int16_t values greater than 0x7ffd may
+ // overflow. Generate a mask for this case.
+ const __m128i mask = _mm_cmpgt_epi16(residual, k7ffd);
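+ // Example: a lane holding 0x7ffe (32766) with row_shift = 2 wraps to
+ // -32768 after the rounding add; the arithmetic shift would then go
+ // negative, while the logical shift treats the lane as uint16 and yields
+ // 0x8000 >> 2 = 0x2000 = 8192, the expected (32766 + 2) >> 2.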
+ const __m128i x = _mm_add_epi16(residual, v_row_shift_add);
+ // Assume int16_t values.
+ const __m128i a = _mm_sra_epi16(x, v_row_shift);
+ // Assume uint16_t values.
+ const __m128i b = _mm_srl_epi16(x, v_row_shift);
+ // Select the correct shifted value.
+ return _mm_blendv_epi8(a, b, mask);
+}
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+ const __m128i v_src =
+ (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+ const int16_t cos128 = Cos128(32);
+ const __m128i xy = _mm_mulhrs_epi16(s0, _mm_set1_epi16(cos128 << 3));
+
+ // Expand to 32 bits to prevent int16_t overflows during the shift add.
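+ // Since the maximum row shift is 2 (see ShiftResidual above), |row_shift|
+ // itself equals the rounding bias 1 << (row_shift - 1) (0 -> 0, 1 -> 1,
+ // 2 -> 2), so it doubles as the addend for the rounded shift.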
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_cvtepi16_epi32(xy);
+ const __m128i a1 = _mm_cvtepi16_epi32(_mm_srli_si128(xy, 8));
+ const __m128i b = _mm_add_epi32(a, v_row_shift_add);
+ const __m128i b1 = _mm_add_epi32(a1, v_row_shift_add);
+ const __m128i c = _mm_sra_epi32(b, v_row_shift);
+ const __m128i c1 = _mm_sra_epi32(b1, v_row_shift);
+ const __m128i xy_shifted = _mm_packs_epi32(c, c1);
+
+ if (width == 4) {
+ StoreLo8(dst, xy_shifted);
+ } else {
+ for (int i = 0; i < width; i += 8) {
+ StoreUnaligned16(dst, xy_shifted);
+ dst += 8;
+ }
+ }
+ return true;
+}
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16_t cos128 = Cos128(32);
+
+ // Calculate the DC values for the first row.
+ if (width == 4) {
+ const __m128i v_src = LoadLo8(dst);
+ const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
+ StoreLo8(dst, xy);
+ } else {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&dst[i]);
+ const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
+ StoreUnaligned16(&dst[i], xy);
+ i += 8;
+ } while (i < width);
+ }
+
+ // Copy first row to the rest of the block.
+ for (int y = 1; y < height; ++y) {
+ memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+ }
+ return true;
+}
+
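+// The stage numbers in the Dct4/8/16/32Stages comments follow one global
+// numbering so that, when the functions are chained for the larger
+// transforms, the stages of the different pieces read in execution order.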
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(__m128i* s) {
+ // stage 12.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+ ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+ } else {
+ butterfly_rotation(&s[0], &s[1], 32, true);
+ butterfly_rotation(&s[2], &s[3], 48, false);
+ }
+
+ // stage 17.
+ HadamardRotation(&s[0], &s[3], false);
+ HadamardRotation(&s[1], &s[2], false);
+}
+
+// Process 4 dct4 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[4], x[4];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<8, 8>(dst, step, 0, input);
+ Transpose4x8To8x4_U16(input, x);
+ } else {
+ LoadSrc<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
+ if (transpose) {
+ Transpose4x4_U16(x, x);
+ }
+ }
+ // stage 1.
+ // kBitReverseLookup 0, 2, 1, 3
+ s[0] = x[0];
+ s[1] = x[2];
+ s[2] = x[1];
+ s[3] = x[3];
+
+ Dct4Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x4To4x8_U16(s, output);
+ StoreDst<8, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 4>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ Transpose4x4_U16(s, s);
+ }
+ StoreDst<8, 4>(dst, step, 0, s);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(__m128i* s) {
+ // stage 8.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+ ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+ } else {
+ butterfly_rotation(&s[4], &s[7], 56, false);
+ butterfly_rotation(&s[5], &s[6], 24, false);
+ }
+
+ // stage 13.
+ HadamardRotation(&s[4], &s[5], false);
+ HadamardRotation(&s[6], &s[7], true);
+
+ // stage 18.
+ butterfly_rotation(&s[6], &s[5], 32, true);
+
+ // stage 22.
+ HadamardRotation(&s[0], &s[7], false);
+ HadamardRotation(&s[1], &s[6], false);
+ HadamardRotation(&s[2], &s[5], false);
+ HadamardRotation(&s[3], &s[4], false);
+}
+
+// Process dct8 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[8], x[8];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ } else {
+ LoadSrc<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, 0, input);
+ Transpose8x8_U16(input, x);
+ } else {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+ s[0] = x[0];
+ s[1] = x[4];
+ s[2] = x[2];
+ s[3] = x[6];
+ s[4] = x[1];
+ s[5] = x[5];
+ s[6] = x[3];
+ s[7] = x[7];
+
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(s, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ } else {
+ StoreDst<8, 8>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x8_U16(s, output);
+ StoreDst<16, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 8>(dst, step, 0, s);
+ }
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(__m128i* s) {
+ // stage 5.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+ ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+ ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+ ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+ } else {
+ butterfly_rotation(&s[8], &s[15], 60, false);
+ butterfly_rotation(&s[9], &s[14], 28, false);
+ butterfly_rotation(&s[10], &s[13], 44, false);
+ butterfly_rotation(&s[11], &s[12], 12, false);
+ }
+
+ // stage 9.
+ HadamardRotation(&s[8], &s[9], false);
+ HadamardRotation(&s[10], &s[11], true);
+ HadamardRotation(&s[12], &s[13], false);
+ HadamardRotation(&s[14], &s[15], true);
+
+ // stage 14.
+ butterfly_rotation(&s[14], &s[9], 48, true);
+ butterfly_rotation(&s[13], &s[10], 112, true);
+
+ // stage 19.
+ HadamardRotation(&s[8], &s[11], false);
+ HadamardRotation(&s[9], &s[10], false);
+ HadamardRotation(&s[12], &s[15], true);
+ HadamardRotation(&s[13], &s[14], true);
+
+ // stage 23.
+ butterfly_rotation(&s[13], &s[10], 32, true);
+ butterfly_rotation(&s[12], &s[11], 32, true);
+
+ // stage 26.
+ HadamardRotation(&s[0], &s[15], false);
+ HadamardRotation(&s[1], &s[14], false);
+ HadamardRotation(&s[2], &s[13], false);
+ HadamardRotation(&s[3], &s[12], false);
+ HadamardRotation(&s[4], &s[11], false);
+ HadamardRotation(&s[5], &s[10], false);
+ HadamardRotation(&s[6], &s[9], false);
+ HadamardRotation(&s[7], &s[8], false);
+}
+
+// Process dct16 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[16], x[16];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ LoadSrc<16, 4>(dst, step, 8, input);
+ Transpose8x4To4x8_U16(input, &x[8]);
+ } else {
+ LoadSrc<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ LoadSrc<16, 16>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1
+ // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ s[0] = x[0];
+ s[1] = x[8];
+ s[2] = x[4];
+ s[3] = x[12];
+ s[4] = x[2];
+ s[5] = x[10];
+ s[6] = x[6];
+ s[7] = x[14];
+ s[8] = x[1];
+ s[9] = x[9];
+ s[10] = x[5];
+ s[11] = x[13];
+ s[12] = x[3];
+ s[13] = x[11];
+ s[14] = x[7];
+ s[15] = x[15];
+
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+ Dct16Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(s, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ Transpose4x8To8x4_U16(&s[8], output);
+ StoreDst<16, 4>(dst, step, 8, output);
+ } else {
+ StoreDst<8, 16>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&s[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 16>(dst, step, 0, s);
+ }
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(__m128i* s) {
+ // stage 3
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+ ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+ ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+ ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+ ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+ ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+ ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+ ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+ } else {
+ butterfly_rotation(&s[16], &s[31], 62, false);
+ butterfly_rotation(&s[17], &s[30], 30, false);
+ butterfly_rotation(&s[18], &s[29], 46, false);
+ butterfly_rotation(&s[19], &s[28], 14, false);
+ butterfly_rotation(&s[20], &s[27], 54, false);
+ butterfly_rotation(&s[21], &s[26], 22, false);
+ butterfly_rotation(&s[22], &s[25], 38, false);
+ butterfly_rotation(&s[23], &s[24], 6, false);
+ }
+ // stage 6.
+ HadamardRotation(&s[16], &s[17], false);
+ HadamardRotation(&s[18], &s[19], true);
+ HadamardRotation(&s[20], &s[21], false);
+ HadamardRotation(&s[22], &s[23], true);
+ HadamardRotation(&s[24], &s[25], false);
+ HadamardRotation(&s[26], &s[27], true);
+ HadamardRotation(&s[28], &s[29], false);
+ HadamardRotation(&s[30], &s[31], true);
+
+ // stage 10.
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+ butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+ butterfly_rotation(&s[26], &s[21], 24, true);
+ butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+ // stage 15.
+ HadamardRotation(&s[16], &s[19], false);
+ HadamardRotation(&s[17], &s[18], false);
+ HadamardRotation(&s[20], &s[23], true);
+ HadamardRotation(&s[21], &s[22], true);
+ HadamardRotation(&s[24], &s[27], false);
+ HadamardRotation(&s[25], &s[26], false);
+ HadamardRotation(&s[28], &s[31], true);
+ HadamardRotation(&s[29], &s[30], true);
+
+ // stage 20.
+ butterfly_rotation(&s[29], &s[18], 48, true);
+ butterfly_rotation(&s[28], &s[19], 48, true);
+ butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+ butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+ // stage 24.
+ HadamardRotation(&s[16], &s[23], false);
+ HadamardRotation(&s[17], &s[22], false);
+ HadamardRotation(&s[18], &s[21], false);
+ HadamardRotation(&s[19], &s[20], false);
+ HadamardRotation(&s[24], &s[31], true);
+ HadamardRotation(&s[25], &s[30], true);
+ HadamardRotation(&s[26], &s[29], true);
+ HadamardRotation(&s[27], &s[28], true);
+
+ // stage 27.
+ butterfly_rotation(&s[27], &s[20], 32, true);
+ butterfly_rotation(&s[26], &s[21], 32, true);
+ butterfly_rotation(&s[25], &s[22], 32, true);
+ butterfly_rotation(&s[24], &s[23], 32, true);
+
+ // stage 29.
+ HadamardRotation(&s[0], &s[31], false);
+ HadamardRotation(&s[1], &s[30], false);
+ HadamardRotation(&s[2], &s[29], false);
+ HadamardRotation(&s[3], &s[28], false);
+ HadamardRotation(&s[4], &s[27], false);
+ HadamardRotation(&s[5], &s[26], false);
+ HadamardRotation(&s[6], &s[25], false);
+ HadamardRotation(&s[7], &s[24], false);
+ HadamardRotation(&s[8], &s[23], false);
+ HadamardRotation(&s[9], &s[22], false);
+ HadamardRotation(&s[10], &s[21], false);
+ HadamardRotation(&s[11], &s[20], false);
+ HadamardRotation(&s[12], &s[19], false);
+ HadamardRotation(&s[13], &s[18], false);
+ HadamardRotation(&s[14], &s[17], false);
+ HadamardRotation(&s[15], &s[16], false);
+}
+
+// Process dct32 rows or columns, depending on the transpose flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const int32_t step,
+ const bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[32], x[32];
+
+ if (transpose) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ LoadSrc<16, 32>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ s[0] = x[0];
+ s[1] = x[16];
+ s[2] = x[8];
+ s[3] = x[24];
+ s[4] = x[4];
+ s[5] = x[20];
+ s[6] = x[12];
+ s[7] = x[28];
+ s[8] = x[2];
+ s[9] = x[18];
+ s[10] = x[10];
+ s[11] = x[26];
+ s[12] = x[6];
+ s[13] = x[22];
+ s[14] = x[14];
+ s[15] = x[30];
+
+ // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ s[16] = x[1];
+ s[17] = x[17];
+ s[18] = x[9];
+ s[19] = x[25];
+ s[20] = x[5];
+ s[21] = x[21];
+ s[22] = x[13];
+ s[23] = x[29];
+ s[24] = x[3];
+ s[25] = x[19];
+ s[26] = x[11];
+ s[27] = x[27];
+ s[28] = x[7];
+ s[29] = x[23];
+ s[30] = x[15];
+ s[31] = x[31];
+
+ Dct4Stages<ButterflyRotation_8>(s);
+ Dct8Stages<ButterflyRotation_8>(s);
+ Dct16Stages<ButterflyRotation_8>(s);
+ Dct32Stages<ButterflyRotation_8>(s);
+
+ if (transpose) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&s[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 32>(dst, step, 0, s);
+ }
+}
+
+// Allow the compiler to call this function instead of force-inlining it.
+// Tests show this is slightly faster.
+void Dct64_SSE4_1(void* dest, int32_t step, bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[64], x[32];
+
+ if (transpose) {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ for (int idx = 0; idx < 32; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ // The last 32 values of every column are always zero if the |tx_height| is
+ // 64.
+ LoadSrc<16, 32>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ s[0] = x[0];
+ s[2] = x[16];
+ s[4] = x[8];
+ s[6] = x[24];
+ s[8] = x[4];
+ s[10] = x[20];
+ s[12] = x[12];
+ s[14] = x[28];
+
+ // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ s[16] = x[2];
+ s[18] = x[18];
+ s[20] = x[10];
+ s[22] = x[26];
+ s[24] = x[6];
+ s[26] = x[22];
+ s[28] = x[14];
+ s[30] = x[30];
+
+ // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ s[32] = x[1];
+ s[34] = x[17];
+ s[36] = x[9];
+ s[38] = x[25];
+ s[40] = x[5];
+ s[42] = x[21];
+ s[44] = x[13];
+ s[46] = x[29];
+
+ // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+ s[48] = x[3];
+ s[50] = x[19];
+ s[52] = x[11];
+ s[54] = x[27];
+ s[56] = x[7];
+ s[58] = x[23];
+ s[60] = x[15];
+ s[62] = x[31];
+
+ Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+
+ //-- start dct 64 stages
+ // stage 2.
+ ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+ ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+ ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+ ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+ ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+ ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+ ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+ ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+ ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+ ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+ ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+ ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+ ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+ ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+ ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+ ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+ // stage 4.
+ HadamardRotation(&s[32], &s[33], false);
+ HadamardRotation(&s[34], &s[35], true);
+ HadamardRotation(&s[36], &s[37], false);
+ HadamardRotation(&s[38], &s[39], true);
+ HadamardRotation(&s[40], &s[41], false);
+ HadamardRotation(&s[42], &s[43], true);
+ HadamardRotation(&s[44], &s[45], false);
+ HadamardRotation(&s[46], &s[47], true);
+ HadamardRotation(&s[48], &s[49], false);
+ HadamardRotation(&s[50], &s[51], true);
+ HadamardRotation(&s[52], &s[53], false);
+ HadamardRotation(&s[54], &s[55], true);
+ HadamardRotation(&s[56], &s[57], false);
+ HadamardRotation(&s[58], &s[59], true);
+ HadamardRotation(&s[60], &s[61], false);
+ HadamardRotation(&s[62], &s[63], true);
+
+ // stage 7.
+ ButterflyRotation_8(&s[62], &s[33], 60 - 0, true);
+ ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true);
+ ButterflyRotation_8(&s[58], &s[37], 60 - 32, true);
+ ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true);
+ ButterflyRotation_8(&s[54], &s[41], 60 - 16, true);
+ ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true);
+ ButterflyRotation_8(&s[50], &s[45], 60 - 48, true);
+ ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true);
+
+ // stage 11.
+ HadamardRotation(&s[32], &s[35], false);
+ HadamardRotation(&s[33], &s[34], false);
+ HadamardRotation(&s[36], &s[39], true);
+ HadamardRotation(&s[37], &s[38], true);
+ HadamardRotation(&s[40], &s[43], false);
+ HadamardRotation(&s[41], &s[42], false);
+ HadamardRotation(&s[44], &s[47], true);
+ HadamardRotation(&s[45], &s[46], true);
+ HadamardRotation(&s[48], &s[51], false);
+ HadamardRotation(&s[49], &s[50], false);
+ HadamardRotation(&s[52], &s[55], true);
+ HadamardRotation(&s[53], &s[54], true);
+ HadamardRotation(&s[56], &s[59], false);
+ HadamardRotation(&s[57], &s[58], false);
+ HadamardRotation(&s[60], &s[63], true);
+ HadamardRotation(&s[61], &s[62], true);
+
+ // stage 16.
+ ButterflyRotation_8(&s[61], &s[34], 56, true);
+ ButterflyRotation_8(&s[60], &s[35], 56, true);
+ ButterflyRotation_8(&s[59], &s[36], 56 + 64, true);
+ ButterflyRotation_8(&s[58], &s[37], 56 + 64, true);
+ ButterflyRotation_8(&s[53], &s[42], 56 - 32, true);
+ ButterflyRotation_8(&s[52], &s[43], 56 - 32, true);
+ ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true);
+ ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true);
+
+ // stage 21.
+ HadamardRotation(&s[32], &s[39], false);
+ HadamardRotation(&s[33], &s[38], false);
+ HadamardRotation(&s[34], &s[37], false);
+ HadamardRotation(&s[35], &s[36], false);
+ HadamardRotation(&s[40], &s[47], true);
+ HadamardRotation(&s[41], &s[46], true);
+ HadamardRotation(&s[42], &s[45], true);
+ HadamardRotation(&s[43], &s[44], true);
+ HadamardRotation(&s[48], &s[55], false);
+ HadamardRotation(&s[49], &s[54], false);
+ HadamardRotation(&s[50], &s[53], false);
+ HadamardRotation(&s[51], &s[52], false);
+ HadamardRotation(&s[56], &s[63], true);
+ HadamardRotation(&s[57], &s[62], true);
+ HadamardRotation(&s[58], &s[61], true);
+ HadamardRotation(&s[59], &s[60], true);
+
+ // stage 25.
+ ButterflyRotation_8(&s[59], &s[36], 48, true);
+ ButterflyRotation_8(&s[58], &s[37], 48, true);
+ ButterflyRotation_8(&s[57], &s[38], 48, true);
+ ButterflyRotation_8(&s[56], &s[39], 48, true);
+ ButterflyRotation_8(&s[55], &s[40], 112, true);
+ ButterflyRotation_8(&s[54], &s[41], 112, true);
+ ButterflyRotation_8(&s[53], &s[42], 112, true);
+ ButterflyRotation_8(&s[52], &s[43], 112, true);
+
+ // stage 28.
+ HadamardRotation(&s[32], &s[47], false);
+ HadamardRotation(&s[33], &s[46], false);
+ HadamardRotation(&s[34], &s[45], false);
+ HadamardRotation(&s[35], &s[44], false);
+ HadamardRotation(&s[36], &s[43], false);
+ HadamardRotation(&s[37], &s[42], false);
+ HadamardRotation(&s[38], &s[41], false);
+ HadamardRotation(&s[39], &s[40], false);
+ HadamardRotation(&s[48], &s[63], true);
+ HadamardRotation(&s[49], &s[62], true);
+ HadamardRotation(&s[50], &s[61], true);
+ HadamardRotation(&s[51], &s[60], true);
+ HadamardRotation(&s[52], &s[59], true);
+ HadamardRotation(&s[53], &s[58], true);
+ HadamardRotation(&s[54], &s[57], true);
+ HadamardRotation(&s[55], &s[56], true);
+
+ // stage 30.
+ ButterflyRotation_8(&s[55], &s[40], 32, true);
+ ButterflyRotation_8(&s[54], &s[41], 32, true);
+ ButterflyRotation_8(&s[53], &s[42], 32, true);
+ ButterflyRotation_8(&s[52], &s[43], 32, true);
+ ButterflyRotation_8(&s[51], &s[44], 32, true);
+ ButterflyRotation_8(&s[50], &s[45], 32, true);
+ ButterflyRotation_8(&s[49], &s[46], 32, true);
+ ButterflyRotation_8(&s[48], &s[47], 32, true);
+
+ // stage 31.
+ for (int i = 0; i < 32; i += 4) {
+ HadamardRotation(&s[i], &s[63 - i], false);
+ HadamardRotation(&s[i + 1], &s[63 - i - 1], false);
+ HadamardRotation(&s[i + 2], &s[63 - i - 2], false);
+ HadamardRotation(&s[i + 3], &s[63 - i - 3], false);
+ }
+ //-- end dct 64 stages
+
+ if (transpose) {
+ for (int idx = 0; idx < 64; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&s[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 64>(dst, step, 0, s);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+
+template <bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[8], x[4];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<8, 8>(dst, step, 0, input);
+ Transpose4x8To8x4_U16(input, x);
+ } else {
+ LoadSrc<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
+ if (transpose) {
+ Transpose4x4_U16(x, x);
+ }
+ }
+
+ const __m128i kAdst4Multiplier_1 = _mm_set1_epi16(kAdst4Multiplier[1]);
+ const __m128i kAdst4Multiplier_2 = _mm_set1_epi16(kAdst4Multiplier[2]);
+ const __m128i kAdst4Multiplier_3 = _mm_set1_epi16(kAdst4Multiplier[3]);
+ const __m128i kAdst4Multiplier_m0_1 =
+ _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[1]) |
+ (static_cast<uint32_t>(-kAdst4Multiplier[0]) << 16));
+ const __m128i kAdst4Multiplier_3_0 =
+ _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[0]) |
+ (static_cast<uint32_t>(kAdst4Multiplier[3]) << 16));
+
+ // stage 1.
+ const __m128i x3_x0 = _mm_unpacklo_epi16(x[0], x[3]);
+ const __m128i x2_x0 = _mm_unpacklo_epi16(x[0], x[2]);
+ const __m128i zero_x1 = _mm_cvtepu16_epi32(x[1]);
+ const __m128i zero_x2 = _mm_cvtepu16_epi32(x[2]);
+ const __m128i zero_x3 = _mm_cvtepu16_epi32(x[3]);
+
+ s[5] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_1);
+ s[6] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_3);
+
+ // stage 2.
+ // ((src[0] - src[2]) + src[3]) * kAdst4Multiplier[2]
+ const __m128i k2_x3_x0 = _mm_madd_epi16(x3_x0, kAdst4Multiplier_2);
+ const __m128i k2_zero_x2 = _mm_madd_epi16(zero_x2, kAdst4Multiplier_2);
+ const __m128i b7 = _mm_sub_epi32(k2_x3_x0, k2_zero_x2);
+
+ // stage 3.
+ s[0] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_3_0);
+ s[1] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_m0_1);
+ s[2] = b7;
+ s[3] = _mm_madd_epi16(zero_x1, kAdst4Multiplier_2);
+
+ // stage 4.
+ s[0] = _mm_add_epi32(s[0], s[5]);
+ s[1] = _mm_sub_epi32(s[1], s[6]);
+
+ // stages 5 and 6.
+ x[0] = _mm_add_epi32(s[0], s[3]);
+ x[1] = _mm_add_epi32(s[1], s[3]);
+ x[2] = _mm_add_epi32(s[0], s[1]);
+ x[3] = _mm_sub_epi32(x[2], s[3]);
+
+ x[0] = RightShiftWithRounding_S32(x[0], 12);
+ x[1] = RightShiftWithRounding_S32(x[1], 12);
+ x[2] = RightShiftWithRounding_S32(s[2], 12);
+ x[3] = RightShiftWithRounding_S32(x[3], 12);
+
+ x[0] = _mm_packs_epi32(x[0], x[1]);
+ x[2] = _mm_packs_epi32(x[2], x[3]);
+ x[1] = _mm_srli_si128(x[0], 8);
+ x[3] = _mm_srli_si128(x[2], 8);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x4To4x8_U16(x, output);
+ StoreDst<8, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ Transpose4x4_U16(x, x);
+ }
+ StoreDst<8, 4>(dst, step, 0, x);
+ }
+}
+
+constexpr int16_t kAdst4DcOnlyMultiplier[8] = {1321, 0, 2482, 0,
+ 3344, 0, 2482, 1321};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src =
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+ const __m128i v_kAdst4DcOnlyMultipliers =
+ LoadUnaligned16(kAdst4DcOnlyMultiplier);
+ // s0*k0 s0*k1 s0*k2 s0*k1
+ // +
+ // s0*0 s0*0 s0*0 s0*k0
+ const __m128i x3 = _mm_madd_epi16(s0, v_kAdst4DcOnlyMultipliers);
+ const __m128i dst_0 = RightShiftWithRounding_S32(x3, 12);
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(dst_0, v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ const __m128i c = _mm_packs_epi32(b, b);
+ StoreLo8(dst, c);
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int i = 0;
+ do {
+ const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&dst[i]));
+ const __m128i kAdst4Multiplier_0 = _mm_set1_epi32(kAdst4Multiplier[0]);
+ const __m128i kAdst4Multiplier_1 = _mm_set1_epi32(kAdst4Multiplier[1]);
+ const __m128i kAdst4Multiplier_2 = _mm_set1_epi32(kAdst4Multiplier[2]);
+ const __m128i s0 = _mm_mullo_epi32(kAdst4Multiplier_0, v_src);
+ const __m128i s1 = _mm_mullo_epi32(kAdst4Multiplier_1, v_src);
+ const __m128i s2 = _mm_mullo_epi32(kAdst4Multiplier_2, v_src);
+ const __m128i x0 = s0;
+ const __m128i x1 = s1;
+ const __m128i x2 = s2;
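+ // kAdst4Multiplier[3] == kAdst4Multiplier[0] + kAdst4Multiplier[1]
+ // (sin(4*pi/9) == sin(pi/9) + sin(2*pi/9)), so the fourth output can be
+ // formed as s0 + s1 instead of a fourth multiply.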
+ const __m128i x3 = _mm_add_epi32(s0, s1);
+ const __m128i dst_0 = RightShiftWithRounding_S32(x0, 12);
+ const __m128i dst_1 = RightShiftWithRounding_S32(x1, 12);
+ const __m128i dst_2 = RightShiftWithRounding_S32(x2, 12);
+ const __m128i dst_3 = RightShiftWithRounding_S32(x3, 12);
+ const __m128i dst_0_1 = _mm_packs_epi32(dst_0, dst_1);
+ const __m128i dst_2_3 = _mm_packs_epi32(dst_2, dst_3);
+ StoreLo8(&dst[i], dst_0_1);
+ StoreHi8(&dst[i + width * 1], dst_0_1);
+ StoreLo8(&dst[i + width * 2], dst_2_3);
+ StoreHi8(&dst[i + width * 3], dst_2_3);
+ i += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[8], x[8];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ } else {
+ LoadSrc<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, 0, input);
+ Transpose8x8_U16(input, x);
+ } else {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ s[0] = x[7];
+ s[1] = x[0];
+ s[2] = x[5];
+ s[3] = x[2];
+ s[4] = x[3];
+ s[5] = x[4];
+ s[6] = x[1];
+ s[7] = x[6];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+ butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+ butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[4], false);
+ HadamardRotation(&s[1], &s[5], false);
+ HadamardRotation(&s[2], &s[6], false);
+ HadamardRotation(&s[3], &s[7], false);
+
+ // stage 4.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[2], false);
+ HadamardRotation(&s[4], &s[6], false);
+ HadamardRotation(&s[1], &s[3], false);
+ HadamardRotation(&s[5], &s[7], false);
+
+ // stage 6.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[4]);
+ x[2] = s[6];
+ x[3] = _mm_subs_epi16(v_zero, s[2]);
+ x[4] = s[3];
+ x[5] = _mm_subs_epi16(v_zero, s[7]);
+ x[6] = s[5];
+ x[7] = _mm_subs_epi16(v_zero, s[1]);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(x, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ } else {
+ StoreDst<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x8_U16(x, output);
+ StoreDst<16, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 8>(dst, step, 0, x);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ __m128i s[8];
+
+ const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ // stage 1.
+ s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ __m128i x[8];
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[4]);
+ x[2] = s[6];
+ x[3] = _mm_subs_epi16(v_zero, s[2]);
+ x[4] = s[3];
+ x[5] = _mm_subs_epi16(v_zero, s[7]);
+ x[6] = s[5];
+ x[7] = _mm_subs_epi16(v_zero, s[1]);
+
+ const __m128i x1_x0 = _mm_unpacklo_epi16(x[0], x[1]);
+ const __m128i x3_x2 = _mm_unpacklo_epi16(x[2], x[3]);
+ const __m128i x5_x4 = _mm_unpacklo_epi16(x[4], x[5]);
+ const __m128i x7_x6 = _mm_unpacklo_epi16(x[6], x[7]);
+ const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
+ const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);
+
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
+ const __m128i a1 = _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
+ StoreUnaligned16(dst, _mm_packs_epi32(b, b1));
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ __m128i s[8];
+
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ __m128i x[8];
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[4]);
+ x[2] = s[6];
+ x[3] = _mm_subs_epi16(v_zero, s[2]);
+ x[4] = s[3];
+ x[5] = _mm_subs_epi16(v_zero, s[7]);
+ x[6] = s[5];
+ x[7] = _mm_subs_epi16(v_zero, s[1]);
+
+ for (int j = 0; j < 8; ++j) {
+ StoreLo8(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[16], x[16];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ LoadSrc<16, 4>(dst, step, 8, input);
+ Transpose8x4To4x8_U16(input, &x[8]);
+ } else {
+ LoadSrc<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ LoadSrc<16, 16>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ s[0] = x[15];
+ s[1] = x[0];
+ s[2] = x[13];
+ s[3] = x[2];
+ s[4] = x[11];
+ s[5] = x[4];
+ s[6] = x[9];
+ s[7] = x[6];
+ s[8] = x[7];
+ s[9] = x[8];
+ s[10] = x[5];
+ s[11] = x[10];
+ s[12] = x[3];
+ s[13] = x[12];
+ s[14] = x[1];
+ s[15] = x[14];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+ butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+ butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+ butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+ butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+ butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+ butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[8], false);
+ HadamardRotation(&s[1], &s[9], false);
+ HadamardRotation(&s[2], &s[10], false);
+ HadamardRotation(&s[3], &s[11], false);
+ HadamardRotation(&s[4], &s[12], false);
+ HadamardRotation(&s[5], &s[13], false);
+ HadamardRotation(&s[6], &s[14], false);
+ HadamardRotation(&s[7], &s[15], false);
+
+ // stage 4.
+ butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+ butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+ butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[4], false);
+ HadamardRotation(&s[8], &s[12], false);
+ HadamardRotation(&s[1], &s[5], false);
+ HadamardRotation(&s[9], &s[13], false);
+ HadamardRotation(&s[2], &s[6], false);
+ HadamardRotation(&s[10], &s[14], false);
+ HadamardRotation(&s[3], &s[7], false);
+ HadamardRotation(&s[11], &s[15], false);
+
+ // stage 6.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+ // stage 7.
+ HadamardRotation(&s[0], &s[2], false);
+ HadamardRotation(&s[4], &s[6], false);
+ HadamardRotation(&s[8], &s[10], false);
+ HadamardRotation(&s[12], &s[14], false);
+ HadamardRotation(&s[1], &s[3], false);
+ HadamardRotation(&s[5], &s[7], false);
+ HadamardRotation(&s[9], &s[11], false);
+ HadamardRotation(&s[13], &s[15], false);
+
+ // stage 8.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[10], &s[11], 32, true);
+ butterfly_rotation(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[8]);
+ x[2] = s[12];
+ x[3] = _mm_subs_epi16(v_zero, s[4]);
+ x[4] = s[6];
+ x[5] = _mm_subs_epi16(v_zero, s[14]);
+ x[6] = s[10];
+ x[7] = _mm_subs_epi16(v_zero, s[2]);
+ x[8] = s[3];
+ x[9] = _mm_subs_epi16(v_zero, s[11]);
+ x[10] = s[15];
+ x[11] = _mm_subs_epi16(v_zero, s[7]);
+ x[12] = s[5];
+ x[13] = _mm_subs_epi16(v_zero, s[13]);
+ x[14] = s[9];
+ x[15] = _mm_subs_epi16(v_zero, s[1]);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(x, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ Transpose4x8To8x4_U16(&x[8], output);
+ StoreDst<16, 4>(dst, step, 8, output);
+ } else {
+ StoreDst<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&x[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 16>(dst, step, 0, x);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(__m128i* s, __m128i* x) {
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+ // stage 3.
+ s[8] = s[0];
+ s[9] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+ // stage 5.
+ s[4] = s[0];
+ s[12] = s[8];
+ s[5] = s[1];
+ s[13] = s[9];
+
+ // stage 6.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+ ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+ // stage 7.
+ s[2] = s[0];
+ s[6] = s[4];
+ s[10] = s[8];
+ s[14] = s[12];
+ s[3] = s[1];
+ s[7] = s[5];
+ s[11] = s[9];
+ s[15] = s[13];
+
+ // stage 8.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+ ButterflyRotation_4(&s[10], &s[11], 32, true);
+ ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[8]);
+ x[2] = s[12];
+ x[3] = _mm_subs_epi16(v_zero, s[4]);
+ x[4] = s[6];
+ x[5] = _mm_subs_epi16(v_zero, s[14]);
+ x[6] = s[10];
+ x[7] = _mm_subs_epi16(v_zero, s[2]);
+ x[8] = s[3];
+ x[9] = _mm_subs_epi16(v_zero, s[11]);
+ x[10] = s[15];
+ x[11] = _mm_subs_epi16(v_zero, s[7]);
+ x[12] = s[5];
+ x[13] = _mm_subs_epi16(v_zero, s[13]);
+ x[14] = s[9];
+ x[15] = _mm_subs_epi16(v_zero, s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ __m128i s[16];
+ __m128i x[16];
+
+ const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ // stage 1.
+ s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);
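+ // v_mask is all ones when should_round is set, so the blend above selects
+ // the pre-rounded source; otherwise v_src passes through unchanged.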
+
+ Adst16DcOnlyInternal(s, x);
+
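+ // Each iteration gathers element 0 of eight of the sixteen outputs,
+ // applies the rounded row shift, and stores eight results.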
+ for (int i = 0; i < 2; ++i) {
+ const __m128i x1_x0 = _mm_unpacklo_epi16(x[0 + i * 8], x[1 + i * 8]);
+ const __m128i x3_x2 = _mm_unpacklo_epi16(x[2 + i * 8], x[3 + i * 8]);
+ const __m128i x5_x4 = _mm_unpacklo_epi16(x[4 + i * 8], x[5 + i * 8]);
+ const __m128i x7_x6 = _mm_unpacklo_epi16(x[6 + i * 8], x[7 + i * 8]);
+ const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
+ const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);
+
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
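+ // _mm_sra_epi32 takes its shift count from the low 64 bits of its second
+ // operand, hence the widening move above.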
+ const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
+ const __m128i a1 =
+ _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
+ StoreUnaligned16(&dst[i * 8], _mm_packs_epi32(b, b1));
+ }
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+ int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int i = 0;
+ do {
+ __m128i s[16];
+ __m128i x[16];
+ const __m128i v_src = LoadUnaligned16(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int j = 0; j < 16; ++j) {
+ StoreLo8(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+template <bool is_row_shift>
+LIBGAV1_ALWAYS_INLINE void Identity4_SSE4_1(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ if (is_row_shift) {
+ const int shift = 1;
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
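+ // Interleaving the rounding constant with the source lets one
+ // _mm_madd_epi16 compute v_dual_round * 1 + v_src * kIdentity4Multiplier
+ // per 32-bit lane ahead of the arithmetic shift.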
+ for (int i = 0; i < 4; i += 2) {
+ const __m128i v_src = LoadUnaligned16(&dst[i * step]);
+ const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i v_src_round_hi = _mm_unpackhi_epi16(v_dual_round, v_src);
+ const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
+ const __m128i a_hi = _mm_madd_epi16(v_src_round_hi, v_multiplier_one);
+ const __m128i b = _mm_srai_epi32(a, 12 + shift);
+ const __m128i b_hi = _mm_srai_epi32(a_hi, 12 + shift);
+ StoreUnaligned16(&dst[i * step], _mm_packs_epi32(b, b_hi));
+ }
+ } else {
+ const __m128i v_multiplier =
+ _mm_set1_epi16(kIdentity4MultiplierFraction << 3);
+ for (int i = 0; i < 4; i += 2) {
+ const __m128i v_src = LoadUnaligned16(&dst[i * step]);
+ const __m128i a = _mm_mulhrs_epi16(v_src, v_multiplier);
+ const __m128i b = _mm_adds_epi16(a, v_src);
+ StoreUnaligned16(&dst[i * step], b);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+ const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round, v_mask);
+
+ const int shift = (tx_height < 16) ? 0 : 1;
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
+ const __m128i v_src_round_lo = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i a = _mm_madd_epi16(v_src_round_lo, v_multiplier_one);
+ const __m128i b = _mm_srai_epi32(a, 12 + shift);
+ dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height,
+ const int16_t* LIBGAV1_RESTRICT source) {
+ const int stride = frame.columns();
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+ const __m128i v_multiplier_fraction =
+ _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
+ const __m128i v_eight = _mm_set1_epi16(8);
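+ // The identity4 scale is kIdentity4Multiplier / 4096 (roughly sqrt(2)),
+ // applied as src + src * (kIdentity4MultiplierFraction / 4096) so the
+ // multiply stays within _mm_mulhrs_epi16 range.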
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(&source[i * tx_width]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+ const __m128i frame_data = Load4(dst);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_src_mult =
+ _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height,
+ const int16_t* LIBGAV1_RESTRICT source) {
+ const int stride = frame.columns();
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+ const __m128i v_multiplier_fraction =
+ _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
+ const __m128i v_eight = _mm_set1_epi16(8);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(&source[i * tx_width]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+ const __m128i frame_data = Load4(dst);
+ const __m128i v_dst_row = _mm_adds_epi16(v_src_mult, v_src);
+ const __m128i v_src_mult2 =
+ _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
+ const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
+ const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
+ const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_adds_epi16(frame_data16, b);
+ Store4(dst, _mm_packus_epi16(c, c));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ const __m128i v_dst_row = _mm_adds_epi16(v_src_round, v_src_round);
+ const __m128i v_src_mult2 =
+ _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
+ const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
+ const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_adds_epi16(frame_data16, b);
+ StoreLo8(dst + j, _mm_packus_epi16(c, c));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height equal to 32 can be simplified from
+ // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
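+ // _mm_mulhrs_epi16 computes (a * b + (1 << 14)) >> 15, so with
+ // b = 1 << 14 the result is exactly (a + 1) >> 1.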
+ const __m128i v_row_multiplier = _mm_set1_epi16(1 << 14);
+ for (int h = 0; h < 4; ++h) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_row_multiplier);
+ StoreUnaligned16(&dst[h * step], v_src_mult);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ for (int h = 0; h < 4; ++h) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+ // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+ // saturating add here is ok.
+ const __m128i a = _mm_adds_epi16(v_src, v_src);
+ StoreUnaligned16(&dst[h * step], a);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+ const __m128i v_src =
+ _mm_cvtepi16_epi32(_mm_blendv_epi8(v_src0, v_src_round, v_mask));
+ const __m128i v_srcx2 = _mm_add_epi32(v_src, v_src);
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(v_srcx2, v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8ColumnStoreToFrame_SSE4_1(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height,
+ const int16_t* LIBGAV1_RESTRICT source) {
+ const int stride = frame.columns();
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+ const __m128i v_eight = _mm_set1_epi16(8);
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ const __m128i v_src = LoadLo8(&source[row]);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+ const __m128i frame_data = Load4(dst);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, int32_t step,
+ int shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
+ const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
+
+ for (int h = 0; h < 4; ++h) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+ const __m128i v_src2 = LoadUnaligned16(&dst[h * step + 8]);
+ const __m128i v_src_round0 = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i v_src_round1 = _mm_unpackhi_epi16(v_dual_round, v_src);
+ const __m128i v_src2_round0 = _mm_unpacklo_epi16(v_dual_round, v_src2);
+ const __m128i v_src2_round1 = _mm_unpackhi_epi16(v_dual_round, v_src2);
+ const __m128i madd0 = _mm_madd_epi16(v_src_round0, v_multiplier_one);
+ const __m128i madd1 = _mm_madd_epi16(v_src_round1, v_multiplier_one);
+ const __m128i madd20 = _mm_madd_epi16(v_src2_round0, v_multiplier_one);
+ const __m128i madd21 = _mm_madd_epi16(v_src2_round1, v_multiplier_one);
+ const __m128i shift0 = _mm_sra_epi32(madd0, v_shift);
+ const __m128i shift1 = _mm_sra_epi32(madd1, v_shift);
+ const __m128i shift20 = _mm_sra_epi32(madd20, v_shift);
+ const __m128i shift21 = _mm_sra_epi32(madd21, v_shift);
+ StoreUnaligned16(&dst[h * step], _mm_packs_epi32(shift0, shift1));
+ StoreUnaligned16(&dst[h * step + 8], _mm_packs_epi32(shift20, shift21));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round0 =
+ _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+ const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round0, v_mask);
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
+ const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
+ const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
+ const __m128i b = _mm_sra_epi32(a, v_shift);
+ dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16ColumnStoreToFrame_SSE4_1(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height,
+ const int16_t* LIBGAV1_RESTRICT source) {
+ const int stride = frame.columns();
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+ const __m128i v_eight = _mm_set1_epi16(8);
+ const __m128i v_multiplier =
+ _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 4));
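+ // The identity16 scale is roughly 2 * sqrt(2), applied as 2 * src plus
+ // src * (kIdentity4MultiplierFraction / 2048) via _mm_mulhrs_epi16.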
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(&source[i * tx_width]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
+ const __m128i frame_data = Load4(dst);
+ const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_SSE4_1(void* dest,
+ const int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+ // (((A * 4) + 1) >> 1) to (A * 2).
+ for (int h = 0; h < 4; ++h) {
+ for (int i = 0; i < 32; i += 8) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step + i]);
+ // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+ // saturating add here is ok.
+ const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+ StoreUnaligned16(&dst[h * step + i], v_dst_i);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+ int adjusted_tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src = _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+ // (((A * 4) + 1) >> 1) to (A * 2).
+ const __m128i v_dst_0 = _mm_adds_epi16(v_src, v_src);
+ dst[0] = _mm_extract_epi16(v_dst_0, 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height,
+ const int16_t* LIBGAV1_RESTRICT source) {
+ const int stride = frame.columns();
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+ const __m128i v_two = _mm_set1_epi16(2);
+
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_dst_i = LoadUnaligned16(&source[row + j]);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_two);
+ const __m128i b = _mm_srai_epi16(a, 2);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+// Process 4 wht4 rows and columns.
+LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
+ const int start_x, const int start_y,
+ const void* LIBGAV1_RESTRICT source,
+ const int adjusted_tx_height) {
+ const auto* const src = static_cast<const int16_t*>(source);
+ __m128i s[4], x[4];
+
+ if (adjusted_tx_height == 1) {
+ // Special case: only src[0] is nonzero.
+ // src[0] 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ //
+ // After the row and column transforms are applied, we have:
+ // f h h h
+ // g i i i
+ // g i i i
+ // g i i i
+ // where f, g, h, i are computed as follows.
+ int16_t f = (src[0] >> 2) - (src[0] >> 3);
+ const int16_t g = f >> 1;
+ f = f - (f >> 1);
+ const int16_t h = (src[0] >> 3) - (src[0] >> 4);
+ const int16_t i = (src[0] >> 4);
+ s[0] = _mm_set1_epi16(h);
+ s[0] = _mm_insert_epi16(s[0], f, 0);
+ s[1] = _mm_set1_epi16(i);
+ s[1] = _mm_insert_epi16(s[1], g, 0);
+ s[2] = s[3] = s[1];
+ } else {
+ x[0] = LoadLo8(&src[0 * 4]);
+ x[2] = LoadLo8(&src[1 * 4]);
+ x[3] = LoadLo8(&src[2 * 4]);
+ x[1] = LoadLo8(&src[3 * 4]);
+
+ // Row transforms.
+ Transpose4x4_U16(x, x);
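+ // The 1d wht4 is a lifting sequence: an add and a subtract, a shared
+ // half-difference e, two subtractions from e, then a final subtract/add.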
+ s[0] = _mm_srai_epi16(x[0], 2);
+ s[2] = _mm_srai_epi16(x[1], 2);
+ s[3] = _mm_srai_epi16(x[2], 2);
+ s[1] = _mm_srai_epi16(x[3], 2);
+ s[0] = _mm_add_epi16(s[0], s[2]);
+ s[3] = _mm_sub_epi16(s[3], s[1]);
+ __m128i e = _mm_sub_epi16(s[0], s[3]);
+ e = _mm_srai_epi16(e, 1);
+ s[1] = _mm_sub_epi16(e, s[1]);
+ s[2] = _mm_sub_epi16(e, s[2]);
+ s[0] = _mm_sub_epi16(s[0], s[1]);
+ s[3] = _mm_add_epi16(s[3], s[2]);
+ Transpose4x4_U16(s, s);
+
+ // Column transforms.
+ s[0] = _mm_add_epi16(s[0], s[2]);
+ s[3] = _mm_sub_epi16(s[3], s[1]);
+ e = _mm_sub_epi16(s[0], s[3]);
+ e = _mm_srai_epi16(e, 1);
+ s[1] = _mm_sub_epi16(e, s[1]);
+ s[2] = _mm_sub_epi16(e, s[2]);
+ s[0] = _mm_sub_epi16(s[0], s[1]);
+ s[3] = _mm_add_epi16(s[3], s[2]);
+ }
+
+ // Store to frame.
+ const int stride = frame.columns();
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+ for (int row = 0; row < 4; ++row) {
+ const __m128i frame_data = Load4(dst);
+ const __m128i a = _mm_cvtepu8_epi16(frame_data);
+ const __m128i b = _mm_add_epi16(a, s[row]);
+ Store4(dst, _mm_packus_epi16(b, b));
+ dst += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height,
+ const int16_t* LIBGAV1_RESTRICT source, TransformType tx_type) {
+ const bool flip_rows =
+ enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+ const __m128i v_eight = _mm_set1_epi16(8);
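+ // The final column shift is 4: each residual is rounded via
+ // (residual + 8) >> 4, added to the frame data, and clamped to 8 bits.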
+ const int stride = frame.columns();
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+ if (tx_width == 4) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+ const __m128i residual = LoadLo8(&source[row]);
+ const __m128i frame_data = Load4(dst);
+ // Saturate to prevent overflowing int16_t.
+ const __m128i a = _mm_adds_epi16(residual, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ }
+ } else if (tx_width == 8) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
+ const __m128i residual = LoadUnaligned16(&source[row]);
+ const __m128i frame_data = LoadLo8(dst);
+ // Saturate to prevent overflowing int16_t.
+ const __m128i b = _mm_adds_epi16(residual, v_eight);
+ const __m128i c = _mm_srai_epi16(b, 4);
+ const __m128i d = _mm_cvtepu8_epi16(frame_data);
+ const __m128i e = _mm_adds_epi16(d, c);
+ StoreLo8(dst, _mm_packus_epi16(e, e));
+ dst += stride;
+ }
+ } else {
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+ int j = 0;
+ do {
+ const int x = start_x + j;
+ const __m128i residual = LoadUnaligned16(&source[row + j]);
+ const __m128i residual_hi = LoadUnaligned16(&source[row + j + 8]);
+ const __m128i frame_data = LoadUnaligned16(frame[y] + x);
+ const __m128i b = _mm_adds_epi16(residual, v_eight);
+ const __m128i b_hi = _mm_adds_epi16(residual_hi, v_eight);
+ const __m128i c = _mm_srai_epi16(b, 4);
+ const __m128i c_hi = _mm_srai_epi16(b_hi, 4);
+ const __m128i d = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d_hi = _mm_cvtepu8_epi16(_mm_srli_si128(frame_data, 8));
+ const __m128i e = _mm_adds_epi16(d, c);
+ const __m128i e_hi = _mm_adds_epi16(d_hi, c_hi);
+ StoreUnaligned16(frame[y] + x, _mm_packus_epi16(e, e_hi));
+ j += 16;
+ } while (j < tx_width);
+ }
+ }
+}
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
+ const __m128i word_reverse_8 =
+ _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
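+ // word_reverse_8 is a byte-shuffle mask that reverses the order of the
+ // eight 16-bit values in a 128-bit register.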
+ if (tx_width >= 16) {
+ int i = 0;
+ do {
+ // Read 16 shorts.
+ const __m128i v3210 = LoadUnaligned16(&source[i]);
+ const __m128i v7654 = LoadUnaligned16(&source[i + 8]);
+ const __m128i v0123 = _mm_shuffle_epi8(v3210, word_reverse_8);
+ const __m128i v4567 = _mm_shuffle_epi8(v7654, word_reverse_8);
+ StoreUnaligned16(&source[i], v4567);
+ StoreUnaligned16(&source[i + 8], v0123);
+ i += 16;
+ } while (i < tx_width * tx_height);
+ } else if (tx_width == 8) {
+ for (int i = 0; i < 8 * tx_height; i += 8) {
+ const __m128i a = LoadUnaligned16(&source[i]);
+ const __m128i b = _mm_shuffle_epi8(a, word_reverse_8);
+ StoreUnaligned16(&source[i], b);
+ }
+ } else {
+ const __m128i dual_word_reverse_4 =
+ _mm_set_epi32(0x09080b0a, 0x0d0c0f0e, 0x01000302, 0x05040706);
+ // Process two rows per iteration.
+ for (int i = 0; i < 4 * tx_height; i += 8) {
+ const __m128i a = LoadUnaligned16(&source[i]);
+ const __m128i b = _mm_shuffle_epi8(a, dual_word_reverse_4);
+ StoreUnaligned16(&source[i], b);
+ }
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
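+ // In the _mm_mulhrs_epi16 form below this scales each residual by
+ // kTransformRowMultiplier / 4096, i.e. approximately 1 / sqrt(2).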
+ if (tx_width == 4) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const __m128i a = LoadUnaligned16(&source[i]);
+ const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
+ StoreUnaligned16(&source[i], b);
+ i += 8;
+ } while (i < tx_width * num_rows);
+ } else {
+ int i = 0;
+ do {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ const int non_zero_width = (tx_width < 64) ? tx_width : 32;
+ int j = 0;
+ do {
+ const __m128i a = LoadUnaligned16(&source[i * tx_width + j]);
+ const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
+ StoreUnaligned16(&source[i * tx_width + j], b);
+ j += 8;
+ } while (j < non_zero_width);
+ } while (++i < num_rows);
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
+ int row_shift) {
+ const __m128i v_row_shift_add = _mm_set1_epi16(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu16_epi64(v_row_shift_add);
+ if (tx_width == 4) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const __m128i residual = LoadUnaligned16(&source[i]);
+ const __m128i shifted_residual =
+ ShiftResidual(residual, v_row_shift_add, v_row_shift);
+ StoreUnaligned16(&source[i], shifted_residual);
+ i += 8;
+ } while (i < tx_width * num_rows);
+ } else {
+ int i = 0;
+ do {
+ for (int j = 0; j < tx_width; j += 8) {
+ const __m128i residual = LoadUnaligned16(&source[i * tx_width + j]);
+ const __m128i shifted_residual =
+ ShiftResidual(residual, v_row_shift_add, v_row_shift);
+ StoreUnaligned16(&source[i * tx_width + j], shifted_residual);
+ }
+ } while (++i < num_rows);
+ }
+}
+
+void Dct4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ const int row_shift = static_cast<int>(tx_height == 16);
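+ // For a 4-point row transform only tx_height 8 requires rounding and only
+ // tx_height 16 requires a row shift, so the usual table lookups reduce to
+ // the comparisons above.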
+
+ if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct4 rows in parallel.
+ Dct4_SSE4_1<ButterflyRotation_4, false>(src, /*step=*/4,
+ /*transpose=*/true);
+ } else {
+ // Process 8 1d dct4 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i * 4], /*step=*/4,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (tx_height == 16) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Dct4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct4 columns in parallel.
+ Dct4_SSE4_1<ButterflyRotation_4, false>(src, tx_width,
+ /*transpose=*/false);
+ } else {
+ // Process 8 1d dct4 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 4, src, tx_type);
+}
+
+void Dct8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct8 rows in parallel.
+ Dct8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+ } else {
+ // Process 8 1d dct8 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Dct8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct8 columns in parallel.
+ Dct8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ // Process 8 1d dct8 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 8, src, tx_type);
+}
+
+void Dct16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct16 rows in parallel.
+ Dct16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d dct16 rows in parallel per iteration.
+ Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ // row_shift is always non-zero here.
+ RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct16 columns in parallel.
+ Dct16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d dct16 columns in parallel per iteration.
+ Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 16, src, tx_type);
+}
+
+void Dct32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<32>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct32 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct32_SSE4_1(&src[i * 32], 32, /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ // row_shift is always non-zero here.
+ RowShift<32>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct32TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct32 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct32_SSE4_1(&src[i], tx_width, /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 32, src, tx_type);
+}
+
+void Dct64TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<64>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct64 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct64_SSE4_1(&src[i * 64], 64, /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ // row_shift is always non-zero here.
+ RowShift<64>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct64TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct64 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct64_SSE4_1(&src[i], tx_width, /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 64, src, tx_type);
+}
+
+void Adst4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const int row_shift = static_cast<int>(tx_height == 16);
+ const bool should_round = (tx_height == 8);
+
+ if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst4 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Adst4_SSE4_1<false>(&src[i * 4], /*step=*/4, /*transpose=*/true);
+ i += 4;
+ } while (i < adjusted_tx_height);
+
+ if (row_shift != 0) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Adst4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst4 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Adst4_SSE4_1<false>(&src[i], tx_width, /*transpose=*/false);
+ i += 4;
+ } while (i < tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, 4, src, tx_type);
+}
+
+void Adst8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d adst8 rows in parallel.
+ Adst8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8,
+ /*transpose=*/true);
+ } else {
+ // Process 8 1d adst8 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Adst8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d adst8 columns in parallel.
+ Adst8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ // Process 8 1d adst8 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, 8, src, tx_type);
+}
+
+void Adst16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d adst16 rows in parallel.
+ Adst16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d adst16 rows in parallel per iteration.
+ Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ // row_shift is always non-zero here.
+ RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Adst16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d adst16 columns in parallel.
+ Adst16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d adst16 columns in parallel per iteration.
+ Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, 16, src, tx_type);
+}
+
+void Identity4TransformLoopRow_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize4x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+ if (tx_height < 16) {
+ int i = 0;
+ do {
+ Identity4_SSE4_1<false>(&src[i * 4], /*step=*/4);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ } else {
+ int i = 0;
+ do {
+ Identity4_SSE4_1<true>(&src[i * 4], /*step=*/4);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ }
+}
+
+void Identity4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ // Special case: Process row calculations during column transform call.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+ Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+ return;
+ }
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize8x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+ if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 16 can be simplified
+ // from (((A * 2) + 1) >> 1) to A.
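+ // tx_height & 0x18 is non-zero exactly for tx_height 8 and 16, the cases
+ // the simplification reduces to the identity, so nothing is left to do.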
+ if ((tx_height & 0x18) != 0) {
+ return;
+ }
+ if (tx_height == 32) {
+ int i = 0;
+ do {
+ Identity8Row32_SSE4_1(&src[i * 8], /*step=*/8);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ return;
+ }
+
+ assert(tx_size == kTransformSize8x4);
+ int i = 0;
+ do {
+ Identity8Row4_SSE4_1(&src[i * 8], /*step=*/8);
+ i += 4;
+ } while (i < adjusted_tx_height);
+}
+
+void Identity8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+ if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+ int i = 0;
+ do {
+ Identity16Row_SSE4_1(&src[i * 16], /*step=*/16, row_shift);
+ i += 4;
+ } while (i < adjusted_tx_height);
+}
+
+void Identity16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ const int tx_height = kTransformHeight[tx_size];
+ // When combining the identity32 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 32 can be simplified
+ // from (((A * 4) + 2) >> 2) to A.
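+ // tx_height & 0x28 is non-zero exactly for tx_height 8 and 32, the cases
+ // the simplification reduces to the identity, so nothing is left to do.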
+ if ((tx_height & 0x28) != 0) {
+ return;
+ }
+
+ // Process kTransformSize32x16. The src is always rounded before the
+ // identity transform and shifted by 1 afterwards.
+ auto* src = static_cast<int16_t*>(src_buffer);
+ if (Identity32DcOnly(src, adjusted_tx_height)) {
+ return;
+ }
+
+ assert(tx_size == kTransformSize32x16);
+ ApplyRounding<32>(src, adjusted_tx_height);
+ int i = 0;
+ do {
+ Identity32Row16_SSE4_1(&src[i * 32], /*step=*/32);
+ i += 4;
+ } while (i < adjusted_tx_height);
+}
+
+void Identity32TransformLoopColumn_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_SSE4_1(TransformType tx_type, TransformSize tx_size,
+ int /*adjusted_tx_height*/,
+ void* /*src_buffer*/, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+ // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+
+ // Do both row and column transforms in the column-transform pass.
+ // Process 4 1d wht4 rows and columns in parallel.
+ const auto* src = static_cast<int16_t*>(src_buffer);
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ Wht4_SSE4_1(frame, start_x, start_y, src, adjusted_tx_height);
+}
+
+//------------------------------------------------------------------------------
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+
+ // Maximum transform size for Dct is 64.
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dDct)
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+ Dct4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+ Dct4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dDct)
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+ Dct8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+ Dct8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dDct)
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+ Dct16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+ Dct16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize32_Transform1dDct)
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+ Dct32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+ Dct32TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize64_Transform1dDct)
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+ Dct64TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+ Dct64TransformLoopColumn_SSE4_1;
+#endif
+
+ // Maximum transform size for Adst is 16.
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dAdst)
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+ Adst4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+ Adst4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dAdst)
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+ Adst8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+ Adst8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dAdst)
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+ Adst16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+ Adst16TransformLoopColumn_SSE4_1;
+#endif
+
+ // Maximum transform size for Identity transform is 32.
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dIdentity)
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+ Identity4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+ Identity4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dIdentity)
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+ Identity8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+ Identity8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dIdentity)
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+ Identity16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+ Identity16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize32_Transform1dIdentity)
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+ Identity32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+ Identity32TransformLoopColumn_SSE4_1;
+#endif
+
+ // Maximum transform size for Wht is 4.
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dWht)
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+ Wht4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+ Wht4TransformLoopColumn_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms, see the defines below for specifics.
+// This function is not thread-safe.
+void InverseTransformInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+#endif // LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
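+// Returns total + a1 + a2 - s1 - s2. The f6/f8/f14 filters below use this to
+// slide a running sum from one output's tap set to the next: two new samples
+// are added and two old ones are removed per output.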
+inline __m128i FilterAdd2Sub2(const __m128i& total, const __m128i& a1,
+ const __m128i& a2, const __m128i& s1,
+ const __m128i& s2) {
+ __m128i x = _mm_add_epi16(a1, total);
+ x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(s1, s2)), a2);
+ return x;
+}
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
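+// |a - b| for unsigned bytes: at least one of the two saturating differences
+// is zero, so OR-ing them yields the absolute difference.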
+inline __m128i AbsDiff(const __m128i& a, const __m128i& b) {
+ return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
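+// Evaluates abs(p0 - q0) * 2 + abs(p1 - q1) / 2 against outer_thresh and
+// returns a nonzero byte wherever the threshold is exceeded. The 0xfe mask
+// clears each byte's low bit so that the 16-bit shift cannot leak a bit
+// across byte boundaries (SSE has no 8-bit shift).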
+inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& outer_thresh) {
+ const __m128i fe = _mm_set1_epi8(static_cast<int8_t>(0xfe));
+ // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+ const __m128i abs_pmq = AbsDiff(p1p0, q1q0);
+ const __m128i a = _mm_adds_epu8(abs_pmq, abs_pmq);
+ const __m128i b = _mm_srli_epi16(_mm_and_si128(abs_pmq, fe), 1);
+ const __m128i c = _mm_adds_epu8(a, _mm_srli_si128(b, 4));
+ return _mm_subs_epu8(c, outer_thresh);
+}
+
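+// High edge variance: returns 0xff bytes where
+// max(|p1 - p0|, |q1 - q0|) > hev_thresh. Note hev_thresh holds 16-bit
+// lanes, so the byte maxima are widened before the compare.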
+inline __m128i Hev(const __m128i& qp1, const __m128i& qp0,
+ const __m128i& hev_thresh) {
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq =
+ _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4));
+ const __m128i hev_mask0 = _mm_cvtepu8_epi16(max_pq);
+ const __m128i hev_mask1 = _mm_cmpgt_epi16(hev_mask0, hev_thresh);
+ const __m128i hev_mask = _mm_packs_epi16(hev_mask1, hev_mask1);
+ return hev_mask;
+}
+
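+// (a + b) >> 3 on signed bytes. SSE has no 8-bit arithmetic shift, so each
+// byte is replicated into both halves of a 16-bit lane and shifted by 8 + 3.
+// AddShift1 below is the same trick with a shift of 1.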
+inline __m128i AddShift3(const __m128i& a, const __m128i& b) {
+ const __m128i c = _mm_adds_epi8(a, b);
+ const __m128i d = _mm_unpacklo_epi8(c, c);
+ const __m128i e = _mm_srai_epi16(d, 11); /* >> 3 */
+ return _mm_packs_epi16(e, e);
+}
+
+inline __m128i AddShift1(const __m128i& a, const __m128i& b) {
+ const __m128i c = _mm_adds_epi8(a, b);
+ const __m128i d = _mm_unpacklo_epi8(c, c);
+ const __m128i e = _mm_srai_epi16(d, 9); /* >> 1 */
+ return _mm_packs_epi16(e, e);
+}
+
+//------------------------------------------------------------------------------
+// 4-tap filters
+
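+// Returns 0xff in each byte where both threshold tests pass (neither the
+// outer nor the inner limit is exceeded), i.e. where the filter applies.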
+inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i inner_mask = _mm_subs_epu8(
+ _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi8(a, zero);
+ return b;
+}
+
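+// Applies the AV1 4-tap deblocking filter to the packed qp registers. In the
+// 8bpp code a qpN register holds the four pN pixels in its low four bytes
+// and the four qN pixels in the next four. Per pixel, with all intermediate
+// sums saturated to the signed 8-bit range around the 0x80 pivot:
+//   a   = (p1 - q1) if hev else 0
+//   a   = (a + 3 * (q0 - p0)) if mask else 0
+//   q0' = q0 - ((a + 4) >> 3)
+//   p0' = p0 + ((a + 3) >> 3)
+//   if !hev: a3 = (((a + 4) >> 3) + 1) >> 1; p1' = p1 + a3; q1' = q1 - a3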
+inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1,
+ __m128i* oqp0, const __m128i& mask, const __m128i& hev) {
+ const __m128i t80 = _mm_set1_epi8(static_cast<int8_t>(0x80));
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1);
+ const __m128i qps1qps0 = _mm_xor_si128(qp1qp0, t80);
+ const __m128i ps1qs0 = _mm_shuffle_epi32(qps1qps0, 0x09);
+ const __m128i qs1ps0 = _mm_shuffle_epi32(qps1qps0, 0x0c);
+ const __m128i _hev = _mm_unpacklo_epi32(hev, hev);
+ const __m128i x = _mm_subs_epi8(ps1qs0, qs1ps0);
+ __m128i a = _mm_and_si128(_mm_srli_si128(x, 4), _hev);
+
+ a = _mm_adds_epi8(a, x);
+ a = _mm_adds_epi8(a, x);
+ a = _mm_adds_epi8(a, x);
+ a = _mm_and_si128(a, mask);
+ a = _mm_unpacklo_epi32(a, a);
+
+ const __m128i t4t3 = _mm_set_epi32(0x0, 0x0, 0x04040404, 0x03030303);
+ const __m128i a1a2 = AddShift3(a, t4t3);
+ const __m128i a1a1 = _mm_shuffle_epi32(a1a2, 0x55);
+ const __m128i a3a3 = _mm_andnot_si128(_hev, AddShift1(a1a1, t1));
+ // Sign multipliers, byte 0 first: 1 1 1 1 -1 -1 -1 -1 1 1 1 1 -1 -1 -1 -1
+ const __m128i adjust_sign_for_add =
+ _mm_unpacklo_epi32(t1, _mm_cmpeq_epi8(t1, t1));
+
+ const __m128i a3a3a1a2 = _mm_unpacklo_epi64(a1a2, a3a3);
+ const __m128i ma3a3ma1a2 = _mm_sign_epi8(a3a3a1a2, adjust_sign_for_add);
+
+ const __m128i b = _mm_adds_epi8(qps1qps0, ma3a3ma1a2);
+ const __m128i c = _mm_xor_si128(b, t80);
+
+ *oqp0 = c;
+ *oqp1 = _mm_srli_si128(c, 8);
+}
+
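+// Filters a 4-pixel segment of a horizontal edge: one row each of p1..q1 is
+// loaded around the edge and packed into the qp layout described above.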
+void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh), 0);
+
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose4x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3) {
+ // input
+ // x0 00 01 02 03 xx xx xx xx xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 xx xx xx xx xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 xx xx xx xx xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ // output
+ // d0 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // d1 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // d2 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // d3 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ *d0 = _mm_unpacklo_epi16(w0, w1);
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d1 = _mm_srli_si128(*d0, 4);
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(*d0, 8);
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(*d0, 12);
+}
+
+void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = Load4(dst - 2 + 0 * stride);
+ __m128i x1 = Load4(dst - 2 + 1 * stride);
+ __m128i x2 = Load4(dst - 2 + 2 * stride);
+ __m128i x3 = Load4(dst - 2 + 3 * stride);
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i d0 = _mm_unpacklo_epi16(w0, w1);
+ const __m128i qp1 = _mm_shuffle_epi32(d0, 0xc);
+ const __m128i qp0 = _mm_srli_si128(d0, 4);
+ const __m128i q1q0 = _mm_srli_si128(d0, 8);
+ const __m128i p1p0 = _mm_shuffle_epi32(d0, 0x1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i p1 = oqp1;
+ const __m128i p0 = oqp0;
+ const __m128i q0 = _mm_srli_si128(oqp0, 4);
+ const __m128i q1 = _mm_srli_si128(oqp1, 4);
+
+ Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3);
+
+ Store4(dst - 2 + 0 * stride, x0);
+ Store4(dst - 2 + 1 * stride, x1);
+ Store4(dst - 2 + 2 * stride, x2);
+ Store4(dst - 2 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+inline __m128i NeedsFilter6(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i inner_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi8(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i flat_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi8(flat_mask, zero);
+ return a;
+}
+
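+// Both 5-tap outputs are computed as (weighted sum of taps + 4) >> 3, with
+// the weights summing to 8. The first sum is built directly; FilterAdd2Sub2
+// then slides it to the second output's taps. The p and q rows share each
+// output register (low/high halves), per the comments below.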
+inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0,
+ __m128i* oqp1, __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+ const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+ const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f6_lo =
+ _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo));
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo);
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p2 * 3 + p1 * 2 + p0 * 2 + q0
+ // q2 * 3 + q1 * 2 + q0 * 2 + p0
+ *oqp1 = _mm_srli_epi16(f6_lo, 3);
+ *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+ // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1
+ // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1
+ f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f6_lo, 3);
+ *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ const __m128i p2 = Load4(dst - 3 * stride);
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
+ const __m128i q2 = Load4(dst + 2 * stride);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3,
+ __m128i* d4, __m128i* d5, __m128i* d6,
+ __m128i* d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // output
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i ww0 = _mm_unpacklo_epi16(w0, w1);
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ const __m128i ww1 = _mm_unpackhi_epi16(w0, w1);
+
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d0 = ww0;
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d1 = _mm_srli_si128(ww0, 4);
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(ww0, 8);
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(ww0, 12);
+ // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d4 = ww1;
+ // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d5 = _mm_srli_si128(ww1, 4);
+ // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d6 = _mm_srli_si128(ww1, 8);
+ // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d7 = _mm_srli_si128(ww1, 12);
+}
+
+void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = LoadLo8(dst - 3 + 0 * stride);
+ __m128i x1 = LoadLo8(dst - 3 + 1 * stride);
+ __m128i x2 = LoadLo8(dst - 3 + 2 * stride);
+ __m128i x3 = LoadLo8(dst - 3 + 3 * stride);
+
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i z0, z1; // not used
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1);
+
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ p1 = oqp1;
+ p0 = oqp0;
+ q0 = _mm_srli_si128(oqp0, 4);
+ q1 = _mm_srli_si128(oqp1, 4);
+
+ Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3);
+
+ Store4(dst - 2 + 0 * stride, x0);
+ Store4(dst - 2 + 1 * stride, x1);
+ Store4(dst - 2 + 2 * stride, x2);
+ Store4(dst - 2 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+inline __m128i NeedsFilter8(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2);
+ const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq2);
+ const __m128i inner_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi8(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0);
+ const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq0);
+ const __m128i flat_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi8(flat_mask, zero);
+ return a;
+}
+
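+// The three 7-tap outputs follow the same pattern as Filter6: sums whose
+// weights total 8, rounded by 4 and shifted right by 3, with FilterAdd2Sub2
+// sliding the sum between outputs.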
+inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3);
+ const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+ const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+ const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f8_lo =
+ _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo));
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo);
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0
+ // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0
+ *oqp2 = _mm_srli_epi16(f8_lo, 3);
+ *oqp2 = _mm_packus_epi16(*oqp2, *oqp2);
+
+ // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1
+ // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1
+ f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo);
+ *oqp1 = _mm_srli_epi16(f8_lo, 3);
+ *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+ // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2
+ // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2
+ f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo);
+ *oqp0 = _mm_srli_epi16(f8_lo, 3);
+ *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ const __m128i p3 = Load4(dst - 4 * stride);
+ const __m128i p2 = Load4(dst - 3 * stride);
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
+ const __m128i q2 = Load4(dst + 2 * stride);
+ const __m128i q3 = Load4(dst + 3 * stride);
+
+ const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+ Store4(dst - 3 * stride, oqp2_f8);
+ Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4));
+ }
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose8x8To8x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ const __m128i& x4, const __m128i& x5,
+ const __m128i& x6, const __m128i& x7, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ // output
+ // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+ // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+ // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+ // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ const __m128i w2 = _mm_unpacklo_epi8(x4, x5);
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i w3 = _mm_unpacklo_epi8(x6, x7);
+
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i w4 = _mm_unpacklo_epi16(w0, w1);
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ const __m128i w5 = _mm_unpacklo_epi16(w2, w3);
+
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ *d0 = _mm_unpacklo_epi32(w4, w5);
+ *d1 = _mm_srli_si128(*d0, 8);
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ *d2 = _mm_unpackhi_epi32(w4, w5);
+ *d3 = _mm_srli_si128(*d2, 8);
+}
+
+void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = LoadLo8(dst - 4 + 0 * stride);
+ __m128i x1 = LoadLo8(dst - 4 + 1 * stride);
+ __m128i x2 = LoadLo8(dst - 4 + 2 * stride);
+ __m128i x3 = LoadLo8(dst - 4 + 3 * stride);
+
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+
+ const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ p2 = oqp2_f8;
+ q2 = _mm_srli_si128(oqp2_f8, 4);
+ }
+
+ p1 = oqp1;
+ p0 = oqp0;
+ q0 = _mm_srli_si128(oqp0, 4);
+ q1 = _mm_srli_si128(oqp1, 4);
+
+ Transpose8x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3);
+
+ StoreLo8(dst - 4 + 0 * stride, x0);
+ StoreLo8(dst - 4 + 1 * stride, x1);
+ StoreLo8(dst - 4 + 2 * stride, x2);
+ StoreLo8(dst - 4 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
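+// Each of the six 13-tap outputs is (weighted sum + 8) >> 4, with the
+// weights summing to 16. The initial sum is
+//   p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8
+// (and the mirrored q-side sum); each FilterAdd2Sub2 call below slides the
+// window to the next output's taps.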
+inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4,
+ const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp5, __m128i* oqp4,
+ __m128i* oqp3, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i qp6_lo = _mm_cvtepu8_epi16(qp6);
+ const __m128i qp5_lo = _mm_cvtepu8_epi16(qp5);
+ const __m128i qp4_lo = _mm_cvtepu8_epi16(qp4);
+ const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3);
+ const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+ const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+ const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+ const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e);
+ const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e);
+ const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e);
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f14_lo =
+ _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo),
+ _mm_add_epi16(qp5_lo, qp4_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo),
+ _mm_add_epi16(qp3_lo, qp2_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0
+ // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0
+ *oqp5 = _mm_srli_epi16(f14_lo, 4);
+ *oqp5 = _mm_packus_epi16(*oqp5, *oqp5);
+
+ // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1
+ // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1
+ f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo);
+ *oqp4 = _mm_srli_epi16(f14_lo, 4);
+ *oqp4 = _mm_packus_epi16(*oqp4, *oqp4);
+
+ // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2
+ // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2
+ f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo);
+ *oqp3 = _mm_srli_epi16(f14_lo, 4);
+ *oqp3 = _mm_packus_epi16(*oqp3, *oqp3);
+
+ // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3
+ // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3
+ f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo);
+ *oqp2 = _mm_srli_epi16(f14_lo, 4);
+ *oqp2 = _mm_packus_epi16(*oqp2, *oqp2);
+
+ // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4
+ // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4
+ f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo);
+ *oqp1 = _mm_srli_epi16(f14_lo, 4);
+ *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+ // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5
+ // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5
+ f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f14_lo, 4);
+ *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ const __m128i p3 = Load4(dst - 4 * stride);
+ const __m128i p2 = Load4(dst - 3 * stride);
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
+ const __m128i q2 = Load4(dst + 2 * stride);
+ const __m128i q3 = Load4(dst + 3 * stride);
+
+ const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+ const __m128i p6 = Load4(dst - 7 * stride);
+ const __m128i p5 = Load4(dst - 6 * stride);
+ const __m128i p4 = Load4(dst - 5 * stride);
+ const __m128i q4 = Load4(dst + 4 * stride);
+ const __m128i q5 = Load4(dst + 5 * stride);
+ const __m128i q6 = Load4(dst + 6 * stride);
+ const __m128i qp6 = _mm_unpacklo_epi32(p6, q6);
+ const __m128i qp5 = _mm_unpacklo_epi32(p5, q5);
+ const __m128i qp4 = _mm_unpacklo_epi32(p4, q4);
+
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+
+ Store4(dst - 6 * stride, oqp5_f14);
+ Store4(dst - 5 * stride, oqp4_f14);
+ Store4(dst - 4 * stride, oqp3_f14);
+ Store4(dst + 3 * stride, _mm_srli_si128(oqp3_f14, 4));
+ Store4(dst + 4 * stride, _mm_srli_si128(oqp4_f14, 4));
+ Store4(dst + 5 * stride, _mm_srli_si128(oqp5_f14, 4));
+ }
+
+ Store4(dst - 3 * stride, oqp2_f8);
+ Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4));
+ }
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+// Each of the 8x4 blocks of input data (p7-p0 and q0-q7) is transposed to
+// 4x8, then unpacked to the correct qp register (qp7 - qp0).
+//
+// p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+//
+// 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+// 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
+// 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
+// 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
+
+inline void DualTranspose8x4To4x8(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ __m128i* q0p0, __m128i* q1p1, __m128i* q2p2,
+ __m128i* q3p3, __m128i* q4p4, __m128i* q5p5,
+ __m128i* q6p6, __m128i* q7p7) {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
+ const __m128i w2 = _mm_unpackhi_epi8(x0, x1);
+ // 28 38 29 39 2a 3a 2b 3b 2c 3c 2d 3d 2e 3e 2f 3f
+ const __m128i w3 = _mm_unpackhi_epi8(x2, x3);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i ww0 = _mm_unpacklo_epi16(w0, w1);
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ const __m128i ww1 = _mm_unpackhi_epi16(w0, w1);
+ // 08 18 28 38 09 19 29 39 0a 1a 2a 3a 0b 1b 2b 3b
+ const __m128i ww2 = _mm_unpacklo_epi16(w2, w3);
+ // 0c 1c 2c 3c 0d 1d 2d 3d 0e 1e 2e 3e 0f 1f 2f 3f
+ const __m128i ww3 = _mm_unpackhi_epi16(w2, w3);
+ // 00 10 20 30 0f 1f 2f 3f xx xx xx xx xx xx xx xx
+ *q7p7 = _mm_unpacklo_epi32(ww0, _mm_srli_si128(ww3, 12));
+ // 01 11 21 31 0e 1e 2e 3e xx xx xx xx xx xx xx xx
+ *q6p6 = _mm_unpackhi_epi32(_mm_slli_si128(ww0, 4), ww3);
+ // 02 12 22 32 0d 1d 2d 3d xx xx xx xx xx xx xx xx
+ *q5p5 = _mm_unpackhi_epi32(ww0, _mm_slli_si128(ww3, 4));
+ // 03 13 23 33 0c 1c 2c 3c xx xx xx xx xx xx xx xx
+ *q4p4 = _mm_unpacklo_epi32(_mm_srli_si128(ww0, 12), ww3);
+ // 04 14 24 34 0b 1b 2b 3b xx xx xx xx xx xx xx xx
+ *q3p3 = _mm_unpacklo_epi32(ww1, _mm_srli_si128(ww2, 12));
+ // 05 15 25 35 0a 1a 2a 3a xx xx xx xx xx xx xx xx
+ *q2p2 = _mm_unpackhi_epi32(_mm_slli_si128(ww1, 4), ww2);
+ // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ *q1p1 = _mm_unpackhi_epi32(ww1, _mm_slli_si128(ww2, 4));
+ // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+ *q0p0 = _mm_unpacklo_epi32(_mm_srli_si128(ww1, 12), ww2);
+}
+
+inline void DualTranspose4x8To8x4(const __m128i& qp7, const __m128i& qp6,
+ const __m128i& qp5, const __m128i& qp4,
+ const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ __m128i* x0, __m128i* x1, __m128i* x2,
+ __m128i* x3) {
+ // qp7: 00 10 20 30 0f 1f 2f 3f xx xx xx xx xx xx xx xx
+ // qp6: 01 11 21 31 0e 1e 2e 3e xx xx xx xx xx xx xx xx
+ // qp5: 02 12 22 32 0d 1d 2d 3d xx xx xx xx xx xx xx xx
+ // qp4: 03 13 23 33 0c 1c 2c 3c xx xx xx xx xx xx xx xx
+ // qp3: 04 14 24 34 0b 1b 2b 3b xx xx xx xx xx xx xx xx
+ // qp2: 05 15 25 35 0a 1a 2a 3a xx xx xx xx xx xx xx xx
+ // qp1: 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ // qp0: 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+
+ // 00 01 10 11 20 21 30 31 0f 0e 1f 1e 2f 2e 3f 3e
+ const __m128i w0 = _mm_unpacklo_epi8(qp7, qp6);
+ // 02 03 12 13 22 23 32 33 xx xx xx xx xx xx xx xx
+ const __m128i w1 = _mm_unpacklo_epi8(qp5, qp4);
+ // 04 05 14 15 24 25 34 35 xx xx xx xx xx xx xx xx
+ const __m128i w2 = _mm_unpacklo_epi8(qp3, qp2);
+ // 06 07 16 17 26 27 36 37 xx xx xx xx xx xx xx xx
+ const __m128i w3 = _mm_unpacklo_epi8(qp1, qp0);
+ // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
+ const __m128i w4 = _mm_unpacklo_epi16(w0, w1);
+ // 04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37
+ const __m128i w5 = _mm_unpacklo_epi16(w2, w3);
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ const __m128i d0 = _mm_unpacklo_epi32(w4, w5);
+ // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ const __m128i d2 = _mm_unpackhi_epi32(w4, w5);
+ // xx xx xx xx xx xx xx xx 08 09 18 19 28 29 38 39
+ const __m128i w10 = _mm_unpacklo_epi8(qp0, qp1);
+ // xx xx xx xx xx xx xx xx 0a 0b 1a 1b 2a 2b 3a 3b
+ const __m128i w11 = _mm_unpacklo_epi8(qp2, qp3);
+ // xx xx xx xx xx xx xx xx 0c 0d 1c 1d 2c 2d 3c 3d
+ const __m128i w12 = _mm_unpacklo_epi8(qp4, qp5);
+ // xx xx xx xx xx xx xx xx 0e 0f 1e 1f 2e 2f 3e 3f
+ const __m128i w13 = _mm_unpacklo_epi8(qp6, qp7);
+ // 08 09 0a 0b 18 19 1a 1b 28 29 2a 2b 38 39 3a 3b
+ const __m128i w14 = _mm_unpackhi_epi16(w10, w11);
+ // 0c 0d 0e 0f 1c 1d 1e 1f 2c 2d 2e 2f 3c 3d 3e 3f
+ const __m128i w15 = _mm_unpackhi_epi16(w12, w13);
+ // 08 09 0a 0b 0c 0d 0e 0f 18 19 1a 1b 1c 1d 1e 1f
+ const __m128i d1 = _mm_unpacklo_epi32(w14, w15);
+ // 28 29 2a 2b 2c 2d 2e 2f 38 39 3a 3b 3c 3d 3e 3f
+ const __m128i d3 = _mm_unpackhi_epi32(w14, w15);
+
+ // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ //
+ // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+ *x0 = _mm_unpacklo_epi64(d0, d1);
+ // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
+ *x1 = _mm_unpackhi_epi64(d0, d1);
+ // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
+ *x2 = _mm_unpacklo_epi64(d2, d3);
+ // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
+ *x3 = _mm_unpackhi_epi64(d2, d3);
+}
+
+void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride);
+
+ __m128i qp7, qp6, qp5, qp4, qp3, qp2, qp1, qp0;
+
+ DualTranspose8x4To4x8(x0, x1, x2, x3, &qp0, &qp1, &qp2, &qp3, &qp4, &qp5,
+ &qp6, &qp7);
+
+ const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1);
+ const __m128i q1q0 = _mm_shuffle_epi32(qp1qp0, 0x0d);
+ const __m128i p1p0 = _mm_shuffle_epi32(qp1qp0, 0x08);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+ qp3 = oqp3_f14;
+ qp4 = oqp4_f14;
+ qp5 = oqp5_f14;
+ }
+ qp2 = oqp2_f8;
+ }
+
+ DualTranspose4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1, &x2,
+ &x3);
+
+ StoreUnaligned16(dst - 8 + 0 * stride, x0);
+ StoreUnaligned16(dst - 8 + 1 * stride, x1);
+ StoreUnaligned16(dst - 8 + 2 * stride, x2);
+ StoreUnaligned16(dst - 8 + 3 * stride, x3);
+}
+
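+// Populates the 8bpp dsp table with the SSE4.1 loop filters for each entry
+// left enabled by the DSP_ENABLED_8BPP_SSE4_1 defines.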
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = Horizontal4;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = Horizontal6;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = Horizontal8;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Horizontal14;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = Vertical14;
+#endif
+}
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+namespace high_bitdepth {
+namespace {
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+
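+// The high-bitdepth filters mirror the 8bpp versions above, but operate on
+// 16-bit pixels: thresholds are scaled up by bitdepth - 8 (kThreshShift),
+// byte strides are halved to uint16_t units, and saturating 8-bit arithmetic
+// is replaced by explicit clamping around the 1 << (bitdepth - 1) pivot.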
+template <int bitdepth>
+struct LoopFilterFuncs_SSE4_1 {
+ LoopFilterFuncs_SSE4_1() = delete;
+
+ static constexpr int kThreshShift = bitdepth - 8;
+
+ static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+};
+
+inline __m128i Clamp(const __m128i& min, const __m128i& max,
+ const __m128i& val) {
+ const __m128i a = _mm_min_epi16(val, max);
+ const __m128i b = _mm_max_epi16(a, min);
+ return b;
+}
+
+inline __m128i AddShift3(const __m128i& a, const __m128i& b,
+ const __m128i& vmin, const __m128i& vmax) {
+ const __m128i c = _mm_adds_epi16(a, b);
+ const __m128i d = Clamp(vmin, vmax, c);
+ const __m128i e = _mm_srai_epi16(d, 3); /* >> 3 */
+ return e;
+}
+
+inline __m128i AddShift1(const __m128i& a, const __m128i& b) {
+ const __m128i c = _mm_adds_epi16(a, b);
+ const __m128i e = _mm_srai_epi16(c, 1); /* >> 1 */
+ return e;
+}
+
+inline __m128i AbsDiff(const __m128i& a, const __m128i& b) {
+ return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+}
+
+inline __m128i Hev(const __m128i& qp1, const __m128i& qp0,
+ const __m128i& hev_thresh) {
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq =
+ _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8));
+ const __m128i hev_mask = _mm_cmpgt_epi16(max_pq, hev_thresh);
+ return hev_mask;
+}
+
+inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& outer_thresh) {
+ // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+ const __m128i abs_pmq = AbsDiff(p1p0, q1q0);
+ const __m128i a = _mm_adds_epu16(abs_pmq, abs_pmq);
+ const __m128i b = _mm_srli_epi16(abs_pmq, 1);
+ const __m128i c = _mm_adds_epu16(a, _mm_srli_si128(b, 8));
+ return _mm_subs_epu16(c, outer_thresh);
+}
+
+inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_abs_qp1mqp =
+ _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8));
+ const __m128i inner_mask = _mm_subs_epu16(max_abs_qp1mqp, inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi16(a, zero);
+ return b;
+}
+
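+// 16-bit variant of the 8bpp Filter4 above; Clamp() against
+// [-(1 << (bitdepth - 1)), (1 << (bitdepth - 1)) - 1] stands in for the
+// 8-bit saturation.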
+inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1,
+ __m128i* oqp0, const __m128i& mask, const __m128i& hev,
+ int bitdepth) {
+ const __m128i t4 = _mm_set1_epi16(4);
+ const __m128i t3 = _mm_set1_epi16(3);
+ const __m128i t80 = _mm_set1_epi16(static_cast<int16_t>(1 << (bitdepth - 1)));
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ const __m128i vmin = _mm_subs_epi16(_mm_setzero_si128(), t80);
+ const __m128i vmax = _mm_subs_epi16(t80, t1);
+ const __m128i ps1 = _mm_subs_epi16(qp1, t80);
+ const __m128i ps0 = _mm_subs_epi16(qp0, t80);
+ const __m128i qs0 = _mm_srli_si128(ps0, 8);
+ const __m128i qs1 = _mm_srli_si128(ps1, 8);
+
+ __m128i a = _mm_subs_epi16(ps1, qs1);
+ a = _mm_and_si128(Clamp(vmin, vmax, a), hev);
+
+ const __m128i x = _mm_subs_epi16(qs0, ps0);
+ a = _mm_adds_epi16(a, x);
+ a = _mm_adds_epi16(a, x);
+ a = _mm_adds_epi16(a, x);
+ a = _mm_and_si128(Clamp(vmin, vmax, a), mask);
+
+ const __m128i a1 = AddShift3(a, t4, vmin, vmax);
+ const __m128i a2 = AddShift3(a, t3, vmin, vmax);
+ const __m128i a3 = _mm_andnot_si128(hev, AddShift1(a1, t1));
+
+ const __m128i ops1 = _mm_adds_epi16(ps1, a3);
+ const __m128i ops0 = _mm_adds_epi16(ps0, a2);
+ const __m128i oqs0 = _mm_subs_epi16(qs0, a1);
+ const __m128i oqs1 = _mm_subs_epi16(qs1, a3);
+
+ __m128i oqps1 = _mm_unpacklo_epi64(ops1, oqs1);
+ __m128i oqps0 = _mm_unpacklo_epi64(ops0, oqs0);
+
+ oqps1 = Clamp(vmin, vmax, oqps1);
+ oqps0 = Clamp(vmin, vmax, oqps0);
+
+ *oqp1 = _mm_adds_epi16(oqps1, t80);
+ *oqp0 = _mm_adds_epi16(oqps0, t80);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal4(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i qp0 = LoadHi8(p0, dst + 0 * stride);
+ const __m128i qp1 = LoadHi8(p1, dst + 1 * stride);
+ const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1);
+ const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1);
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical4(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+ const __m128i x0 = LoadLo8(dst - 2 + 0 * stride);
+ const __m128i x1 = LoadLo8(dst - 2 + 1 * stride);
+ const __m128i x2 = LoadLo8(dst - 2 + 2 * stride);
+ const __m128i x3 = LoadLo8(dst - 2 + 3 * stride);
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+ // 00 10 20 30 01 11 21 31 p0p1
+ const __m128i a = _mm_unpacklo_epi32(w0, w1);
+ const __m128i p1p0 = _mm_shuffle_epi32(a, 0x4e);
+ // 02 12 22 32 03 13 23 33 q1q0
+ const __m128i q1q0 = _mm_unpackhi_epi32(w0, w1);
+ const __m128i qp1 = _mm_unpackhi_epi64(p1p0, q1q0);
+ const __m128i qp0 = _mm_unpacklo_epi64(p1p0, q1q0);
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1);
+ // 00 10 20 30 01 11 21 31
+ const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3);
+
+ StoreLo8(dst - 2 + 0 * stride, op0p1);
+ StoreHi8(dst - 2 + 1 * stride, op0p1);
+ StoreLo8(dst - 2 + 2 * stride, oq1q0);
+ StoreHi8(dst - 2 + 3 * stride, oq1q0);
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+inline __m128i CheckOuterThreshF6(const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh) {
+ // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+ const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1);
+ const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1);
+ return CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+}
+
+inline __m128i NeedsFilter6(const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i inner_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi16(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i flat_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi16(flat_mask, zero);
+ return a;
+}
+
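+// 16-bit Filter6: the inputs are already wide enough for the sums, so the
+// *_lo copies exist only to keep the naming aligned with the 8bpp version.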
+inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0,
+ __m128i* oqp1, __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp2_lo = qp2;
+ const __m128i qp1_lo = qp1;
+ const __m128i qp0_lo = qp0;
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f6_lo;
+ f6_lo =
+ _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo));
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo);
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p2 * 3 + p1 * 2 + p0 * 2 + q0
+ // q2 * 3 + q1 * 2 + q0 * 2 + p0
+ *oqp1 = _mm_srli_epi16(f6_lo, 3);
+
+ // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1
+ // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1
+ f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f6_lo, 3);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal6(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ const __m128i p2 = LoadLo8(dst - 3 * stride);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i q0 = LoadLo8(dst + 0 * stride);
+ const __m128i q1 = LoadLo8(dst + 1 * stride);
+ const __m128i q2 = LoadLo8(dst + 2 * stride);
+
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3,
+ __m128i* d4, __m128i* d5, __m128i* d6,
+ __m128i* d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // output
+ // 00 10 20 30 xx xx xx xx
+ // 01 11 21 31 xx xx xx xx
+ // 02 12 22 32 xx xx xx xx
+ // 03 13 23 33 xx xx xx xx
+ // 04 14 24 34 xx xx xx xx
+ // 05 15 25 35 xx xx xx xx
+ // 06 16 26 36 xx xx xx xx
+ // 07 17 27 37 xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+ // 04 14 05 15 06 16 07 17
+ const __m128i w2 = _mm_unpackhi_epi16(x0, x1);
+ // 24 34 25 35 26 36 27 37
+ const __m128i w3 = _mm_unpackhi_epi16(x2, x3);
+
+ // 00 10 20 30 01 11 21 31
+ const __m128i ww0 = _mm_unpacklo_epi32(w0, w1);
+ // 04 14 24 34 05 15 25 35
+ const __m128i ww1 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i ww2 = _mm_unpackhi_epi32(w0, w1);
+ // 06 16 26 36 07 17 27 37
+ const __m128i ww3 = _mm_unpackhi_epi32(w2, w3);
+
+ // 00 10 20 30 xx xx xx xx
+ *d0 = ww0;
+ // 01 11 21 31 xx xx xx xx
+ *d1 = _mm_srli_si128(ww0, 8);
+ // 02 12 22 32 xx xx xx xx
+ *d2 = ww2;
+ // 03 13 23 33 xx xx xx xx
+ *d3 = _mm_srli_si128(ww2, 8);
+ // 04 14 24 34 xx xx xx xx
+ *d4 = ww1;
+ // 05 15 25 35 xx xx xx xx
+ *d5 = _mm_srli_si128(ww1, 8);
+ // 06 16 26 36 xx xx xx xx
+ *d6 = ww3;
+ // 07 17 27 37 xx xx xx xx
+ *d7 = _mm_srli_si128(ww3, 8);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical6(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ __m128i x0 = LoadUnaligned16(dst - 3 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 3 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 3 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 3 + 3 * stride);
+
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i z0, z1; // not used
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1);
+
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1);
+ // 00 10 20 30 01 11 21 31
+ const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3);
+
+ StoreLo8(dst - 2 + 0 * stride, op0p1);
+ StoreHi8(dst - 2 + 1 * stride, op0p1);
+ StoreLo8(dst - 2 + 2 * stride, oq1q0);
+ StoreHi8(dst - 2 + 3 * stride, oq1q0);
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+inline __m128i NeedsFilter8(const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2);
+ const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq2);
+ const __m128i inner_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh);
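+ // A zero lane from the saturating subtract means the measured difference is
+ // within its threshold; the cmpeq against zero below selects those lanes.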
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi16(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0);
+ const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq0);
+ const __m128i flat_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi16(flat_mask, zero);
+ return a;
+}
+
+inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp3_lo = qp3;
+ const __m128i qp2_lo = qp2;
+ const __m128i qp1_lo = qp1;
+ const __m128i qp0_lo = qp0;
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f8_lo =
+ _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo));
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo);
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0
+ // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0
+ *oqp2 = _mm_srli_epi16(f8_lo, 3);
+
+ // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1
+ // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1
+ f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo);
+ *oqp1 = _mm_srli_epi16(f8_lo, 3);
+
+ // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2
+ // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2
+ f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo);
+ *oqp0 = _mm_srli_epi16(f8_lo, 3);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal8(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ const __m128i p3 = LoadLo8(dst - 4 * stride);
+ const __m128i p2 = LoadLo8(dst - 3 * stride);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i q0 = LoadLo8(dst + 0 * stride);
+ const __m128i q1 = LoadLo8(dst + 1 * stride);
+ const __m128i q2 = LoadLo8(dst + 2 * stride);
+ const __m128i q3 = LoadLo8(dst + 3 * stride);
+ const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+ StoreLo8(dst - 3 * stride, oqp2_f8);
+ StoreHi8(dst + 2 * stride, oqp2_f8);
+ }
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void TransposeLower4x8To8x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ const __m128i& x4, const __m128i& x5,
+ const __m128i& x6, const __m128i& x7,
+ __m128i* d0, __m128i* d1, __m128i* d2,
+ __m128i* d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ // output
+ // d0 00 10 20 30 40 50 60 70
+ // d1 01 11 21 31 41 51 61 71
+ // d2 02 12 22 32 42 52 62 72
+ // d3 03 13 23 33 43 53 63 73
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+ // 40 50 41 51 42 52 43 53
+ const __m128i w2 = _mm_unpacklo_epi16(x4, x5);
+ // 60 70 61 71 62 72 63 73
+ const __m128i w3 = _mm_unpacklo_epi16(x6, x7);
+
+ // 00 10 20 30 01 11 21 31
+ const __m128i w4 = _mm_unpacklo_epi32(w0, w1);
+ // 40 50 60 70 41 51 61 71
+ const __m128i w5 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i w6 = _mm_unpackhi_epi32(w0, w1);
+ // 42 52 62 72 43 53 63 73
+ const __m128i w7 = _mm_unpackhi_epi32(w2, w3);
+
+ // 00 10 20 30 40 50 60 70
+ *d0 = _mm_unpacklo_epi64(w4, w5);
+ // 01 11 21 31 41 51 61 71
+ *d1 = _mm_unpackhi_epi64(w4, w5);
+ // 02 12 22 32 42 52 62 72
+ *d2 = _mm_unpacklo_epi64(w6, w7);
+ // 03 13 23 33 43 53 63 73
+ *d3 = _mm_unpackhi_epi64(w6, w7);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical8(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ __m128i x0 = LoadUnaligned16(dst - 4 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 4 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 4 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 4 + 3 * stride);
+
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+
+ const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ p2 = oqp2_f8;
+ q2 = _mm_srli_si128(oqp2_f8, 8);
+ }
+
+ p1 = oqp1;
+ p0 = oqp0;
+ q0 = _mm_srli_si128(oqp0, 8);
+ q1 = _mm_srli_si128(oqp1, 8);
+
+ TransposeLower4x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3);
+
+ StoreUnaligned16(dst - 4 + 0 * stride, x0);
+ StoreUnaligned16(dst - 4 + 1 * stride, x1);
+ StoreUnaligned16(dst - 4 + 2 * stride, x2);
+ StoreUnaligned16(dst - 4 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
+inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4,
+ const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp5, __m128i* oqp4,
+ __m128i* oqp3, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i qp6_lo = qp6;
+ const __m128i qp5_lo = qp5;
+ const __m128i qp4_lo = qp4;
+ const __m128i qp3_lo = qp3;
+ const __m128i qp2_lo = qp2;
+ const __m128i qp1_lo = qp1;
+ const __m128i qp0_lo = qp0;
+ const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e);
+ const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e);
+ const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e);
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
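+ // p6 * 7 + 8: (x << 3) - x multiplies by 7, and 8 is the rounding bias for
+ // the final >> 4.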
+ __m128i f14_lo =
+ _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo),
+ _mm_add_epi16(qp5_lo, qp4_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo),
+ _mm_add_epi16(qp3_lo, qp2_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0
+ // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0
+ *oqp5 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1
+ // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1
+ f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo);
+ *oqp4 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2
+ // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2
+ f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo);
+ *oqp3 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3
+ // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3
+ f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo);
+ *oqp2 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4
+ // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4
+ f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo);
+ *oqp1 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5
+ // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5
+ f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f14_lo, 4);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal14(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ const __m128i p3 = LoadLo8(dst - 4 * stride);
+ const __m128i p2 = LoadLo8(dst - 3 * stride);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i q0 = LoadLo8(dst + 0 * stride);
+ const __m128i q1 = LoadLo8(dst + 1 * stride);
+ const __m128i q2 = LoadLo8(dst + 2 * stride);
+ const __m128i q3 = LoadLo8(dst + 3 * stride);
+ const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
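+ // Loading the outer rows is deferred until the inner flat test passes.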
+ const __m128i p6 = LoadLo8(dst - 7 * stride);
+ const __m128i p5 = LoadLo8(dst - 6 * stride);
+ const __m128i p4 = LoadLo8(dst - 5 * stride);
+ const __m128i q4 = LoadLo8(dst + 4 * stride);
+ const __m128i q5 = LoadLo8(dst + 5 * stride);
+ const __m128i q6 = LoadLo8(dst + 6 * stride);
+ const __m128i qp6 = _mm_unpacklo_epi64(p6, q6);
+ const __m128i qp5 = _mm_unpacklo_epi64(p5, q5);
+ const __m128i qp4 = _mm_unpacklo_epi64(p4, q4);
+
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask);
+ const __m128i v_flat4_mask =
+ _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+
+ StoreLo8(dst - 6 * stride, oqp5_f14);
+ StoreLo8(dst - 5 * stride, oqp4_f14);
+ StoreLo8(dst - 4 * stride, oqp3_f14);
+
+ StoreHi8(dst + 3 * stride, oqp3_f14);
+ StoreHi8(dst + 4 * stride, oqp4_f14);
+ StoreHi8(dst + 5 * stride, oqp5_f14);
+ }
+
+ StoreLo8(dst - 3 * stride, oqp2_f8);
+ StoreHi8(dst + 2 * stride, oqp2_f8);
+ }
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void TransposeUpper4x8To8x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ const __m128i& x4, const __m128i& x5,
+ const __m128i& x6, const __m128i& x7,
+ __m128i* d0, __m128i* d1, __m128i* d2,
+ __m128i* d3) {
+ // input
+ // x0 xx xx xx xx 00 01 02 03
+ // x1 xx xx xx xx 10 11 12 13
+ // x2 xx xx xx xx 20 21 22 23
+ // x3 xx xx xx xx 30 31 32 33
+ // x4 xx xx xx xx 40 41 42 43
+ // x5 xx xx xx xx 50 51 52 53
+ // x6 xx xx xx xx 60 61 62 63
+ // x7 xx xx xx xx 70 71 72 73
+ // output
+ // d0 00 10 20 30 40 50 60 70
+ // d1 01 11 21 31 41 51 61 71
+ // d2 02 12 22 32 42 52 62 72
+ // d3 03 13 23 33 43 53 63 73
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpackhi_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpackhi_epi16(x2, x3);
+ // 40 50 41 51 42 52 43 53
+ const __m128i w2 = _mm_unpackhi_epi16(x4, x5);
+ // 60 70 61 71 62 72 63 73
+ const __m128i w3 = _mm_unpackhi_epi16(x6, x7);
+
+ // 00 10 20 30 01 11 21 31
+ const __m128i w4 = _mm_unpacklo_epi32(w0, w1);
+ // 40 50 60 70 41 51 61 71
+ const __m128i w5 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i w6 = _mm_unpackhi_epi32(w0, w1);
+ // 42 52 62 72 43 53 63 73
+ const __m128i w7 = _mm_unpackhi_epi32(w2, w3);
+
+ // 00 10 20 30 40 50 60 70
+ *d0 = _mm_unpacklo_epi64(w4, w5);
+ // 01 11 21 31 41 51 61 71
+ *d1 = _mm_unpackhi_epi64(w4, w5);
+ // 02 12 22 32 42 52 62 72
+ *d2 = _mm_unpacklo_epi64(w6, w7);
+ // 03 13 23 33 43 53 63 73
+ *d3 = _mm_unpackhi_epi64(w6, w7);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical14(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ //
+ // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+ // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
+ // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
+ // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
+
+ __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride);
+
+ __m128i p7, p6, p5, p4, p3, p2, p1, p0;
+ __m128i q7, q6, q5, q4, q3, q2, q1, q0;
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0);
+
+ x0 = LoadUnaligned16(dst - 8 + 8 + 0 * stride);
+ x1 = LoadUnaligned16(dst - 8 + 8 + 1 * stride);
+ x2 = LoadUnaligned16(dst - 8 + 8 + 2 * stride);
+ x3 = LoadUnaligned16(dst - 8 + 8 + 3 * stride);
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+
+ __m128i qp7 = _mm_unpacklo_epi64(p7, q7);
+ __m128i qp6 = _mm_unpacklo_epi64(p6, q6);
+ __m128i qp5 = _mm_unpacklo_epi64(p5, q5);
+ __m128i qp4 = _mm_unpacklo_epi64(p4, q4);
+ __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask);
+ const __m128i v_flat4_mask =
+ _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+ qp3 = oqp3_f14;
+ qp4 = oqp4_f14;
+ qp5 = oqp5_f14;
+ }
+ qp2 = oqp2_f8;
+ }
+
+ TransposeLower4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1,
+ &x2, &x3);
+
+ StoreUnaligned16(dst - 8 + 0 * stride, x0);
+ StoreUnaligned16(dst - 8 + 1 * stride, x1);
+ StoreUnaligned16(dst - 8 + 2 * stride, x2);
+ StoreUnaligned16(dst - 8 + 3 * stride, x3);
+
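+ // The q side is written in ascending order (q0 .. q7 from left to right),
+ // hence the reversed argument order for the upper transpose.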
+ TransposeUpper4x8To8x4(oqp0, oqp1, qp2, qp3, qp4, qp5, qp6, qp7, &x0, &x1,
+ &x2, &x3);
+
+ StoreUnaligned16(dst - 8 + 8 + 0 * stride, x0);
+ StoreUnaligned16(dst - 8 + 8 + 1 * stride, x1);
+ StoreUnaligned16(dst - 8 + 8 + 2 * stride, x2);
+ StoreUnaligned16(dst - 8 + 8 + 3 * stride, x3);
+}
+
+using Defs10bpp = LoopFilterFuncs_SSE4_1<kBitdepth10>;
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal4;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal6;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal8;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal14;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical4;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical6;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical8;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical14;
+#endif
+}
+#endif
+} // namespace
+} // namespace high_bitdepth
+
+void LoopFilterInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters, see the defines below for specifics. This
+// function is not thread-safe.
+void LoopFilterInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline void WienerHorizontalClip(const __m256i s[2],
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
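+ // The rounded sums are clamped to [-offset, limit - offset] so they fit the
+ // int16_t wiener buffer.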
+ const __m256i offsets = _mm256_set1_epi16(-offset);
+ const __m256i limits = _mm256_set1_epi16(limit - offset);
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+ const __m256i sum0 = _mm256_add_epi32(s[0], round);
+ const __m256i sum1 = _mm256_add_epi32(s[1], round);
+ const __m256i rounded_sum0 =
+ _mm256_srai_epi32(sum0, kInterRoundBitsHorizontal);
+ const __m256i rounded_sum1 =
+ _mm256_srai_epi32(sum1, kInterRoundBitsHorizontal);
+ const __m256i rounded_sum = _mm256_packs_epi32(rounded_sum0, rounded_sum1);
+ const __m256i d0 = _mm256_max_epi16(rounded_sum, offsets);
+ const __m256i d1 = _mm256_min_epi16(d0, limits);
+ StoreAligned32(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7Kernel(const __m256i s[7],
+ const __m256i filter[2],
+ int16_t* const wiener_buffer) {
+ const __m256i s06 = _mm256_add_epi16(s[0], s[6]);
+ const __m256i s15 = _mm256_add_epi16(s[1], s[5]);
+ const __m256i s24 = _mm256_add_epi16(s[2], s[4]);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s06, s15);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s06, s15);
+ const __m256i ss2 = _mm256_unpacklo_epi16(s24, s[3]);
+ const __m256i ss3 = _mm256_unpackhi_epi16(s24, s[3]);
+ __m256i madds[4];
+ madds[0] = _mm256_madd_epi16(ss0, filter[0]);
+ madds[1] = _mm256_madd_epi16(ss1, filter[0]);
+ madds[2] = _mm256_madd_epi16(ss2, filter[1]);
+ madds[3] = _mm256_madd_epi16(ss3, filter[1]);
+ madds[0] = _mm256_add_epi32(madds[0], madds[2]);
+ madds[1] = _mm256_add_epi32(madds[1], madds[3]);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m256i s[5], const __m256i filter,
+ int16_t* const wiener_buffer) {
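+ // The Wiener taps sum to 128 (1 << kWienerFilterBits), so the symmetric
+ // 5-tap sum c1*(s0 + s4) + c2*(s1 + s3) + c3*s2 rewrites as
+ // c1*(s0 + s4 - 2*s2) + c2*(s1 + s3 - 2*s2) + 128*s2, saving a multiply.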
+ const __m256i s04 = _mm256_add_epi16(s[0], s[4]);
+ const __m256i s13 = _mm256_add_epi16(s[1], s[3]);
+ const __m256i s2d = _mm256_add_epi16(s[2], s[2]);
+ const __m256i s0m = _mm256_sub_epi16(s04, s2d);
+ const __m256i s1m = _mm256_sub_epi16(s13, s2d);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s0m, s1m);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s0m, s1m);
+ __m256i madds[2];
+ madds[0] = _mm256_madd_epi16(ss0, filter);
+ madds[1] = _mm256_madd_epi16(ss1, filter);
+ const __m256i s2_lo = _mm256_unpacklo_epi16(s[2], _mm256_setzero_si256());
+ const __m256i s2_hi = _mm256_unpackhi_epi16(s[2], _mm256_setzero_si256());
+ const __m256i s2x128_lo = _mm256_slli_epi32(s2_lo, 7);
+ const __m256i s2x128_hi = _mm256_slli_epi32(s2_hi, 7);
+ madds[0] = _mm256_add_epi32(madds[0], s2x128_lo);
+ madds[1] = _mm256_add_epi32(madds[1], s2x128_hi);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[3], const __m256i filter,
+ int16_t* const wiener_buffer) {
+ const __m256i s02 = _mm256_add_epi16(s[0], s[2]);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s02, s[1]);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s02, s[1]);
+ __m256i madds[2];
+ madds[0] = _mm256_madd_epi16(ss0, filter);
+ madds[1] = _mm256_madd_epi16(ss1, filter);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(*coefficients, 0x0);
+ filter[1] = _mm256_shuffle_epi32(*coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[7];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ s[3] = LoadUnaligned32(src + x + 3);
+ s[4] = LoadUnaligned32(src + x + 4);
+ s[5] = LoadUnaligned32(src + x + 5);
+ s[6] = LoadUnaligned32(src + x + 6);
+ WienerHorizontalTap7Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+ const __m256i filter =
+ _mm256_shuffle_epi8(*coefficients, _mm256_set1_epi32(0x05040302));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[5];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ s[3] = LoadUnaligned32(src + x + 3);
+ s[4] = LoadUnaligned32(src + x + 4);
+ WienerHorizontalTap5Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+ const __m256i filter = _mm256_shuffle_epi32(*coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[3];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ WienerHorizontalTap3Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
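+ // Only the center tap remains, so this is a pass-through; the << 4 matches
+ // the net scaling of the filtered paths (assuming the usual values
+ // kWienerFilterBits == 7 and kInterRoundBitsHorizontal == 3).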
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m256i s0 = LoadUnaligned32(src + x);
+ const __m256i d0 = _mm256_slli_epi16(s0, 4);
+ StoreAligned32(*wiener_buffer + x, d0);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m256i WienerVertical7(const __m256i a[4], const __m256i filter[4]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+ const __m256i madd3 = _mm256_madd_epi16(a[3], filter[3]);
+ const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+ const __m256i madd23 = _mm256_add_epi32(madd2, madd3);
+ const __m256i sum = _mm256_add_epi32(madd01, madd23);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical5(const __m256i a[3], const __m256i filter[3]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+ const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+ const __m256i sum = _mm256_add_epi32(madd01, madd2);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum = _mm256_add_epi32(madd0, madd1);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVerticalClip(const __m256i s[2]) {
+ const __m256i d = _mm256_packus_epi32(s[0], s[1]);
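+ // packus saturates to [0, 65535]; the min then clamps to the 10-bit maximum.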
+ return _mm256_min_epu16(d, _mm256_set1_epi16(1023));
+}
+
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[4], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm256_unpacklo_epi16(a[4], a[5]);
+ b[3] = _mm256_unpacklo_epi16(a[6], round);
+ c[0] = WienerVertical7(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm256_unpackhi_epi16(a[4], a[5]);
+ b[3] = _mm256_unpackhi_epi16(a[6], round);
+ c[1] = WienerVertical7(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+ const __m256i filter[3]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[3], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm256_unpacklo_epi16(a[4], round);
+ c[0] = WienerVertical5(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm256_unpackhi_epi16(a[4], round);
+ c[1] = WienerVertical5(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[2], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], round);
+ c[0] = WienerVertical3(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], round);
+ c[1] = WienerVertical3(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[7]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[3], __m256i a[5]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[3]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[8];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
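+ // Rows 1..7 are the taps for the second output row; six of the seven loads
+ // are shared with the first row.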
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[3], __m256i d[2]) {
+ __m256i a[6];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[4];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[4];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi32(c, 0x55);
+ filter[2] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+ filter[3] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
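+ // filter[3] holds [c0, 1] 16-bit pairs: madd against the (a[6], round)
+ // interleave yields a[6] * c0 + round, folding the rounding constant into
+ // the final madd.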
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2];
+ WienerVerticalTap7Kernel2(wiener_buffer + x, width, filter, d);
+ StoreUnaligned32(dst + x, d[0]);
+ StoreUnaligned32(dst + dst_stride + x, d[1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[7];
+ const __m256i d =
+ WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[3];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+ filter[2] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2];
+ WienerVerticalTap5Kernel2(wiener_buffer + x, width, filter, d);
+ StoreUnaligned32(dst + x, d[0]);
+ StoreUnaligned32(dst + dst_stride + x, d[1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[5];
+ const __m256i d =
+ WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ __m256i filter[2];
+ filter[0] =
+ _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ filter[1] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2];
+ WienerVerticalTap3Kernel2(wiener_buffer + x, width, filter, d);
+ StoreUnaligned32(dst + x, d[0]);
+ StoreUnaligned32(dst + dst_stride + x, d[1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[3];
+ const __m256i d =
+ WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint16_t* const dst) {
+ const __m256i a = LoadAligned32(wiener_buffer);
+ const __m256i b = _mm256_add_epi16(a, _mm256_set1_epi16(8));
+ const __m256i c = _mm256_srai_epi16(b, 4);
+ const __m256i d = _mm256_max_epi16(c, _mm256_setzero_si256());
+ const __m256i e = _mm256_min_epi16(d, _mm256_set1_epi16(1023));
+ StoreUnaligned32(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint16_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_AVX2(
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+ // horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ const __m128i c =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ const __m256i coefficients_horizontal = _mm256_broadcastq_epi64(c);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
+ }
+
+ // vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint16_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+ // Because the top row of |source| is a duplicate of the second row, and the
+ // bottom row of |source| is a duplicate of the row above it, we can
+ // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+constexpr int kSumOffset = 24;
+
+// SIMD loads overread by (pixels per SIMD register) - (width % 8) - 2 *
+// padding pixels, where the padding is 3 for Pass 1 and 2 for Pass 2. The
+// number of bytes in a SIMD register is 16 for SSE4.1 and 32 for AVX2.
+constexpr int kOverreadInBytesPass1_128 = 4;
+constexpr int kOverreadInBytesPass2_128 = 8;
+constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16;
+constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16;
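+// For example, in Pass 1 an SSE4.1 register holds 8 pixels (16 bytes) with a
+// padding of 3 on each side, so when width is a multiple of 8 the overread is
+// 8 - 2 * 3 = 2 pixels = 4 bytes; the AVX2 variants add 16 bytes for the
+// wider registers.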
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+}
+
+inline void LoadAligned32x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+ dst[2] = LoadAligned32(src[2] + x);
+}
+
+inline void LoadAligned32x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+ dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+ LoadAligned64(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+ LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+// The AVX2 ymm register holds ma[0], ma[1], ..., ma[7], and ma[16], ma[17],
+// ..., ma[23].
+// There is an 8 pixel gap between the first half and the second half.
+constexpr int kMaStoreOffset = 8;
+
+inline void StoreAligned32_ma(uint16_t* src, const __m256i v) {
+ StoreAligned16(src + 0 * 8, _mm256_extracti128_si256(v, 0));
+ StoreAligned16(src + 2 * 8, _mm256_extracti128_si256(v, 1));
+}
+
+inline void StoreAligned64_ma(uint16_t* dst, const __m256i v[2]) {
+ // The next 4 lines are much faster than:
+ // StoreAligned32(dst + 0, _mm256_permute2x128_si256(v[0], v[1], 0x20));
+ // StoreAligned32(dst + 16, _mm256_permute2x128_si256(v[0], v[1], 0x31));
+ StoreAligned16(dst + 0 * 8, _mm256_extracti128_si256(v[0], 0));
+ StoreAligned16(dst + 1 * 8, _mm256_extracti128_si256(v[1], 0));
+ StoreAligned16(dst + 2 * 8, _mm256_extracti128_si256(v[0], 1));
+ StoreAligned16(dst + 3 * 8, _mm256_extracti128_si256(v[1], 1));
+}
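+
+// StoreAligned64_ma() expects v[0] to hold the widened low halves of the two
+// 128-bit source lanes and v[1] the high halves (the layout produced by
+// _mm256_unpacklo/hi_epi8()); the four stores above interleave the lanes
+// back into sequential order in memory.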
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate very inefficient code for them, and
+// the whole decoder could end up about 15% slower.
+
+inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VmullNLo8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m256i VmullNHi8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrU16(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1)));
+ return _mm_srli_epi16(sum, src1);
+}
+
+inline __m256i VrshrU16(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi16(src0, _mm256_set1_epi16(1 << (src1 - 1)));
+ return _mm256_srli_epi16(sum, src1);
+}
+
+inline __m256i VrshrS32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
+inline __m256i VrshrU32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srli_epi32(sum, src1);
+}
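+
+// The V*-prefixed helpers above follow the semantics of the ARM NEON
+// intrinsics they are named after (vaddl/vaddw: widening add, vmull:
+// widening multiply, vrshr: rounding shift right), presumably to stay in
+// sync with the NEON implementation. For example, VrshrU32(x, n) computes
+// (x + (1 << (n - 1))) >> n in each 32-bit lane.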
+
+inline void Square(const __m128i src, __m128i dst[2]) {
+ const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
+ dst[0] = _mm_madd_epi16(s0, s0);
+ dst[1] = _mm_madd_epi16(s1, s1);
+}
+
+inline void Square(const __m256i src, __m256i dst[2]) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src, _mm256_setzero_si256());
+ dst[0] = _mm256_madd_epi16(s0, s0);
+ dst[1] = _mm256_madd_epi16(s1, s1);
+}
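+
+// Square() exploits _mm_madd_epi16()/_mm256_madd_epi16(): after interleaving
+// |src| with zeros, every 32-bit lane holds the 16-bit pair (a, 0), so madd
+// computes a * a + 0 * 0 = a * a, a widening square that avoids the
+// _mm_cvtepu16_epi32() conversion discouraged above.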
+
+inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = _mm256_alignr_epi8(src[1], src[0], 0);
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 1);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 4);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare3_32(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 4);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_32(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 12);
+ dst[4] = src[1];
+}
+
+inline void Prepare5_32(const __m256i src[2], __m256i dst[5]) {
+ Prepare3_32(src, dst);
+ dst[3] = _mm256_alignr_epi8(src[1], src[0], 12);
+ dst[4] = src[1];
+}
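+
+// The Prepare*() helpers above build the shifted windows used by the
+// horizontal sums below: dst[k] is the source advanced by k elements (alignr
+// shifts by k times the element size in bytes). Note that
+// _mm256_alignr_epi8() shifts each 128-bit lane independently, so the
+// 256-bit variants produce per-lane windows and the callers arrange their
+// inputs accordingly.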
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m256i Sum3_16(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi16(src0, src1);
+ return _mm256_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_16(const __m256i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline __m256i Sum3_32(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi32(src0, src1);
+ return _mm256_add_epi32(sum, src2);
+}
+
+inline __m128i Sum3_32(const __m128i src[3]) {
+ return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_32(const __m256i src[3]) {
+ return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m256i Sum3WLo16(const __m256i src[3]) {
+ const __m256i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WHi16(const __m256i src[3]) {
+ const __m256i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m256i Sum5_16(const __m256i src[5]) {
+ const __m256i sum01 = _mm256_add_epi16(src[0], src[1]);
+ const __m256i sum23 = _mm256_add_epi16(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return _mm256_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1,
+ const __m256i* const src2, const __m256i* const src3,
+ const __m256i* const src4) {
+ const __m256i sum01 = _mm256_add_epi32(*src0, *src1);
+ const __m256i sum23 = _mm256_add_epi32(*src2, *src3);
+ const __m256i sum = _mm256_add_epi32(sum01, sum23);
+ return _mm256_add_epi32(sum, *src4);
+}
+
+inline __m128i Sum5_32(const __m128i src[5]) {
+ return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline __m256i Sum5_32(const __m256i src[5]) {
+ return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum3Horizontal16(const __m128i src[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ return Sum3_16(s);
+}
+
+inline __m256i Sum3Horizontal16(const uint16_t* const src,
+ const ptrdiff_t over_read_in_bytes) {
+ __m256i s[3];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+ return Sum3_16(s);
+}
+
+inline __m128i Sum5Horizontal16(const __m128i src[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ return Sum5_16(s);
+}
+
+inline __m256i Sum5Horizontal16(const uint16_t* const src,
+ const ptrdiff_t over_read_in_bytes) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8);
+ return Sum5_16(s);
+}
+
+inline void SumHorizontal16(const uint16_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const row3, __m256i* const row5) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8);
+ const __m256i sum04 = _mm256_add_epi16(s[0], s[4]);
+ *row3 = Sum3_16(s + 1);
+ *row5 = _mm256_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal16(const uint16_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const row3_0, __m256i* const row3_1,
+ __m256i* const row5_0, __m256i* const row5_1) {
+ SumHorizontal16(src + 0, over_read_in_bytes + 0, row3_0, row5_0);
+ SumHorizontal16(src + 16, over_read_in_bytes + 32, row3_1, row5_1);
+}
+
+inline void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = _mm_add_epi32(src[0], src[4]);
+ *row_sq3 = Sum3_32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m256i src[5], __m256i* const row_sq3,
+ __m256i* const row_sq5) {
+ const __m256i sum04 = _mm256_add_epi32(src[0], src[4]);
+ *row_sq3 = Sum3_32(src + 1);
+ *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_32(src + 0, s);
+ SumHorizontal32(s, row_sq3_0, row_sq5_0);
+ Prepare5_32(src + 1, s);
+ SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
+inline void SumHorizontal32(const __m256i src[3], __m256i* const row_sq3_0,
+ __m256i* const row_sq3_1, __m256i* const row_sq5_0,
+ __m256i* const row_sq5_1) {
+ __m256i s[5];
+ Prepare5_32(src + 0, s);
+ SumHorizontal32(s, row_sq3_0, row_sq5_0);
+ Prepare5_32(src + 1, s);
+ SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
+inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum3_32(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum3_32(s);
+}
+
+inline void Sum3Horizontal32(const __m256i src[3], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum3_32(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum3_32(s);
+}
+
+inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_32(src + 0, s);
+ dst[0] = Sum5_32(s);
+ Prepare5_32(src + 1, s);
+ dst[1] = Sum5_32(s);
+}
+
+inline void Sum5Horizontal32(const __m256i src[3], __m256i dst[2]) {
+ __m256i s[5];
+ Prepare5_32(src + 0, s);
+ dst[0] = Sum5_32(s);
+ Prepare5_32(src + 1, s);
+ dst[1] = Sum5_32(s);
+}
+
+inline void SumHorizontal16(const __m128i src[2], __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum04 = _mm_add_epi16(s[0], s[4]);
+ *row3 = Sum3_16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
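+// The Sum343*() helpers below compute the 3-4-3 weighted sum of three
+// neighboring values: 3 * a + 4 * b + 3 * c == 3 * (a + b + c) + b, which is
+// why the code triples the plain 3-tap sum and then adds the center tap.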
+inline __m256i Sum343Lo(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WLo16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343Hi(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WHi16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343(const __m256i src[3]) {
+ const __m256i sum = Sum3_32(src);
+ const __m256i sum3 = Sum3_32(sum, sum, sum);
+ return _mm256_add_epi32(sum3, src[1]);
+}
+
+inline void Sum343(const __m256i src[3], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum343(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum343(s);
+}
+
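+// Similarly, the Sum565*() helpers compute the 5-6-5 weighted sum:
+// 5 * a + 6 * b + 5 * c == 5 * (a + b + c) + b, with the multiply by 5
+// decomposed into (sum << 2) + sum.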
+inline __m256i Sum565Lo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m256i Sum565Hi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m256i Sum565(const __m256i src[3]) {
+ const __m256i sum = Sum3_32(src);
+ const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+ const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+ return _mm256_add_epi32(sum5, src[1]);
+}
+
+inline void Sum565(const __m256i src[3], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum565(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum565(s);
+}
+
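+// BoxSum() computes, for each of the first two rows, the horizontal 3-tap
+// and 5-tap sums of the source pixels (sum3/sum5) and of their squares
+// (square_sum3/square_sum5), using a 128-bit 8-pixel prologue followed by
+// 32 pixels per 256-bit loop iteration.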
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ const ptrdiff_t overread_in_bytes_128 =
+ kOverreadInBytesPass1_128 - sizeof(*src) * width;
+ const ptrdiff_t overread_in_bytes_256 =
+ kOverreadInBytesPass1_256 - sizeof(*src) * width;
+ int y = 2;
+ do {
+ __m128i s0[2], sq_128[4], s3, s5, sq3[2], sq5[2];
+ __m256i sq[8];
+ s0[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0);
+ s0[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+ Square(s0[0], sq_128 + 0);
+ Square(s0[1], sq_128 + 2);
+ SumHorizontal16(s0, &s3, &s5);
+ StoreAligned16(sum3, s3);
+ StoreAligned16(sum5, s5);
+ SumHorizontal32(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]);
+ StoreAligned32U32(square_sum3, sq3);
+ StoreAligned32U32(square_sum5, sq5);
+ src += 8;
+ sum3 += 8;
+ sum5 += 8;
+ square_sum3 += 8;
+ square_sum5 += 8;
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i s[2], row3[2], row5[2], row_sq3[2], row_sq5[2];
+ s[0] = LoadUnaligned32Msan(
+ src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+ s[1] = LoadUnaligned32Msan(
+ src + 24,
+ overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24));
+ Square(s[0], sq + 2);
+ Square(s[1], sq + 6);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ SumHorizontal16(
+ src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8),
+ &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned64(sum3, row3);
+ StoreAligned64(sum5, row5);
+ SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+ &row_sq5[1]);
+ StoreAligned64(square_sum3 + 0, row_sq3);
+ StoreAligned64(square_sum5 + 0, row_sq5);
+ SumHorizontal32(sq + 4, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+ &row_sq5[1]);
+ StoreAligned64(square_sum3 + 16, row_sq3);
+ StoreAligned64(square_sum5 + 16, row_sq5);
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ src += 32;
+ sum3 += 32;
+ sum5 += 32;
+ square_sum3 += 32;
+ square_sum5 += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sum3 += sum_stride - sum_width - 8;
+ sum5 += sum_stride - sum_width - 8;
+ square_sum3 += sum_stride - sum_width - 8;
+ square_sum5 += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ int overread_in_bytes_128, overread_in_bytes_256;
+ if (size == 3) {
+ overread_in_bytes_128 = kOverreadInBytesPass2_128;
+ overread_in_bytes_256 = kOverreadInBytesPass2_256;
+ } else {
+ overread_in_bytes_128 = kOverreadInBytesPass1_128;
+ overread_in_bytes_256 = kOverreadInBytesPass1_256;
+ }
+ overread_in_bytes_128 -= sizeof(*src) * width;
+ overread_in_bytes_256 -= sizeof(*src) * width;
+ int y = 2;
+ do {
+ __m128i s_128[2], ss, sq_128[4], sqs[2];
+ __m256i sq[8];
+ s_128[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128);
+ s_128[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+ Square(s_128[0], sq_128 + 0);
+ Square(s_128[1], sq_128 + 2);
+ if (size == 3) {
+ ss = Sum3Horizontal16(s_128);
+ Sum3Horizontal32(sq_128, sqs);
+ } else {
+ ss = Sum5Horizontal16(s_128);
+ Sum5Horizontal32(sq_128, sqs);
+ }
+ StoreAligned16(sums, ss);
+ StoreAligned32U32(square_sums, sqs);
+ src += 8;
+ sums += 8;
+ square_sums += 8;
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i s[2], row[2], row_sq[4];
+ s[0] = LoadUnaligned32Msan(
+ src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+ s[1] = LoadUnaligned32Msan(
+ src + 24,
+ overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24));
+ Square(s[0], sq + 2);
+ Square(s[1], sq + 6);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ if (size == 3) {
+ row[0] = Sum3Horizontal16(
+ src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+ row[1] =
+ Sum3Horizontal16(src + 16, overread_in_bytes_256 +
+ sizeof(*src) * (sum_width - x + 24));
+ Sum3Horizontal32(sq + 0, row_sq + 0);
+ Sum3Horizontal32(sq + 4, row_sq + 2);
+ } else {
+ row[0] = Sum5Horizontal16(
+ src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+ row[1] =
+ Sum5Horizontal16(src + 16, overread_in_bytes_256 +
+ sizeof(*src) * (sum_width - x + 24));
+ Sum5Horizontal32(sq + 0, row_sq + 0);
+ Sum5Horizontal32(sq + 4, row_sq + 2);
+ }
+ StoreAligned64(sums, row);
+ StoreAligned64(square_sums + 0, row_sq + 0);
+ StoreAligned64(square_sums + 16, row_sq + 2);
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ src += 32;
+ sums += 32;
+ square_sums += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sums += sum_stride - sum_width - 8;
+ square_sums += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+ // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i b = VrshrU16(sum, 2);
+ const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+ return _mm_packus_epi32(z0, z1);
+}
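+
+// The overload above (and its 256-bit counterpart below) applies rounding
+// right-shifts of 2 bits to |sum| and 4 bits to |sum_sq| first. This matches
+// the self-guided filter's handling of bitdepths above 8, where the box sum
+// is scaled down by (bitdepth - 8) bits and the squared sum by
+// 2 * (bitdepth - 8) bits before computing the variance, i.e. 2 and 4 bits
+// for 10-bit input.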
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m256i dxd = _mm256_madd_epi16(sum, sum);
+ // _mm256_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
+ __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+ const __m256i sub = _mm256_sub_epi32(axn, dxd);
+ const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+ const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m256i b = VrshrU16(sum, 2);
+ const __m256i sum_lo = _mm256_unpacklo_epi16(b, _mm256_setzero_si256());
+ const __m256i sum_hi = _mm256_unpackhi_epi16(b, _mm256_setzero_si256());
+ const __m256i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+ const __m256i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+ return _mm256_packus_epi32(z0, z1);
+}
+
+inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
+ const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+ const __m128i m0 = VmullLo16(m, sum);
+ const __m128i m1 = VmullHi16(m, sum);
+ b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
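+
+// The quarter trick in CalculateB5(): _mm_maddubs_epi16() multiplies the
+// unsigned 8-bit |ma| (at most 255) by the signed 8-bit constant 41, so
+// ma * 41 <= 10455 fits in 16 bits. Since 164 == 41 * 4, shifting the final
+// product right by (kSgrProjReciprocalBits - 2) instead of
+// kSgrProjReciprocalBits restores the missing factor of 4:
+// (ma * 41 * sum) >> 10 == (ma * 164 * sum) >> 12.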
+
+inline void CalculateB5(const __m256i sum, const __m256i ma, __m256i b[2]) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
+ const __m256i m =
+ _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter));
+ const __m256i m0 = VmullLo16(m, sum);
+ const __m256i m1 = VmullHi16(m, sum);
+ b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+ b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateB3(const __m256i sum, const __m256i ma, __m256i b[2]) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+ const __m256i m0 = VmullLo16(ma, sum);
+ const __m256i m1 = VmullHi16(ma, sum);
+ const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+ const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+ b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+ b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i b[2]) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+ // The values are not actually stored to memory and reloaded; the compiler
+ // keeps |temp| in a 64-bit general-purpose register, which is faster than
+ // using _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+ // b = ma * sum * one_over_n
+ // |ma| is in range [0, 255].
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ if (n == 9) {
+ CalculateB3(sum, maq, b);
+ } else {
+ CalculateB5(sum, maq, b);
+ }
+}
+
+// Repeat the first 48 elements in kSgrMaLookup with a period of 16.
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5};
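+
+// The duplication makes each 16-entry block of the table available in both
+// 128-bit lanes of a ymm register, because _mm256_shuffle_epi8() (used by
+// ShuffleIndex() below) looks up each lane only within its own 16-byte half
+// of the table.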
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction, or from the sign bit of the index.
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+ __m128i mask;
+ mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+ mask = _mm_or_si128(mask, index);
+ return _mm_shuffle_epi8(table, mask);
+}
+
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+ __m256i mask;
+ mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+ mask = _mm256_or_si256(mask, index);
+ return _mm256_shuffle_epi8(table, mask);
+}
+
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+ const int threshold) {
+ const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+ const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+ return _mm_add_epi8(value, offset);
+}
+
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+ const int threshold) {
+ const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+ const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
+ return _mm256_add_epi8(value, offset);
+}
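+
+// AdjustValue() relies on _mm_cmpgt_epi8() returning 0 or -1 per lane: each
+// call decrements |value| exactly in the lanes where index > threshold. For
+// indices above 47, seeding with 5 and applying the thresholds
+// 55/72/101/169/254 below leaves each lane at 5 minus the number of
+// thresholds it exceeds, reproducing the tail of kSgrMaLookup (values 5 down
+// to 0).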
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i* const ma, __m128i b0[2],
+ __m128i b1[2]) {
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+ const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+ const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+ const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+ __m128i idx;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+ // Elements whose indices are no less than 48 get shuffle result 0 from all
+ // three lookups below and are fixed up afterwards.
+ // Get shuffle results for indices in range [0, 15].
+ *ma = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ *ma = _mm_or_si128(*ma, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res2 = ShuffleIndex(c2, idx);
+ *ma = _mm_or_si128(*ma, res2);
+
+ // For elements whose indices are larger than 47, the looked-up values
+ // change only rarely as the index increases, so we use comparison and
+ // arithmetic operations to compute their values.
+ // Add -128 to apply signed comparison instructions.
+ idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+ // Elements whose indices are larger than 47 (with value 0) are set to 5.
+ *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+ *ma = AdjustValue(*ma, idx, 55); // 55 is the last index whose value is 5.
+ *ma = AdjustValue(*ma, idx, 72); // 72 is the last index whose value is 4.
+ *ma = AdjustValue(*ma, idx, 101); // 101 is the last index whose value is 3.
+ *ma = AdjustValue(*ma, idx, 169); // 169 is the last index whose value is 2.
+ *ma = AdjustValue(*ma, idx, 254); // 254 is the last index whose value is 1.
+
+ // b = ma * sum * one_over_n
+ // |ma| is in range [0, 255].
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ CalculateB3(sum[0], maq0, b0);
+ const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ CalculateB3(sum[1], maq1, b1);
+}
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+ __m256i ma[3], __m256i b0[2], __m256i b1[2]) {
+ static_assert(n == 9 || n == 25, "");
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+ const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+ const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+ const __m256i indices = _mm256_packus_epi16(index[0], index[1]); // 0 2 1 3
+ __m256i idx, mas;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+ // Elements whose indices are no less than 48 get shuffle result 0 from all
+ // three lookups below and are fixed up afterwards.
+ // Get shuffle results for indices in range [0, 15].
+ mas = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ mas = _mm256_or_si256(mas, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res2 = ShuffleIndex(c2, idx);
+ mas = _mm256_or_si256(mas, res2);
+
+ // For elements whose indices are larger than 47, the looked-up values
+ // change only rarely as the index increases, so we use comparison and
+ // arithmetic operations to compute their values.
+ // Add -128 to apply signed comparison instructions.
+ idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+ // Elements whose indices are larger than 47 (with value 0) are set to 5.
+ mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
+ mas = AdjustValue(mas, idx, 55); // 55 is the last index whose value is 5.
+ mas = AdjustValue(mas, idx, 72); // 72 is the last index whose value is 4.
+ mas = AdjustValue(mas, idx, 101); // 101 is the last index whose value is 3.
+ mas = AdjustValue(mas, idx, 169); // 169 is the last index whose value is 2.
+ mas = AdjustValue(mas, idx, 254); // 254 is the last index whose value is 1.
+
+ ma[2] = _mm256_permute4x64_epi64(mas, 0x63); // 32-39 8-15 16-23 24-31
+ ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc); // 0-7 8-15 16-23 24-31
+ ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21);
+
+ // b = ma * sum * one_over_n
+ // |ma| is in range [0, 255].
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
+ const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
+ __m256i sums[2];
+ sums[0] = _mm256_permute2x128_si256(sum[0], sum[1], 0x20);
+ sums[1] = _mm256_permute2x128_si256(sum[0], sum[1], 0x31);
+ if (n == 9) {
+ CalculateB3(sums[0], maq0, b0);
+ CalculateB3(sums[1], maq1, b1);
+ } else {
+ CalculateB5(sums[0], maq0, b0);
+ CalculateB5(sums[1], maq1, b1);
+ }
+}
+
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i b[2]) {
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i b[2]) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m256i b3[3], const ptrdiff_t x,
+ __m256i sum_b343[2], __m256i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m256i b[3], sum_b111[2];
+ Prepare3_32(b3 + 0, b);
+ sum_b111[0] = Sum3_32(b);
+ sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2);
+ sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[0] = _mm256_add_epi32(sum_b343[0], b[1]);
+ Prepare3_32(b3 + 1, b);
+ sum_b111[1] = Sum3_32(b);
+ sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2);
+ sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[1] = _mm256_add_epi32(sum_b343[1], b[1]);
+ StoreAligned64(b444 + x, sum_b444);
+ StoreAligned64(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32_ma(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned32_ma(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32_ma(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned32_ma(ma343 + x, *sum_ma343);
+ Store343_444(b3, x + kMaStoreOffset, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+// Don't combine the following 2 functions; the combined version would be
+// slower.
+inline void Store343_444(const __m256i ma3[3], const __m256i b3[6],
+ const ptrdiff_t x, __m256i* const sum_ma343_lo,
+ __m256i* const sum_ma343_hi,
+ __m256i* const sum_ma444_lo,
+ __m256i* const sum_ma444_hi, __m256i sum_b343_lo[2],
+ __m256i sum_b343_hi[2], __m256i sum_b444_lo[2],
+ __m256i sum_b444_hi[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_mat343[2], sum_mat444[2];
+ const __m256i sum_ma111_lo = Sum3WLo16(ma3);
+ sum_mat444[0] = _mm256_slli_epi16(sum_ma111_lo, 2);
+ const __m256i sum333_lo = _mm256_sub_epi16(sum_mat444[0], sum_ma111_lo);
+ sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]);
+ Store343_444(b3, x, sum_b343_lo, sum_b444_lo, b343, b444);
+ const __m256i sum_ma111_hi = Sum3WHi16(ma3);
+ sum_mat444[1] = _mm256_slli_epi16(sum_ma111_hi, 2);
+ *sum_ma444_lo = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x20);
+ *sum_ma444_hi = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x31);
+ StoreAligned32(ma444 + x + 0, *sum_ma444_lo);
+ StoreAligned32(ma444 + x + 16, *sum_ma444_hi);
+ const __m256i sum333_hi = _mm256_sub_epi16(sum_mat444[1], sum_ma111_hi);
+ sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]);
+ *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20);
+ *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31);
+ StoreAligned32(ma343 + x + 0, *sum_ma343_lo);
+ StoreAligned32(ma343 + x + 16, *sum_ma343_hi);
+ Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444_hi, b343, b444);
+}
+
+inline void Store343_444(const __m256i ma3[3], const __m256i b3[6],
+ const ptrdiff_t x, __m256i* const sum_ma343_lo,
+ __m256i* const sum_ma343_hi, __m256i sum_b343_lo[2],
+ __m256i sum_b343_hi[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444[2], sum_b444[2], sum_mat343[2];
+ const __m256i sum_ma111_lo = Sum3WLo16(ma3);
+ sum_ma444[0] = _mm256_slli_epi16(sum_ma111_lo, 2);
+ const __m256i sum333_lo = _mm256_sub_epi16(sum_ma444[0], sum_ma111_lo);
+ sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]);
+ Store343_444(b3, x, sum_b343_lo, sum_b444, b343, b444);
+ const __m256i sum_ma111_hi = Sum3WHi16(ma3);
+ sum_ma444[1] = _mm256_slli_epi16(sum_ma111_hi, 2);
+ StoreAligned64_ma(ma444 + x, sum_ma444);
+ const __m256i sum333_hi = _mm256_sub_epi16(sum_ma444[1], sum_ma111_hi);
+ sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]);
+ *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20);
+ *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31);
+ StoreAligned32(ma343 + x + 0, *sum_ma343_lo);
+ StoreAligned32(ma343 + x + 16, *sum_ma343_hi);
+ Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444, b343, b444);
+}
+
+inline void PermuteB(const __m256i t[4], __m256i b[7]) {
+ // Input:
+ // 0 1 2 3 // b[0]
+ // 4 5 6 7 // b[1]
+ // 8 9 10 11 24 25 26 27 // t[0]
+ // 12 13 14 15 28 29 30 31 // t[1]
+ // 16 17 18 19 32 33 34 35 // t[2]
+ // 20 21 22 23 36 37 38 39 // t[3]
+
+ // Output:
+ // 0 1 2 3 8 9 10 11 // b[0]
+ // 4 5 6 7 12 13 14 15 // b[1]
+ // 8 9 10 11 16 17 18 19 // b[2]
+ // 16 17 18 19 24 25 26 27 // b[3]
+ // 20 21 22 23 28 29 30 31 // b[4]
+ // 24 25 26 27 32 33 34 35 // b[5]
+ // 20 21 22 23 36 37 38 39 // b[6]
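+ // _mm256_permute2x128_si256(a, b, imm) selects the output's lower 128-bit
+ // lane with imm[1:0] and its upper lane with imm[5:4], where 0/1 pick the
+ // lower/upper lane of |a| and 2/3 the lower/upper lane of |b|; e.g. 0x21
+ // concatenates the upper lane of |a| and the lower lane of |b|.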
+ b[0] = _mm256_permute2x128_si256(b[0], t[0], 0x21);
+ b[1] = _mm256_permute2x128_si256(b[1], t[1], 0x21);
+ b[2] = _mm256_permute2x128_si256(t[0], t[2], 0x20);
+ b[3] = _mm256_permute2x128_si256(t[2], t[0], 0x30);
+ b[4] = _mm256_permute2x128_si256(t[3], t[1], 0x30);
+ b[5] = _mm256_permute2x128_si256(t[0], t[2], 0x31);
+ b[6] = t[3];
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s5[2][5], sq5[5][2];
+ Square(s[0][1], sq[0] + 2);
+ Square(s[1][1], sq[1] + 2);
+ s5[0][3] = Sum5Horizontal16(s[0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal16(s[1]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5Horizontal32(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5Horizontal32(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint16_t* const src0, const uint16_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width,
+ const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m256i sq[2][8], __m256i ma[3],
+ __m256i b[7]) {
+ __m256i s[2], s5[2][5], sq5[5][2], sum[2], index[2], t[4];
+ s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16);
+ s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16);
+ Square(s[0], sq[0] + 2);
+ Square(s[1], sq[1] + 2);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21);
+ s5[0][3] = Sum5Horizontal16(src0 + 0, over_read_in_bytes + 0);
+ s5[1][3] = Sum5Horizontal16(src0 + 16, over_read_in_bytes + 32);
+ s5[0][4] = Sum5Horizontal16(src1 + 0, over_read_in_bytes + 0);
+ s5[1][4] = Sum5Horizontal16(src1 + 16, over_read_in_bytes + 32);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ Sum5Horizontal32(sq[0], sq5[3]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ Sum5Horizontal32(sq[1], sq5[4]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48);
+ s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48);
+ Square(s[0], sq[0] + 6);
+ Square(s[1], sq[1] + 6);
+ sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21);
+ sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21);
+ sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21);
+ sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21);
+ Sum5Horizontal32(sq[0] + 4, sq5[3]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+ Sum5Horizontal32(sq[1] + 4, sq5[4]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, t, t + 2);
+ PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s5[5], sq5[5][2];
+ Square(s[1], sq + 2);
+ s5[3] = s5[4] = Sum5Horizontal16(s);
+ Sum5Horizontal32(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ __m256i sq[8], __m256i ma[3], __m256i b[7]) {
+ const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+ __m256i s5[2][5], sq5[5][2], sum[2], index[2], t[4];
+ Square(s0, sq + 2);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ s5[0][3] = Sum5Horizontal16(src + 0, over_read_in_bytes + 0);
+ s5[1][3] = Sum5Horizontal16(src + 16, over_read_in_bytes + 32);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5Horizontal32(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+ Square(s1, sq + 6);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ Sum5Horizontal32(sq + 4, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, t, t + 2);
+ PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s3[3], sq3[3][2];
+ Square(s[1], sq + 2);
+ s3[2] = Sum3Horizontal16(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3Horizontal32(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[8],
+ __m256i ma[3], __m256i b[7]) {
+ __m256i s[2], s3[4], sq3[3][2], sum[2], index[2], t[4];
+ s[0] = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+ s[1] = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+ Square(s[0], sq + 2);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ s3[2] = Sum3Horizontal16(src, over_read_in_bytes);
+ s3[3] = Sum3Horizontal16(src + 16, over_read_in_bytes + 32);
+ StoreAligned64(sum3[2] + x, s3 + 2);
+ Sum3Horizontal32(sq + 0, sq3[2]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ LoadAligned32x2U16(sum3, x, s3);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ Square(s[1], sq + 6);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ Sum3Horizontal32(sq + 4, sq3[2]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate<9>(sum, index, ma, t, t + 2);
+ PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][3],
+ __m128i b3[2][10], __m128i* const ma5, __m128i b5[2]) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+ Square(s[0][1], sq[0] + 2);
+ Square(s[1][1], sq[1] + 2);
+ SumHorizontal16(s[0], &s3[2], &s5[3]);
+ SumHorizontal16(s[1], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+ CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
+ ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const uint16_t* const src0, const uint16_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t x,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m256i sq[2][8], __m256i ma3[2][3],
+ __m256i b3[2][7], __m256i ma5[3], __m256i b5[7]) {
+ __m256i s[2], s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2],
+ index_3[2][2], sum_5[2], index_5[2], t[4];
+ s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16);
+ s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16);
+ Square(s[0], sq[0] + 2);
+ Square(s[1], sq[1] + 2);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21);
+ SumHorizontal16(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal16(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4],
+ &s5[1][4]);
+ StoreAligned32(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned32(sum3[2] + x + 16, s3[1][2]);
+ StoreAligned32(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned32(sum3[3] + x + 16, s3[1][3]);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ StoreAligned64(square_sum3[3] + x, sq3[3]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0],
+ &index_3[1][0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48);
+ s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48);
+ Square(s[0], sq[0] + 6);
+ Square(s[1], sq[1] + 6);
+ sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21);
+ sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21);
+ sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21);
+ sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21);
+ SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+ StoreAligned64(square_sum3[3] + x + 16, sq3[3]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1],
+ &index_3[1][1]);
+ CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], t, t + 2);
+ PermuteB(t, b3[0]);
+ CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], t, t + 2);
+ PermuteB(t, b3[1]);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2);
+ PermuteB(t, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3,
+ __m128i* const ma5, __m128i b3[2], __m128i b5[2]) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ Square(s[1], sq + 2);
+ SumHorizontal16(s, &s3[2], &s5[3]);
+ SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5],
+ __m256i b5[5]) {
+ const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+ __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2],
+ sum_5[2], index_5[2], t[4];
+ Square(s0, sq + 2);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ SumHorizontal16(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+ Square(s1, sq + 6);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]);
+ CalculateIntermediate<9>(sum_3, index_3, ma3, t, t + 2);
+ PermuteB(t, b3);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2);
+ PermuteB(t, b5);
+}
+
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+ const uint16_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2][2], ma0, sq_128[2][4], b0[2];
+ __m256i mas[3], sq[2][8], bs[10];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq_128[0]);
+ Square(s[1][0], sq_128[1]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0);
+ sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+ sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+ sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+ sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma5[3], ma[2], b[4];
+ BoxFilterPreProcess5(
+ src0 + x + 8, src1 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned64_ma(ma565, ma);
+ Sum565(bs + 0, b + 0);
+ Sum565(bs + 3, b + 2);
+ StoreAligned64(b565, b + 0);
+ StoreAligned64(b565 + 16, b + 2);
+ sq[0][0] = sq[0][6];
+ sq[0][1] = sq[0][7];
+ sq[1][0] = sq[1][6];
+ sq[1][1] = sq[1][7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
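+
+// Note on the pattern above: the 128-bit "Lo" results are broadcast into both
+// halves of a 256-bit register with SetrM128i so that the 32-pixel main loop
+// always sees a full left context, and each iteration ends by rotating the
+// rightmost state (sq[*][6..7], mas[2], bs[5..6]) into the leftmost slots to
+// seed the next iteration. Conceptually, per iteration:
+//   left_context = right_context;  // carry the last columns forward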
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint16_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ const ptrdiff_t overread_in_bytes_128 =
+ kOverreadInBytesPass2_128 - sizeof(*src) * width;
+ __m128i s[2], ma0, sq_128[4], b0[2];
+ __m256i mas[3], sq[8], bs[7];
+ s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0);
+ s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+ Square(s[0], sq_128);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, b0);
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma3[3];
+ BoxFilterPreProcess3(
+ src + x + 8, kOverreadInBytesPass2_256 + sizeof(*src) * (x + 8 - width),
+ x + 8, sum_width, scale, sum3, square_sum3, sq, mas, bs);
+ Prepare3_8(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 3, kMaStoreOffset, ma343, ma444, b343, b444);
+ ma444 += 32;
+ b444 += 32;
+ } else {
+ __m256i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned64_ma(ma343, ma);
+ Sum343(bs + 0, b + 0);
+ Sum343(bs + 3, b + 2);
+ StoreAligned64(b343 + 0, b + 0);
+ StoreAligned64(b343 + 16, b + 2);
+ }
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ ma343 += 32;
+ b343 += 32;
+ x += 32;
+ } while (x < width);
+}
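+
+// The |calculate444| template flag distinguishes the first processed row,
+// which only needs the 3:4:3 weighted sums, from later rows, which also emit
+// the 4:4:4 weighted sums. Both instantiations appear in BoxFilterProcessPass2
+// below:
+//   BoxSumFilterPreProcess3<false>(src, ...);  // first row: 343 sums only
+//   BoxSumFilterPreProcess3<true>(s, ...);     // later rows: 343 and 444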
+
+inline void BoxSumFilterPreProcess(
+ const uint16_t* const src0, const uint16_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+ uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+ uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2][4], ma3_128[2][3], ma5_128[3], sq_128[2][8], b3_128[2][10],
+ b5_128[10];
+ __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq_128[0]);
+ Square(s[1][0], sq_128[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_128[0], b5_128);
+ sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+ sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+ sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+ sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+ ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]);
+ ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]);
+ ma5[0] = SetrM128i(ma5_128[0], ma5_128[0]);
+ b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]);
+ b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]);
+ b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]);
+ b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]);
+ b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+ b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+ int x = 0;
+ do {
+ __m256i ma[2], b[4], ma3x[3], ma5x[3];
+ BoxFilterPreProcess(
+ src0 + x + 8, src1 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3,
+ ma5, b5);
+ Prepare3_8(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned64_ma(ma343[0] + x, ma);
+ Sum343(b3[0], b);
+ Sum343(b3[0] + 3, b + 2);
+ StoreAligned64(b343[0] + x, b);
+ StoreAligned64(b343[0] + x + 16, b + 2);
+ Prepare3_8(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444Hi(ma3x, b3[1] + 3, x + kMaStoreOffset, ma343[1], ma444,
+ b343[1], b444);
+ Prepare3_8(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned64_ma(ma565, ma);
+ Sum565(b5, b);
+ StoreAligned64(b565, b);
+ Sum565(b5 + 3, b);
+ StoreAligned64(b565 + 16, b);
+ sq[0][0] = sq[0][6];
+ sq[0][1] = sq[0][7];
+ sq[1][0] = sq[1][6];
+ sq[1][1] = sq[1][7];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][5];
+ b3[0][1] = b3[0][6];
+ b3[1][0] = b3[1][5];
+ b3[1][1] = b3[1][6];
+ b5[0] = b5[5];
+ b5[1] = b5[6];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
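+
+// Worked scalar sketch of the ranges above (illustrative, not part of the
+// library). With kSgrProjSgrBits = 8 and kSgrProjRestoreBits = 4:
+//   int32_t FilterOutputScalar(int32_t ma_x_src, int32_t b, int shift) {
+//     const int32_t v = b - ma_x_src;                   // at most 22 bits
+//     return RightShiftWithRounding(v, 8 + shift - 4);  // v >> 8 or v >> 9
+//   }
+// The result always fits in a signed 16-bit lane, which is why callers can
+// pack the two halves with _mm256_packs_epi32.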
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+ const __m256i b[2]) {
+ const __m256i ma_x_src_lo = VmullLo16(ma, src);
+ const __m256i ma_x_src_hi = VmullHi16(ma, src);
+ const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return _mm256_packs_epi32(dst_lo, dst_hi); // 13 bits
+}
+
+inline __m256i CalculateFilteredOutputPass1(const __m256i src,
+ const __m256i ma[2],
+ const __m256i b[2][2]) {
+ const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+ __m256i b_sum[2];
+ b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src,
+ const __m256i ma[3],
+ const __m256i b[3][2]) {
+ const __m256i ma_sum = Sum3_16(ma);
+ __m256i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
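+
+// Naming note for the two passes above: the *565 buffers hold box sums whose
+// rows are weighted 5:6:5 (pass 1 filters two source rows per step), while
+// the *343 and *444 buffers hold the 3:4:3 and 4:4:4 row weightings consumed
+// by pass 2. Pass 1 therefore only needs
+//   ma_sum = ma[0] + ma[1],  b_sum = b[0] + b[1]
+// before the shared CalculateFilteredOutput<5> step.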
+
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+ const __m256i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+ return _mm256_add_epi16(src, vv);
+}
+
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+ const __m256i filter[2], const int w0,
+ const int w2) {
+ __m256i v[2];
+ const __m256i w0_w2 =
+ _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+ const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+ const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+ v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
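+
+// The packing above leans on _mm256_madd_epi16: every 32-bit lane of |w0_w2|
+// holds the 16-bit pair (w0, w2) and every lane of f_lo/f_hi holds the pair
+// (filter[0], filter[1]), so a single madd yields the per-pixel combination
+//   v = w0 * filter[0] + w2 * filter[1]
+// as a 32-bit value, ready for the final rounding in SelfGuidedFinal.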
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+ const __m256i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m256i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
+inline void ClipAndStore(uint16_t* const dst, const __m256i val) {
+ const __m256i val0 = _mm256_max_epi16(val, _mm256_setzero_si256());
+ const __m256i val1 = _mm256_min_epi16(val0, _mm256_set1_epi16(1023));
+ StoreUnaligned32(dst, val1);
+}
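+
+// The max/min pair above is the vector form of Clip3(val, 0, 1023): outputs
+// are clamped to the 10-bit pixel range [0, (1 << 10) - 1] before storing.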
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint16_t* const src, const uint16_t* const src0,
+ const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2][2], ma0, sq_128[2][4], b0[2];
+ __m256i mas[3], sq[2][8], bs[7];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq_128[0]);
+ Square(s[1][0], sq_128[1]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0);
+ sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+ sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+ sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+ sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma5[3], ma[4], b[4][2];
+ BoxFilterPreProcess5(
+ src0 + x + 8, src1 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[2] = Sum565Lo(ma5);
+ ma[3] = Sum565Hi(ma5);
+ ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20);
+ ma[3] = _mm256_permute2x128_si256(ma[2], ma[3], 0x31);
+ StoreAligned32(ma565[1] + x + 0, ma[1]);
+ StoreAligned32(ma565[1] + x + 16, ma[3]);
+ Sum565(bs + 0, b[1]);
+ Sum565(bs + 3, b[3]);
+ StoreAligned64(b565[1] + x, b[1]);
+ StoreAligned64(b565[1] + x + 16, b[3]);
+ const __m256i sr0_lo = LoadUnaligned32(src + x + 0);
+ ma[0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0]);
+ const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0);
+ ClipAndStore(dst + x + 0, d0);
+ const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+ ma[2] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[2]);
+ const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma + 2, b + 2);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0);
+ ClipAndStore(dst + x + 16, d1);
+ const __m256i sr1_lo = LoadUnaligned32(src + stride + x + 0);
+ const __m256i p10 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+ const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p10, w0);
+ ClipAndStore(dst + stride + x + 0, d10);
+ const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16);
+ const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[3], b[3]);
+ const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0);
+ ClipAndStore(dst + stride + x + 16, d11);
+ sq[0][0] = sq[0][6];
+ sq[0][1] = sq[0][7];
+ sq[1][0] = sq[1][6];
+ sq[1][1] = sq[1][7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2], ma0[2], sq_128[8], b0[6];
+ __m256i mas[3], sq[8], bs[7];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq_128);
+ BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq_128, &ma0[0],
+ b0);
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ mas[0] = SetrM128i(ma0[0], ma0[0]);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma5[3], ma[4], b[4][2];
+ BoxFilterPreProcess5LastRow(
+ src0 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[2] = Sum565Lo(ma5);
+ ma[3] = Sum565Hi(ma5);
+ Sum565(bs + 0, b[1]);
+ Sum565(bs + 3, b[3]);
+ const __m256i sr0_lo = LoadUnaligned32(src + x + 0);
+ ma[0] = LoadAligned32(ma565 + x);
+ ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20);
+ LoadAligned64(b565 + x, b[0]);
+ const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0);
+ ClipAndStore(dst + x + 0, d0);
+ const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+ ma[0] = LoadAligned32(ma565 + x + 16);
+ ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x31);
+ LoadAligned64(b565 + x + 16, b[2]);
+ const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma, b + 2);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0);
+ ClipAndStore(dst + x + 16, d1);
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes_128 =
+ kOverreadInBytesPass2_128 - sizeof(*src0) * width;
+ __m128i s0[2], ma0, sq_128[4], b0[2];
+ __m256i mas[3], sq[8], bs[7];
+ s0[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes_128 + 0);
+ s0[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes_128 + 16);
+ Square(s0[0], sq_128);
+ BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, b0);
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma[4], b[4][2], ma3[3];
+ BoxFilterPreProcess3(
+ src0 + x + 8,
+ kOverreadInBytesPass2_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+ sum_width, scale, sum3, square_sum3, sq, mas, bs);
+ Prepare3_8(mas, ma3);
+ Store343_444(ma3, bs, x, &ma[2], &ma[3], b[2], b[3], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m256i sr_lo = LoadUnaligned32(src + x + 0);
+ const __m256i sr_hi = LoadUnaligned32(src + x + 16);
+ ma[0] = LoadAligned32(ma343[0] + x);
+ ma[1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[0]);
+ LoadAligned64(b444[0] + x, b[1]);
+ const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ ma[1] = LoadAligned32(ma343[0] + x + 16);
+ ma[2] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[1]);
+ LoadAligned64(b444[0] + x + 16, b[2]);
+ const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 16, d1);
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint16_t* const src, const uint16_t* const src0,
+ const uint16_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2][4], ma3_128[2][3], ma5_0, sq_128[2][8], b3_128[2][10], b5_128[2];
+ __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq_128[0]);
+ Square(s[1][0], sq_128[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_0, b5_128);
+ sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+ sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+ sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+ sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+ ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]);
+ ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]);
+ b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]);
+ b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]);
+ b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]);
+ b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+ b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+ int x = 0;
+ do {
+ __m256i ma[3][4], mat[3][3], b[3][3][2], bt[3][3][2], p[2][2], ma3x[2][3],
+ ma5x[3];
+ BoxFilterPreProcess(
+ src0 + x + 8, src1 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3,
+ ma5, b5);
+ Prepare3_8(ma3[0], ma3x[0]);
+ Prepare3_8(ma3[1], ma3x[1]);
+ Prepare3_8(ma5, ma5x);
+ Store343_444(ma3x[0], b3[0], x, &ma[1][2], &mat[1][2], &ma[2][1],
+ &mat[2][1], b[1][2], bt[1][2], b[2][1], bt[2][1], ma343[2],
+ ma444[1], b343[2], b444[1]);
+ Store343_444(ma3x[1], b3[1], x, &ma[2][2], &mat[2][2], b[2][2], bt[2][2],
+ ma343[3], ma444[2], b343[3], b444[2]);
+
+ ma[0][2] = Sum565Lo(ma5x);
+ ma[0][3] = Sum565Hi(ma5x);
+ ma[0][1] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x20);
+ ma[0][3] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x31);
+ StoreAligned32(ma565[1] + x + 0, ma[0][1]);
+ StoreAligned32(ma565[1] + x + 16, ma[0][3]);
+ Sum565(b5, b[0][1]);
+ StoreAligned64(b565[1] + x, b[0][1]);
+ const __m256i sr0_lo = LoadUnaligned32(src + x);
+ const __m256i sr1_lo = LoadUnaligned32(src + stride + x);
+ ma[0][0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned32(ma343[0] + x);
+ ma[1][1] = LoadAligned32(ma444[0] + x);
+ // Keeping the following 4 redundant lines is faster. The reason is that
+ // there are not enough registers available, and spilling these values to
+ // memory and reloading them later is even slower.
+ ma[1][2] = LoadAligned32(ma343[2] + x); // Redundant line 1.
+ LoadAligned64(b343[0] + x, b[1][0]);
+ LoadAligned64(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ ma[2][0] = LoadAligned32(ma343[1] + x);
+ ma[2][1] = LoadAligned32(ma444[1] + x); // Redundant line 2.
+ LoadAligned64(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ClipAndStore(dst + x, d00);
+ const __m256i d10x = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+ ClipAndStore(dst + stride + x, d10x);
+
+ Sum565(b5 + 3, bt[0][1]);
+ StoreAligned64(b565[1] + x + 16, bt[0][1]);
+ const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+ const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16);
+ ma[0][2] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, bt[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0] + 2, bt[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][3], bt[0][1]);
+ mat[1][0] = LoadAligned32(ma343[0] + x + 16);
+ mat[1][1] = LoadAligned32(ma444[0] + x + 16);
+ mat[1][2] = LoadAligned32(ma343[2] + x + 16); // Redundant line 3.
+ LoadAligned64(b343[0] + x + 16, bt[1][0]);
+ LoadAligned64(b444[0] + x + 16, bt[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], bt[1]);
+ mat[2][0] = LoadAligned32(ma343[1] + x + 16);
+ mat[2][1] = LoadAligned32(ma444[1] + x + 16); // Redundant line 4.
+ LoadAligned64(b343[1] + x + 16, bt[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], bt[2]);
+ const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ ClipAndStore(dst + x + 16, d01);
+ const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ ClipAndStore(dst + stride + x + 16, d11);
+
+ sq[0][0] = sq[0][6];
+ sq[0][1] = sq[0][7];
+ sq[1][0] = sq[1][6];
+ sq[1][1] = sq[1][7];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][5];
+ b3[0][1] = b3[0][6];
+ b3[1][0] = b3[1][5];
+ b3[1][1] = b3[1][6];
+ b5[0] = b5[5];
+ b5[1] = b5[6];
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2], ma3_0, ma5_0, sq_128[4], b3_128[2], b5_128[2];
+ __m256i ma3[3], ma5[3], sq[8], b3[7], b5[7];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq_128);
+ BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+ sq_128, &ma3_0, &ma5_0, b3_128, b5_128);
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ ma3[0] = SetrM128i(ma3_0, ma3_0);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0] = SetrM128i(b3_128[0], b3_128[0]);
+ b3[1] = SetrM128i(b3_128[1], b3_128[1]);
+ b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+ b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+ int x = 0;
+ do {
+ __m256i ma[4], mat[4], b[3][2], bt[3][2], ma3x[3], ma5x[3], p[2];
+ BoxFilterPreProcessLastRow(
+ src0 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+ x + 8, scales, sum3, sum5, square_sum3, square_sum5, sq, ma3, ma5, b3,
+ b5);
+ Prepare3_8(ma3, ma3x);
+ Prepare3_8(ma5, ma5x);
+ ma[2] = Sum565Lo(ma5x);
+ Sum565(b5, b[1]);
+ mat[1] = Sum565Hi(ma5x);
+ Sum565(b5 + 3, bt[1]);
+ ma[3] = Sum343Lo(ma3x);
+ Sum343(b3, b[2]);
+ mat[2] = Sum343Hi(ma3x);
+ Sum343(b3 + 3, bt[2]);
+
+ const __m256i sr_lo = LoadUnaligned32(src + x);
+ ma[0] = LoadAligned32(ma565 + x);
+ ma[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x20);
+ mat[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x31);
+ LoadAligned64(b565 + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned32(ma343 + x);
+ ma[1] = LoadAligned32(ma444 + x);
+ ma[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x20);
+ LoadAligned64(b343 + x, b[0]);
+ LoadAligned64(b444 + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ const __m256i sr_hi = LoadUnaligned32(src + x + 16);
+ mat[0] = LoadAligned32(ma565 + x + 16);
+ LoadAligned64(b565 + x + 16, bt[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, mat, bt);
+ mat[0] = LoadAligned32(ma343 + x + 16);
+ mat[1] = LoadAligned32(ma444 + x + 16);
+ mat[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x31);
+ LoadAligned64(b343 + x + 16, bt[0]);
+ LoadAligned64(b444 + x + 16, bt[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, mat, bt);
+ const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 16, d1);
+
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ ma3[0] = ma3[2];
+ ma5[0] = ma5[2];
+ b3[0] = b3[5];
+ b3[1] = b3[6];
+ b5[0] = b5[5];
+ b5[1] = b5[6];
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint16_t* src,
+ const ptrdiff_t stride, const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444[0], ma565[0],
+ b343, b444[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint16_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+ square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+ b444[0], b565[0], dst);
+ }
+}
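+
+// The sum3/sum5 (and ma*/b*) pointer arrays behave as ring buffers of per-row
+// sums: every two output rows the pointers rotate left by two, so the oldest
+// rows become scratch space for the newest. A sketch of what
+// Circulate5PointersBy2 is assumed to do (illustrative only):
+//   void Circulate5By2(uint16_t* p[5]) {
+//     uint16_t* const t0 = p[0];
+//     uint16_t* const t1 = p[1];
+//     p[0] = p[2]; p[1] = p[3]; p[2] = p[4];
+//     p[3] = t0; p[4] = t1;
+//   }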
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint16_t* src, const ptrdiff_t stride,
+ const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride,
+ sum5[1], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint16_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+ sum_width, scale, w0, sum5, square_sum5, ma565[0],
+ b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint16_t* src, const ptrdiff_t stride,
+ const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, temp_stride,
+ sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint16_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
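+
+// In pass 2 the weights are complementary: multiplier[0] must be 0, so
+// w0 = (1 << kSgrProjPrecisionBits) - w1 and each pixel is produced as
+//   dst = Clip3(src + RightShiftWithRounding(
+//                         w0 * filter,
+//                         kSgrProjRestoreBits + kSgrProjPrecisionBits),
+//               0, 1023)
+// via SelfGuidedSingleMultiplier, SelfGuidedFinal and ClipAndStore above.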
+
+// If |width| is not a multiple of 32, up to 31 extra pixels are written to
+// |dest| at the end of each row. It is safe to overwrite this output as it is
+// not part of the visible frame.
+void SelfGuidedFilter_AVX2(
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ auto* const dst = static_cast<uint16_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_AVX2(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_AVX2;
+#endif
+#if DSP_ENABLED_10BPP_AVX2(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_AVX2;
+#endif
+}
+
+} // namespace
+
+void LoopRestorationInit10bpp_AVX2() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline void WienerHorizontalClip(const __m128i s[2],
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
+ const __m128i offsets = _mm_set1_epi16(-offset);
+ const __m128i limits = _mm_set1_epi16(limit - offset);
+ const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+ const __m128i sum0 = _mm_add_epi32(s[0], round);
+ const __m128i sum1 = _mm_add_epi32(s[1], round);
+ const __m128i rounded_sum0 = _mm_srai_epi32(sum0, kInterRoundBitsHorizontal);
+ const __m128i rounded_sum1 = _mm_srai_epi32(sum1, kInterRoundBitsHorizontal);
+ const __m128i rounded_sum = _mm_packs_epi32(rounded_sum0, rounded_sum1);
+ const __m128i d0 = _mm_max_epi16(rounded_sum, offsets);
+ const __m128i d1 = _mm_min_epi16(d0, limits);
+ StoreAligned16(wiener_buffer, d1);
+}
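+
+// Worked example of the constants above, assuming libgav1's
+// kWienerFilterBits = 7 and kInterRoundBitsHorizontal = 3:
+//   offset = 1 << (10 + 7 - 3 - 1) = 8192
+//   limit  = (8192 << 2) - 1      = 32767
+// The biased clamp therefore keeps the stored intermediates in
+// [-offset, limit - offset] = [-8192, 24575], which fits in int16_t.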
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi32(coefficients, 0x0);
+ filter[1] = _mm_shuffle_epi32(coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[7], madds[4];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ s[5] = LoadUnaligned16(src + x + 5);
+ s[6] = LoadUnaligned16(src + x + 6);
+ const __m128i s06 = _mm_add_epi16(s[0], s[6]);
+ const __m128i s15 = _mm_add_epi16(s[1], s[5]);
+ const __m128i s24 = _mm_add_epi16(s[2], s[4]);
+ const __m128i ss0 = _mm_unpacklo_epi16(s06, s15);
+ const __m128i ss1 = _mm_unpackhi_epi16(s06, s15);
+ const __m128i ss2 = _mm_unpacklo_epi16(s24, s[3]);
+ const __m128i ss3 = _mm_unpackhi_epi16(s24, s[3]);
+ madds[0] = _mm_madd_epi16(ss0, filter[0]);
+ madds[1] = _mm_madd_epi16(ss1, filter[0]);
+ madds[2] = _mm_madd_epi16(ss2, filter[1]);
+ madds[3] = _mm_madd_epi16(ss3, filter[1]);
+ madds[0] = _mm_add_epi32(madds[0], madds[2]);
+ madds[1] = _mm_add_epi32(madds[1], madds[3]);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
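+
+// Tap7 exploits the even symmetry of the 7-tap Wiener kernel (c4 = c2,
+// c5 = c1, c6 = c0), folding mirrored samples before multiplying:
+//   sum = c0*(s0 + s6) + c1*(s1 + s5) + c2*(s2 + s4) + c3*s3
+// so eight output pixels cost four _mm_madd_epi16 ops rather than seven
+// full multiplies.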
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i filter =
+ _mm_shuffle_epi8(coefficients, _mm_set1_epi32(0x05040302));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[5], madds[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ const __m128i s04 = _mm_add_epi16(s[0], s[4]);
+ const __m128i s13 = _mm_add_epi16(s[1], s[3]);
+ const __m128i s2d = _mm_add_epi16(s[2], s[2]);
+ const __m128i s0m = _mm_sub_epi16(s04, s2d);
+ const __m128i s1m = _mm_sub_epi16(s13, s2d);
+ const __m128i ss0 = _mm_unpacklo_epi16(s0m, s1m);
+ const __m128i ss1 = _mm_unpackhi_epi16(s0m, s1m);
+ madds[0] = _mm_madd_epi16(ss0, filter);
+ madds[1] = _mm_madd_epi16(ss1, filter);
+ const __m128i s2_lo = _mm_unpacklo_epi16(s[2], _mm_setzero_si128());
+ const __m128i s2_hi = _mm_unpackhi_epi16(s[2], _mm_setzero_si128());
+ const __m128i s2x128_lo = _mm_slli_epi32(s2_lo, 7);
+ const __m128i s2x128_hi = _mm_slli_epi32(s2_hi, 7);
+ madds[0] = _mm_add_epi32(madds[0], s2x128_lo);
+ madds[1] = _mm_add_epi32(madds[1], s2x128_hi);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
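+
+// Tap5 uses a normalization trick: the Wiener taps sum to
+// 1 << kWienerFilterBits = 128, and with the outermost tap zero the center
+// tap is c = 128 - 2*a - 2*b for outer taps a and b. Substituting gives
+//   a*(s0 + s4) + b*(s1 + s3) + c*s2
+//     == a*(s0 + s4 - 2*s2) + b*(s1 + s3 - 2*s2) + (s2 << 7)
+// which is exactly the subtract-then-add-back sequence above and avoids
+// handling the center coefficient separately.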
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const auto filter = _mm_shuffle_epi32(coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[3], madds[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ const __m128i s02 = _mm_add_epi16(s[0], s[2]);
+ const __m128i ss0 = _mm_unpacklo_epi16(s02, s[1]);
+ const __m128i ss1 = _mm_unpackhi_epi16(s02, s[1]);
+ madds[0] = _mm_madd_epi16(ss0, filter);
+ madds[1] = _mm_madd_epi16(ss1, filter);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m128i s = LoadUnaligned16(src + x);
+ const __m128i d = _mm_slli_epi16(s, 4);
+ StoreAligned16(*wiener_buffer + x, d);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m128i WienerVertical7(const __m128i a[4], const __m128i filter[4]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
+ const __m128i madd3 = _mm_madd_epi16(a[3], filter[3]);
+ const __m128i madd01 = _mm_add_epi32(madd0, madd1);
+ const __m128i madd23 = _mm_add_epi32(madd2, madd3);
+ const __m128i sum = _mm_add_epi32(madd01, madd23);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical5(const __m128i a[3], const __m128i filter[3]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
+ const __m128i madd01 = _mm_add_epi32(madd0, madd1);
+ const __m128i sum = _mm_add_epi32(madd01, madd2);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical3(const __m128i a[2], const __m128i filter[2]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i sum = _mm_add_epi32(madd0, madd1);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVerticalClip(const __m128i s[2]) {
+ const __m128i d = _mm_packus_epi32(s[0], s[1]);
+ return _mm_min_epu16(d, _mm_set1_epi16(1023));
+}
+
+inline __m128i WienerVerticalFilter7(const __m128i a[7],
+ const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[4], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm_unpacklo_epi16(a[4], a[5]);
+ b[3] = _mm_unpacklo_epi16(a[6], round);
+ c[0] = WienerVertical7(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm_unpackhi_epi16(a[4], a[5]);
+ b[3] = _mm_unpackhi_epi16(a[6], round);
+ c[1] = WienerVertical7(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter5(const __m128i a[5],
+ const __m128i filter[3]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[3], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm_unpacklo_epi16(a[4], round);
+ c[0] = WienerVertical5(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm_unpackhi_epi16(a[4], round);
+ c[1] = WienerVertical5(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter3(const __m128i a[3],
+ const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[2], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], round);
+ c[0] = WienerVertical3(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], round);
+ c[1] = WienerVertical3(b, filter);
+ return WienerVerticalClip(c);
+}
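+
+// In the three WienerVerticalFilter functions above, the rounding constant
+// rides along as an extra "tap": the last row is unpacked against |round| and
+// matched with a coefficient pair (c, 1), built in the Tap functions below as
+//   _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]))
+// so a single _mm_madd_epi16 produces c * a_last + round in one step.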
+
+inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[7]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[3], __m128i a[5]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[3]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = LoadLo8(coefficients);
+ __m128i filter[4];
+ filter[0] = _mm_shuffle_epi32(c, 0x0);
+ filter[1] = _mm_shuffle_epi32(c, 0x55);
+ filter[2] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+ filter[3] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[8], d[2];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ a[7] = LoadAligned16(wiener_buffer + x + 7 * width);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[7];
+ const __m128i d =
+ WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = LoadLo8(coefficients);
+ __m128i filter[3];
+ filter[0] = _mm_shuffle_epi32(c, 0x0);
+ filter[1] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+ filter[2] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[6], d[2];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ a[5] = LoadAligned16(wiener_buffer + x + 5 * width);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[5];
+ const __m128i d =
+ WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ __m128i filter[2];
+ filter[0] = _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ filter[1] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[4], d[2];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ a[3] = LoadAligned16(wiener_buffer + x + 3 * width);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[3];
+ const __m128i d =
+ WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint16_t* const dst) {
+ const __m128i a = LoadAligned16(wiener_buffer);
+ const __m128i b = _mm_add_epi16(a, _mm_set1_epi16(8));
+ const __m128i c = _mm_srai_epi16(b, 4);
+ const __m128i d = _mm_max_epi16(c, _mm_setzero_si128());
+ const __m128i e = _mm_min_epi16(d, _mm_set1_epi16(1023));
+ StoreAligned16(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint16_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 8;
+ } while (x < width);
+ }
+}
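+
+// Tap1 is the identity path: the horizontal Tap1 pass stored s << 4, so the
+// kernel above reverses it with the rounding shift (a + 8) >> 4 and clamps to
+// the 10-bit range [0, 1023].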
+
+void WienerFilter_SSE4_1(
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to fit in int16_t before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+ // Horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ const __m128i coefficients_horizontal =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
+ }
+
+ // Vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint16_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+ // Because the top row of |source| is a duplicate of the second row, and
+ // the bottom row of |source| is a duplicate of the row above it, we can
+ // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 4;
+constexpr int kOverreadInBytesPass2 = 8;
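+// With 2-byte (10-bit) pixels the worst case is width % 8 == 0: Pass 1
+// overreads (8 - 2 * 3) * 2 = 4 bytes and Pass 2 overreads
+// (8 - 2 * 2) * 2 = 8 bytes, matching the constants above.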
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
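+// The *Msan() load wrappers forward the number of bytes that may be read past
+// the last valid element so that MemorySanitizer builds can treat the
+// over-read lanes as initialized; otherwise they behave like the plain loads.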
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+ dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+ const ptrdiff_t border, __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
+ dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+ LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
+ StoreAligned32U32(dst + 0, src + 0);
+ StoreAligned32U32(dst + 8, src + 2);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers generate very inefficient code for them, which
+// can make the whole decoder up to 15% slower.
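+// The helper names follow the NEON convention: Vaddl* widens both operands
+// before adding, Vaddw* widens only the second operand, Vmull* produces a
+// widened product, and Vrshr* performs a rounding right shift by adding
+// 1 << (shift - 1) before shifting.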
+
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VmullNLo8(const __m128i src0, const int src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullNHi8(const __m128i src0, const int src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrU16(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1)));
+ return _mm_srli_epi16(sum, src1);
+}
+
+inline __m128i VrshrS32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
+inline void Square(const __m128i src, __m128i dst[2]) {
+ const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
+ dst[0] = _mm_madd_epi16(s0, s0);
+ dst[1] = _mm_madd_epi16(s1, s1);
+}
+
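+// The Prepare*() helpers build overlapping windows over a register pair:
+// dst[k] starts k elements (plus the template |offset| bytes for Prepare3_8)
+// into the concatenation of src[0] and src[1], so an n-tap horizontal sum
+// becomes n vertical adds.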
+template <int offset>
+inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+ dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 4);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_32(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 12);
+ dst[4] = src[1];
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline __m128i Sum3_32(const __m128i src[3]) {
+ return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+ const __m128i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m128i Sum3WHi16(const __m128i src[3]) {
+ const __m128i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline __m128i Sum5_32(const __m128i src[5]) {
+ return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum3Horizontal16(const __m128i src[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ return Sum3_16(s);
+}
+
+inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum3_32(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum3_32(s);
+}
+
+inline __m128i Sum5Horizontal16(const __m128i src[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ return Sum5_16(s);
+}
+
+inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_32(src + 0, s);
+ dst[0] = Sum5_32(s);
+ Prepare5_32(src + 1, s);
+ dst[1] = Sum5_32(s);
+}
+
+void SumHorizontal16(const __m128i src[2], __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum04 = _mm_add_epi16(s[0], s[4]);
+ *row3 = Sum3_16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal16(const __m128i src[3], __m128i* const row3_0,
+ __m128i* const row3_1, __m128i* const row5_0,
+ __m128i* const row5_1) {
+ SumHorizontal16(src + 0, row3_0, row5_0);
+ SumHorizontal16(src + 1, row3_1, row5_1);
+}
+
+void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = _mm_add_epi32(src[0], src[4]);
+ *row_sq3 = Sum3_32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_32(src + 0, s);
+ SumHorizontal32(s, row_sq3_0, row_sq5_0);
+ Prepare5_32(src + 1, s);
+ SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
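+// Sum343* and Sum565* compute the weighted sums 3a + 4b + 3c and
+// 5a + 6b + 5c of a 3-element window: the plain 3-tap sum times 3 (or 5),
+// plus the center element once more.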
+inline __m128i Sum343Lo(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WLo16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343Hi(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WHi16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343(const __m128i src[3]) {
+ const __m128i sum = Sum3_32(src);
+ const __m128i sum3 = Sum3_32(sum, sum, sum);
+ return _mm_add_epi32(sum3, src[1]);
+}
+
+inline void Sum343(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum343(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum343(s);
+}
+
+inline __m128i Sum565Lo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m128i Sum565Hi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m128i Sum565(const __m128i src[3]) {
+ const __m128i sum = Sum3_32(src);
+ const __m128i sum4 = _mm_slli_epi32(sum, 2);
+ const __m128i sum5 = _mm_add_epi32(sum4, sum);
+ return _mm_add_epi32(sum5, src[1]);
+}
+
+inline void Sum565(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum565(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum565(s);
+}
+
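+// BoxSum() computes the 3x1 and 5x1 horizontal sums (and the corresponding
+// sums of squares) for two rows, consuming 16 pixels per iteration and
+// carrying the last loaded vector across iterations.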
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src) * width;
+ int y = 2;
+ do {
+ __m128i s[3], sq[6];
+ s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
+ Square(s[0], sq);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
+ s[1] = LoadUnaligned16Msan(
+ src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+ x -= 16;
+ src += 16;
+ s[2] = LoadUnaligned16Msan(
+ src, overread_in_bytes + sizeof(*src) * (sum_width - x));
+ Square(s[1], sq + 2);
+ Square(s[2], sq + 4);
+ SumHorizontal16(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned32U16(sum3, row3);
+ StoreAligned32U16(sum5, row5);
+ SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+ &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 0, row_sq3);
+ StoreAligned32U32(square_sum5 + 0, row_sq5);
+ SumHorizontal32(sq + 2, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+ &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 8, row_sq3);
+ StoreAligned32U32(square_sum5 + 8, row_sq5);
+ s[0] = s[2];
+ sq[0] = sq[4];
+ sq[1] = sq[5];
+ sum3 += 16;
+ sum5 += 16;
+ square_sum3 += 16;
+ square_sum5 += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sum3 += sum_stride - sum_width;
+ sum5 += sum_stride - sum_width;
+ square_sum3 += sum_stride - sum_width;
+ square_sum5 += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ const ptrdiff_t overread_in_bytes =
+ ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) -
+ sizeof(*src) * width;
+ int y = 2;
+ do {
+ __m128i s[3], sq[6];
+ s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
+ Square(s[0], sq);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row[2], row_sq[4];
+ s[1] = LoadUnaligned16Msan(
+ src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+ x -= 16;
+ src += 16;
+ s[2] = LoadUnaligned16Msan(
+ src, overread_in_bytes + sizeof(*src) * (sum_width - x));
+ Square(s[1], sq + 2);
+ Square(s[2], sq + 4);
+ if (size == 3) {
+ row[0] = Sum3Horizontal16(s + 0);
+ row[1] = Sum3Horizontal16(s + 1);
+ Sum3Horizontal32(sq + 0, row_sq + 0);
+ Sum3Horizontal32(sq + 2, row_sq + 2);
+ } else {
+ row[0] = Sum5Horizontal16(s + 0);
+ row[1] = Sum5Horizontal16(s + 1);
+ Sum5Horizontal32(sq + 0, row_sq + 0);
+ Sum5Horizontal32(sq + 2, row_sq + 2);
+ }
+ StoreAligned32U16(sums, row);
+ StoreAligned64U32(square_sums, row_sq);
+ s[0] = s[2];
+ sq[0] = sq[4];
+ sq[1] = sq[5];
+ sums += 16;
+ square_sums += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sums += sum_stride - sum_width;
+ square_sums += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+ // _mm_mullo_epi32() has high latency, so use shifts and additions instead.
+ // Some compilers could do this for us, but we make it explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
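+ // n == 9: sum_sq * 9 == sum_sq + (sum_sq << 3). For n == 25, also add
+ // (sum_sq << 4): 1 + 8 + 16 == 25.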
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i b = VrshrU16(sum, 2);
+ const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+ return _mm_packus_epi32(z0, z1);
+}
+
+inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
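+ // Multiplying by one_over_n_quarter (41) and shifting by 2 bits less is
+ // equivalent to multiplying by one_over_n (164), but keeps each
+ // ma * one_over_n_quarter product within 16 bits for _mm_maddubs_epi16().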
+ const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+ const __m128i m0 = VmullLo16(m, sum);
+ const __m128i m1 = VmullHi16(m, sum);
+ b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+ b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i b[2]) {
+ static_assert(n == 9 || n == 25, "");
+ static_assert(offset == 0 || offset == 8, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+ // |temp| is not actually stored and reloaded; the compiler keeps it in a
+ // 64-bit general-purpose register, which is faster than using
+ // _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ // offset == 0 is assumed to be the first call to this function. The value
+ // is mov'd to avoid -Wuninitialized warnings under gcc. mov should be at
+ // least as fast as pinsrb, if not faster.
+ if (offset == 0) {
+ *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+ } else {
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+ }
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
+ // b = ma * |sum| * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ __m128i maq;
+ if (offset == 0) {
+ maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ } else {
+ maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ }
+ if (n == 9) {
+ CalculateB3(sum, maq, b);
+ } else {
+ CalculateB5(sum, maq, b);
+ }
+}
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction or from the sign bit of the index.
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+ __m128i mask;
+ mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+ mask = _mm_or_si128(mask, index);
+ return _mm_shuffle_epi8(table, mask);
+}
+
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+ const int threshold) {
+ const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+ const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+ return _mm_add_epi8(value, offset);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i* const ma, __m128i b0[2],
+ __m128i b1[2]) {
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+ const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+ const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+ const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+ __m128i idx;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+ // All elements whose indices are less than 48 are set to 0.
+ // Get shuffle results for indices in range [0, 15].
+ *ma = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ *ma = _mm_or_si128(*ma, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res2 = ShuffleIndex(c2, idx);
+ *ma = _mm_or_si128(*ma, res2);
+
+ // For elements whose indices are larger than 47, the table values change
+ // only rarely as the index increases, so we use comparisons and arithmetic
+ // operations to calculate them.
+ // Add -128 to apply signed comparison instructions.
+ idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+ // Elements whose indices are larger than 47 (which are 0 after the shuffles
+ // above) are set to 5.
+ *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+ *ma = AdjustValue(*ma, idx, 55); // 55 is the last index whose value is 5.
+ *ma = AdjustValue(*ma, idx, 72); // 72 is the last index whose value is 4.
+ *ma = AdjustValue(*ma, idx, 101); // 101 is the last index whose value is 3.
+ *ma = AdjustValue(*ma, idx, 169); // 169 is the last index whose value is 2.
+ *ma = AdjustValue(*ma, idx, 254); // 254 is the last index whose value is 1.
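+ // For example, index 60: the shuffles above yield 0, the max sets 5, and
+ // only the first comparison (60 > 55) fires, leaving ma = 5 - 1 = 4.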
+
+ // b = ma * |sum| * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ CalculateB3(sum[0], maq0, b0);
+ const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ CalculateB3(sum[1], maq1, b1);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i ma[2], __m128i b[4]) {
+ __m128i mas;
+ CalculateIntermediate(sum, index, &mas, b + 0, b + 2);
+ ma[0] = _mm_unpacklo_epi64(ma[0], mas);
+ ma[1] = _mm_srli_si128(mas, 8);
+}
+
+// Note: Replacing the slow LookupIntermediate() with CalculateIntermediate()
+// when calculating 16 intermediate data points has been tried, but the
+// compiler generated even slower code.
+template <int offset>
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i b[2]) {
+ static_assert(offset == 0 || offset == 8, "");
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i b[2]) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
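+// Store343_444() keeps both running column sums for pass 2: the '444' sum
+// 4 * (b0 + b1 + b2) and the '343' sum 3 * (b0 + b1 + b2) + b1.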
+inline void Store343_444(const __m128i b3[3], const ptrdiff_t x,
+ __m128i sum_b343[2], __m128i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m128i b[3], sum_b111[2];
+ Prepare3_32(b3 + 0, b);
+ sum_b111[0] = Sum3_32(b);
+ sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
+ sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[0] = _mm_add_epi32(sum_b343[0], b[1]);
+ Prepare3_32(b3 + 1, b);
+ sum_b111[1] = Sum3_32(b);
+ sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
+ sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[1] = _mm_add_epi32(sum_b343[1], b[1]);
+ StoreAligned32U32(b444 + x, sum_b444);
+ StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[3],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[3],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
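+// The BoxFilterPreProcess*() helpers compute and store the horizontal sums
+// for the two newest rows (ring indices 3 and 4), load the older rows' sums,
+// and combine all rows into |ma| and |b| for the current columns. The *Lo
+// variants handle the first 8 columns of a row.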
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][4], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s5[2][5], sq5[5][2];
+ Square(s[0][1], sq[0] + 2);
+ Square(s[1][1], sq[1] + 2);
+ s5[0][3] = Sum5Horizontal16(s[0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal16(s[1]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5Horizontal32(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5Horizontal32(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const __m128i s[2][4], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma[2],
+ __m128i b[6]) {
+ __m128i s5[2][5], sq5[5][2];
+ Square(s[0][2], sq[0] + 4);
+ Square(s[1][2], sq[1] + 4);
+ s5[0][3] = Sum5Horizontal16(s[0] + 1);
+ s5[1][3] = Sum5Horizontal16(s[0] + 2);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ s5[0][4] = Sum5Horizontal16(s[1] + 1);
+ s5[1][4] = Sum5Horizontal16(s[1] + 2);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ Sum5Horizontal32(sq[0] + 2, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ Sum5Horizontal32(sq[1] + 2, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+ Square(s[0][3], sq[0] + 6);
+ Square(s[1][3], sq[1] + 6);
+ Sum5Horizontal32(sq[0] + 4, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ Sum5Horizontal32(sq[1] + 4, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s5[5], sq5[5][2];
+ Square(s[1], sq + 2);
+ s5[3] = s5[4] = Sum5Horizontal16(s);
+ Sum5Horizontal32(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma[2],
+ __m128i b[6]) {
+ __m128i s5[2][5], sq5[5][2];
+ Square(s[2], sq + 4);
+ s5[0][3] = Sum5Horizontal16(s + 1);
+ s5[1][3] = Sum5Horizontal16(s + 2);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5Horizontal32(sq + 2, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+ Square(s[3], sq + 6);
+ Sum5Horizontal32(sq + 4, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s3[3], sq3[3][2];
+ Square(s[1], sq + 2);
+ s3[2] = Sum3Horizontal16(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3Horizontal32(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const __m128i s[4], const ptrdiff_t x, const ptrdiff_t sum_width,
+ const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[8], __m128i ma[2],
+ __m128i b[6]) {
+ __m128i s3[4], sq3[3][2], sum[2], index[2];
+ Square(s[2], sq + 4);
+ s3[2] = Sum3Horizontal16(s + 1);
+ s3[3] = Sum3Horizontal16(s + 2);
+ StoreAligned32U16(sum3[2] + x, s3 + 2);
+ Sum3Horizontal32(sq + 2, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+ LoadAligned16x2U16(sum3, x, s3);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ Square(s[3], sq + 6);
+ Sum3Horizontal32(sq + 4, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma, b + 2);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][2],
+ __m128i b3[2][6], __m128i* const ma5, __m128i b5[2]) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+ Square(s[0][1], sq[0] + 2);
+ Square(s[1][1], sq[1] + 2);
+ SumHorizontal16(s[0], &s3[2], &s5[3]);
+ SumHorizontal16(s[1], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+ CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
+ ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const __m128i s[2][4], const ptrdiff_t x, const uint16_t scales[2],
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m128i sq[2][8], __m128i ma3[2][2],
+ __m128i b3[2][6], __m128i ma5[2], __m128i b5[6]) {
+ __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
+ SumHorizontal16(s[0] + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ StoreAligned16(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned16(sum3[2] + x + 8, s3[1][2]);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ SumHorizontal16(s[1] + 1, &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+ StoreAligned16(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned16(sum3[3] + x + 8, s3[1][3]);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ Square(s[0][2], sq[0] + 4);
+ Square(s[1][2], sq[1] + 4);
+ SumHorizontal32(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ SumHorizontal32(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+ &index[1][0]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], b5 + 2);
+
+ Square(s[0][3], sq[0] + 6);
+ Square(s[1][3], sq[1] + 6);
+ SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+ &index[1][1]);
+ CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 2);
+ CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 2);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], b5 + 4);
+}
+
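+// The *LastRow variants process the final row, where no further source row
+// exists: the newest row sums are duplicated (s5[4] = s5[3],
+// sq5[4] = sq5[3]) instead of being computed from a next row.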
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3,
+ __m128i* const ma5, __m128i b3[2], __m128i b5[2]) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ Square(s[1], sq + 2);
+ SumHorizontal16(s, &s3[2], &s5[3]);
+ SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma3[2],
+ __m128i ma5[2], __m128i b3[6], __m128i b5[6]) {
+ __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
+ Square(s[2], sq + 4);
+ SumHorizontal16(s + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal32(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 2);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+ Square(s[3], sq + 6);
+ SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 4);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma3, b3 + 2);
+}
+
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+ const uint16_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[2][4], mas[2], sq[2][8], bs[6];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ __m128i ma5[3], ma[2], b[4];
+ s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned32U16(ma565, ma);
+ Sum565(bs + 0, b + 0);
+ Sum565(bs + 2, b + 2);
+ StoreAligned64U32(b565, b);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint16_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass2 - sizeof(*src) * width;
+ __m128i s[4], mas[2], sq[8], bs[6];
+ s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ s[2] = LoadUnaligned16Msan(src + x + 16,
+ overread_in_bytes + sizeof(*src) * (x + 16));
+ s[3] = LoadUnaligned16Msan(src + x + 24,
+ overread_in_bytes + sizeof(*src) * (x + 24));
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 2, 8, ma343, ma444, b343, b444);
+ ma444 += 16;
+ b444 += 16;
+ } else {
+ __m128i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned32U16(ma343, ma);
+ Sum343(bs + 0, b + 0);
+ Sum343(bs + 2, b + 2);
+ StoreAligned64U32(b343, b);
+ }
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ ma343 += 16;
+ b343 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint16_t* const src0, const uint16_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+ uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+ uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], b5);
+
+ int x = 0;
+ do {
+ __m128i ma[2], b[4], ma3x[3], ma5x[3];
+ s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+
+ Prepare3_8<0>(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned32U16(ma343[0] + x, ma);
+ Sum343(b3[0] + 0, b + 0);
+ Sum343(b3[0] + 2, b + 2);
+ StoreAligned64U32(b343[0] + x, b);
+ Sum565(b5 + 0, b + 0);
+ Sum565(b5 + 2, b + 2);
+ StoreAligned64U32(b565, b);
+ Prepare3_8<0>(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444Hi(ma3x, b3[1] + 2, x + 8, ma343[1], ma444, b343[1], b444);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned32U16(ma565, ma);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][4];
+ b3[0][1] = b3[0][5];
+ b3[1][0] = b3[1][4];
+ b3[1][1] = b3[1][5];
+ b5[0] = b5[4];
+ b5[1] = b5[5];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const __m128i v = _mm_sub_epi32(b, ma_x_src);
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
+ const __m128i b[2]) {
+ const __m128i ma_x_src_lo = VmullLo16(ma, src);
+ const __m128i ma_x_src_hi = VmullHi16(ma, src);
+ const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits
+}
+
+inline __m128i CalculateFilteredOutputPass1(const __m128i src,
+ const __m128i ma[2],
+ const __m128i b[2][2]) {
+ const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
+ __m128i b_sum[2];
+ b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i CalculateFilteredOutputPass2(const __m128i src,
+ const __m128i ma[3],
+ const __m128i b[3][2]) {
+ const __m128i ma_sum = Sum3_16(ma);
+ __m128i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
+ const __m128i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
+ return _mm_add_epi16(src, vv);
+}
+
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+ const __m128i filter[2], const int w0,
+ const int w2) {
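+ // Pack (w0, w2) into each 32-bit lane and interleave the two filter
+ // outputs so one _mm_madd_epi16() computes w0 * filter[0] + w2 * filter[1]
+ // per pixel.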
+ __m128i v[2];
+ const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+ const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
+ const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
+ v[0] = _mm_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
+
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+ const __m128i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m128i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
+inline void ClipAndStore(uint16_t* const dst, const __m128i val) {
+ const __m128i val0 = _mm_max_epi16(val, _mm_setzero_si128());
+ const __m128i val1 = _mm_min_epi16(val0, _mm_set1_epi16(1023));
+ StoreAligned16(dst, val1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint16_t* const src, const uint16_t* const src0,
+ const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[2][4], mas[2], sq[2][8], bs[6];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ __m128i ma[2], ma5[3], b[2][2], p[2];
+ s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ StoreAligned16(ma565[1] + x, ma[1]);
+ Sum565(bs, b[1]);
+ StoreAligned32U32(b565[1] + x, b[1]);
+ const __m128i sr0_lo = LoadAligned16(src + x + 0);
+ const __m128i sr1_lo = LoadAligned16(src + stride + x + 0);
+ ma[0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+ const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned16(ma565[1] + x + 8, ma[1]);
+ Sum565(bs + 2, b[1]);
+ StoreAligned32U32(b565[1] + x + 8, b[1]);
+ const __m128i sr0_hi = LoadAligned16(src + x + 8);
+ const __m128i sr1_hi = LoadAligned16(src + stride + x + 8);
+ ma[0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+ ClipAndStore(dst + x + 0, d00);
+ ClipAndStore(dst + x + 8, d01);
+ const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+ ClipAndStore(dst + stride + x + 0, d10);
+ ClipAndStore(dst + stride + x + 8, d11);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[4], mas[2], sq[8], bs[6];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ __m128i ma[2], ma5[3], b[2][2];
+ s[2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+ sq, mas, bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ Sum565(bs, b[1]);
+ ma[0] = LoadAligned16(ma565);
+ LoadAligned32U32(b565, b[0]);
+ const __m128i sr_lo = LoadAligned16(src + x + 0);
+ __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
+
+ ma[1] = Sum565Hi(ma5);
+ Sum565(bs + 2, b[1]);
+ ma[0] = LoadAligned16(ma565 + 8);
+ LoadAligned32U32(b565 + 8, b[0]);
+ const __m128i sr_hi = LoadAligned16(src + x + 8);
+ p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 8, d1);
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass2 - sizeof(*src0) * width;
+ __m128i s[4], mas[2], sq[8], bs[6];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ s[2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma[3], b[3][2], ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const __m128i sr_lo = LoadAligned16(src + x + 0);
+ ma[0] = LoadAligned16(ma343[0] + x);
+ ma[1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[0]);
+ LoadAligned32U32(b444[0] + x, b[1]);
+ const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+ Store343_444Hi(ma3, bs + 2, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m128i sr_hi = LoadAligned16(src + x + 8);
+ ma[0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1]);
+ const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 8, d1);
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint16_t* const src, const uint16_t* const src0,
+ const uint16_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], b5);
+
+ int x = 0;
+ do {
+ __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+ s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+ Prepare3_8<0>(ma3[0], ma3x[0]);
+ Prepare3_8<0>(ma3[1], ma3x[1]);
+ Prepare3_8<0>(ma5, ma5x);
+ Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565Lo(ma5x);
+ StoreAligned16(ma565[1] + x, ma[0][1]);
+ Sum565(b5, b[0][1]);
+ StoreAligned32U32(b565[1] + x, b[0][1]);
+ const __m128i sr0_lo = LoadAligned16(src + x);
+ const __m128i sr1_lo = LoadAligned16(src + stride + x);
+ ma[0][0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x);
+ ma[1][1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[1][0]);
+ LoadAligned32U32(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ma[2][0] = LoadAligned16(ma343[1] + x);
+ LoadAligned32U32(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+ Store343_444Hi(ma3x[0], b3[0] + 2, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+ b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Hi(ma3x[1], b3[1] + 2, x + 8, &ma[2][2], b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ ma[0][1] = Sum565Hi(ma5x);
+ StoreAligned16(ma565[1] + x + 8, ma[0][1]);
+ Sum565(b5 + 2, b[0][1]);
+ StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+ const __m128i sr0_hi = LoadAligned16(src + x + 8);
+ const __m128i sr1_hi = LoadAligned16(src + stride + x + 8);
+ ma[0][0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1][1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ ClipAndStore(dst + x + 0, d00);
+ ClipAndStore(dst + x + 8, d01);
+ ma[2][0] = LoadAligned16(ma343[1] + x + 8);
+ LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+ const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ ClipAndStore(dst + stride + x + 0, d10);
+ ClipAndStore(dst + stride + x + 8, d11);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][4];
+ b3[0][1] = b3[0][5];
+ b3[1][0] = b3[1][4];
+ b3[1][1] = b3[1][5];
+ b5[0] = b5[4];
+ b5[1] = b5[5];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[4], ma3[2], ma5[2], sq[8], b3[6], b5[6], ma[3], b[3][2];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+ sq, &ma3[0], &ma5[0], b3, b5);
+
+ int x = 0;
+ do {
+ __m128i ma3x[3], ma5x[3], p[2];
+ s[2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+ square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+ Prepare3_8<0>(ma3, ma3x);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[1] = Sum565Lo(ma5x);
+ Sum565(b5, b[1]);
+ ma[2] = Sum343Lo(ma3x);
+ Sum343(b3, b[2]);
+ const __m128i sr_lo = LoadAligned16(src + x + 0);
+ ma[0] = LoadAligned16(ma565 + x);
+ LoadAligned32U32(b565 + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned16(ma343 + x);
+ ma[1] = LoadAligned16(ma444 + x);
+ LoadAligned32U32(b343 + x, b[0]);
+ LoadAligned32U32(b444 + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ ma[1] = Sum565Hi(ma5x);
+ Sum565(b5 + 2, b[1]);
+ ma[2] = Sum343Hi(ma3x);
+ Sum343(b3 + 2, b[2]);
+ const __m128i sr_hi = LoadAligned16(src + x + 8);
+ ma[0] = LoadAligned16(ma565 + x + 8);
+ LoadAligned32U32(b565 + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ ma[0] = LoadAligned16(ma343 + x + 8);
+ ma[1] = LoadAligned16(ma444 + x + 8);
+ LoadAligned32U32(b343 + x + 8, b[0]);
+ LoadAligned32U32(b444 + x + 8, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 8, d1);
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ ma3[0] = ma3[1];
+ ma5[0] = ma5[1];
+ b3[0] = b3[4];
+ b3[1] = b3[5];
+ b5[0] = b5[4];
+ b5[1] = b5[5];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint16_t* src,
+ const ptrdiff_t stride, const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
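+  // |w2| is chosen so that the three weights sum to
+  // (1 << kSgrProjPrecisionBits), i.e. unity in fixed-point.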
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
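+  // Both passes are in use when this function is called (see
+  // SelfGuidedFilter_SSE4_1()), so both scales must be nonzero.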
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444[0], ma565[0],
+ b343, b444[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint16_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+ square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+ b444[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint16_t* src, const ptrdiff_t stride,
+ const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum5[1], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint16_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+ sum_width, scale, w0, sum5, square_sum5, ma565[0],
+ b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint16_t* src, const ptrdiff_t stride,
+ const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint16_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 16, up to 15 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output, as it
+// will not be part of the visible frame.
+void SelfGuidedFilter_SSE4_1(
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ auto* const dst = static_cast<uint16_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+ static_cast<void>(WienerFilter_SSE4_1);
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+ static_cast<void>(SelfGuidedFilter_SSE4_1);
+#endif
+}
+
+} // namespace
+
+void LoopRestorationInit10bpp_SSE4_1() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+inline void WienerHorizontalClip(const __m256i s[2], const __m256i s_3x128,
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit =
+ (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
+ const __m256i offsets = _mm256_set1_epi16(-offset);
+ const __m256i limits = _mm256_set1_epi16(limit - offset);
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsHorizontal - 1));
+ // The sum range here is [-128 * 255, 90 * 255].
+ const __m256i madd = _mm256_add_epi16(s[0], s[1]);
+ const __m256i sum = _mm256_add_epi16(madd, round);
+ const __m256i rounded_sum0 =
+ _mm256_srai_epi16(sum, kInterRoundBitsHorizontal);
+ // Add back scaled down offset correction.
+ const __m256i rounded_sum1 = _mm256_add_epi16(rounded_sum0, s_3x128);
+ const __m256i d0 = _mm256_max_epi16(rounded_sum1, offsets);
+ const __m256i d1 = _mm256_min_epi16(d0, limits);
+ StoreAligned32(wiener_buffer, d1);
+}
+
+// Using _mm256_alignr_epi8() is about 8% faster than loading all and
+// unpacking, because the compiler generates redundant code for the
+// load-and-unpack approach.
+inline void WienerHorizontalTap7Kernel(const __m256i s[2],
+ const __m256i filter[4],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9);
+ const auto s67 = _mm256_alignr_epi8(s[1], s[0], 13);
+ __m256i madds[4];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ madds[2] = _mm256_maddubs_epi16(s45, filter[2]);
+ madds[3] = _mm256_maddubs_epi16(s67, filter[3]);
+ madds[0] = _mm256_add_epi16(madds[0], madds[2]);
+ madds[1] = _mm256_add_epi16(madds[1], madds[3]);
+ const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s23, 8),
+ 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m256i s[2],
+ const __m256i filter[3],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9);
+ __m256i madds[3];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ madds[2] = _mm256_maddubs_epi16(s45, filter[2]);
+ madds[0] = _mm256_add_epi16(madds[0], madds[2]);
+ const __m256i s_3x128 = _mm256_srli_epi16(_mm256_slli_epi16(s23, 8),
+ kInterRoundBitsHorizontal + 1);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[2],
+ const __m256i filter[2],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ __m256i madds[2];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s01, 8),
+ 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[4];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0100));
+ filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
+ filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0102));
+ filter[3] = _mm256_shuffle_epi8(
+ coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8000)));
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap7Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap7Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[3];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0201));
+ filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0203));
+ filter[2] = _mm256_shuffle_epi8(
+ coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8001)));
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap5Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap5Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
+ filter[1] = _mm256_shuffle_epi8(
+ coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8002)));
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap3Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap3Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m256i s = LoadUnaligned32(src + x);
+ const __m256i s0 = _mm256_unpacklo_epi8(s, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi8(s, _mm256_setzero_si256());
+ __m256i d[2];
+ d[0] = _mm256_slli_epi16(s0, 4);
+ d[1] = _mm256_slli_epi16(s1, 4);
+ StoreAligned64(*wiener_buffer + x, d);
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m256i WienerVertical7(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum0 = _mm256_add_epi32(round, madd0);
+ const __m256i sum1 = _mm256_add_epi32(sum0, madd1);
+ return _mm256_srai_epi32(sum1, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical5(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum = _mm256_add_epi32(madd0, madd1);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a, const __m256i filter) {
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m256i madd = _mm256_madd_epi16(a, filter);
+ const __m256i sum = _mm256_add_epi32(round, madd);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+ const __m256i filter[2]) {
+ __m256i b[2];
+ const __m256i a06 = _mm256_add_epi16(a[0], a[6]);
+ const __m256i a15 = _mm256_add_epi16(a[1], a[5]);
+ const __m256i a24 = _mm256_add_epi16(a[2], a[4]);
+ b[0] = _mm256_unpacklo_epi16(a06, a15);
+ b[1] = _mm256_unpacklo_epi16(a24, a[3]);
+ const __m256i sum0 = WienerVertical7(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a06, a15);
+ b[1] = _mm256_unpackhi_epi16(a24, a[3]);
+ const __m256i sum1 = WienerVertical7(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[2];
+ const __m256i a04 = _mm256_add_epi16(a[0], a[4]);
+ const __m256i a13 = _mm256_add_epi16(a[1], a[3]);
+ b[0] = _mm256_unpacklo_epi16(a04, a13);
+ b[1] = _mm256_unpacklo_epi16(a[2], round);
+ const __m256i sum0 = WienerVertical5(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a04, a13);
+ b[1] = _mm256_unpackhi_epi16(a[2], round);
+ const __m256i sum1 = WienerVertical5(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3], const __m256i filter) {
+ __m256i b;
+ const __m256i a02 = _mm256_add_epi16(a[0], a[2]);
+ b = _mm256_unpacklo_epi16(a02, a[1]);
+ const __m256i sum0 = WienerVertical3(b, filter);
+ b = _mm256_unpackhi_epi16(a02, a[1]);
+ const __m256i sum1 = WienerVertical3(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[7]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[5]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter, __m256i a[3]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[8];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[6];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter, __m256i d[2]) {
+ __m256i a[4];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi32(c, 0x55);
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[7];
+ const __m256i d0 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastd_epi32(Load4(coefficients));
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(c, 0);
+ filter[1] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
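+  // |filter[1]| interleaves coefficients[2] with 1, so that after the
+  // (a[2], round) unpack in WienerVerticalFilter5() the madd computes
+  // a[2] * coefficients[2] + round in a single step.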
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[5];
+ const __m256i d0 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i filter =
+ _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[3];
+ const __m256i d0 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
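+// Rounding shift by 4 ((a + 8) >> 4), undoing the << 4 scaling applied in
+// WienerHorizontalTap1(), followed by packing with unsigned saturation.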
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint8_t* const dst) {
+ const __m256i a0 = LoadAligned32(wiener_buffer + 0);
+ const __m256i a1 = LoadAligned32(wiener_buffer + 16);
+ const __m256i b0 = _mm256_add_epi16(a0, _mm256_set1_epi16(8));
+ const __m256i b1 = _mm256_add_epi16(a1, _mm256_set1_epi16(8));
+ const __m256i c0 = _mm256_srai_epi16(b0, 4);
+ const __m256i c1 = _mm256_srai_epi16(b1, 4);
+ const __m256i d = _mm256_packus_epi16(c0, c1);
+ StoreUnaligned32(dst, d);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 32;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_AVX2(
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 32);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // Horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* const top = static_cast<const uint8_t*>(top_border);
+ const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+ const __m128i c =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ // In order to keep the horizontal pass intermediate values within 16 bits we
+ // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
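+  // (With kInterRoundBitsHorizontal == 3, the correction added back per pixel
+  // is (128 * pixel) >> 3 == pixel << 4; it is computed as |s_3x128| in the
+  // horizontal kernels.)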
+ __m128i c_horizontal =
+ _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
+ c_horizontal = _mm_packs_epi16(c_horizontal, c_horizontal);
+ const __m256i coefficients_horizontal = _mm256_broadcastd_epi32(c_horizontal);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
+ }
+
+  // Vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+constexpr int kSumOffset = 24;
+
+// SIMD over-reads (number of bytes in a SIMD register) - (width % 16) -
+// 2 * padding pixels, where padding is 3 for Pass 1 and 2 for Pass 2. The
+// number of bytes in a SIMD register is 16 for SSE4.1 and 32 for AVX2.
+constexpr int kOverreadInBytesPass1_128 = 10;
+constexpr int kOverreadInBytesPass2_128 = 12;
+constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16;
+constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16;
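+// For example, Pass 1 with 128-bit registers over-reads 16 - 2 * 3 = 10 bytes
+// and Pass 2 over-reads 16 - 2 * 2 = 12 bytes; the width-dependent term is
+// subtracted where these constants are used.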
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+}
+
+inline void LoadAligned32x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+ dst[2] = LoadAligned32(src[2] + x);
+}
+
+inline void LoadAligned32x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+ dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+ LoadAligned64(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+ LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate super inefficient code and the whole
+// decoder could be 15% slower.
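+//
+// The helpers below are named after the Arm NEON operations they mirror:
+// vaddl (widening add), vaddw (widen the second operand and add), vmull
+// (widening multiply) and vrshr (rounding shift right).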
+
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m256i VaddlLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(s0, s1);
+}
+
+inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m256i VaddlHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m256i VaddwLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(src0, s1);
+}
+
+inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m256i VaddwHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(src0, s1);
+}
+
+inline __m256i VmullNLo8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m256i VmullNHi8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m256i VrshrS32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
+inline __m256i VrshrU32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srli_epi32(sum, src1);
+}
+
+inline __m128i SquareLo8(const __m128i src) {
+ const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline __m256i SquareLo8(const __m256i src) {
+ const __m256i s = _mm256_unpacklo_epi8(src, _mm256_setzero_si256());
+ return _mm256_mullo_epi16(s, s);
+}
+
+inline __m128i SquareHi8(const __m128i src) {
+ const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline __m256i SquareHi8(const __m256i src) {
+ const __m256i s = _mm256_unpackhi_epi8(src, _mm256_setzero_si256());
+ return _mm256_mullo_epi16(s, s);
+}
+
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+}
+
+inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = _mm256_alignr_epi8(src[1], src[0], 0);
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 1);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_16(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+ dst[3] = _mm_srli_si128(src, 3);
+ dst[4] = _mm_srli_si128(src, 4);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m256i src[2], __m256i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm256_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm256_alignr_epi8(src[1], src[0], 8);
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m256i Sum3_16(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi16(src0, src1);
+ return _mm256_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_16(const __m256i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline __m256i Sum3_32(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi32(src0, src1);
+ return _mm256_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+ const __m128i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WLo16(const __m256i src[3]) {
+ const __m256i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WHi16(const __m256i src[3]) {
+ const __m256i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+ const __m128i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
+}
+
+inline __m256i Sum3WLo32(const __m256i src[3]) {
+ const __m256i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
+}
+
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+ const __m128i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
+}
+
+inline __m256i Sum3WHi32(const __m256i src[3]) {
+ const __m256i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m256i Sum5_16(const __m256i src[5]) {
+ const __m256i sum01 = _mm256_add_epi16(src[0], src[1]);
+ const __m256i sum23 = _mm256_add_epi16(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return _mm256_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1,
+ const __m256i* const src2, const __m256i* const src3,
+ const __m256i* const src4) {
+ const __m256i sum01 = _mm256_add_epi32(*src0, *src1);
+ const __m256i sum23 = _mm256_add_epi32(*src2, *src3);
+ const __m256i sum = _mm256_add_epi32(sum01, sum23);
+ return _mm256_add_epi32(sum, *src4);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
+ const __m128i sum01 = VaddlLo8(src[0], src[1]);
+ const __m128i sum23 = VaddlLo8(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, src[4]);
+}
+
+inline __m256i Sum5WLo16(const __m256i src[5]) {
+ const __m256i sum01 = VaddlLo8(src[0], src[1]);
+ const __m256i sum23 = VaddlLo8(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, src[4]);
+}
+
+inline __m256i Sum5WHi16(const __m256i src[5]) {
+ const __m256i sum01 = VaddlHi8(src[0], src[1]);
+ const __m256i sum23 = VaddlHi8(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return VaddwHi8(sum, src[4]);
+}
+
+inline __m128i Sum3Horizontal(const __m128i src) {
+ __m128i s[3];
+ Prepare3Lo8(src, s);
+ return Sum3WLo16(s);
+}
+
+inline void Sum3Horizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes, __m256i dst[2]) {
+ __m256i s[3];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
+}
+
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
+}
+
+inline void Sum3WHorizontal(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
+}
+
+inline __m128i Sum5Horizontal(const __m128i src) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ return Sum5WLo16(s);
+}
+
+inline void Sum5Horizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const dst0, __m256i* const dst1) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
+}
+
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+inline void Sum5WHorizontal(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[5];
+ Prepare5_16(src, s);
+ const __m256i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m256i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m256i sum0123_lo = _mm256_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m256i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m256i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m256i sum0123_hi = _mm256_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+inline void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontalLo(const __m256i src[5], __m256i* const row_sq3,
+ __m256i* const row_sq5) {
+ const __m256i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontalHi(const __m256i src[5], __m256i* const row_sq3,
+ __m256i* const row_sq5) {
+ const __m256i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontalLo(const __m128i src, __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ const __m128i sum04 = VaddlLo8(s[0], s[4]);
+ *row3 = Sum3WLo16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const row3_0, __m256i* const row3_1,
+ __m256i* const row5_0, __m256i* const row5_1) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4);
+ const __m256i sum04_lo = VaddlLo8(s[0], s[4]);
+ const __m256i sum04_hi = VaddlHi8(s[0], s[4]);
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = _mm256_add_epi16(sum04_lo, *row3_0);
+ *row5_1 = _mm256_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
+inline void SumHorizontal(const __m256i src[2], __m256i* const row_sq3_0,
+ __m256i* const row_sq3_1, __m256i* const row_sq5_0,
+ __m256i* const row_sq5_1) {
+ __m256i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
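+// The 343 functions below apply the column weights (3, 4, 3): three times
+// the horizontal sum plus one extra copy of the center column.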
+inline __m256i Sum343Lo(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WLo16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343Hi(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WHi16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343WLo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo32(src);
+ const __m256i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwLo16(sum3, src[1]);
+}
+
+inline __m256i Sum343WHi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi32(src);
+ const __m256i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum343WLo(s);
+ dst[1] = Sum343WHi(s);
+}
+
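+// The 565 functions below apply the column weights (5, 6, 5): five times the
+// horizontal sum plus one extra copy of the center column.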
+inline __m256i Sum565Lo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m256i Sum565Hi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m256i Sum565WLo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo32(src);
+ const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+ const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+ return VaddwLo16(sum5, src[1]);
+}
+
+inline __m256i Sum565WHi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi32(src);
+ const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+ const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+ return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum565WLo(s);
+ dst[1] = Sum565WHi(s);
+}
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ int y = 2;
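+ // Process two rows: each row handles its first 8 pixels with 128-bit
+ // loads/stores, then the remainder in 32-pixel chunks with 256-bit
+ // loads/stores.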
+ do {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src, kOverreadInBytesPass1_128 - width);
+ __m128i sq_128[2], s3, s5, sq3[2], sq5[2];
+ __m256i sq[3];
+ sq_128[0] = SquareLo8(s0);
+ sq_128[1] = SquareHi8(s0);
+ SumHorizontalLo(s0, &s3, &s5);
+ StoreAligned16(sum3, s3);
+ StoreAligned16(sum5, s5);
+ SumHorizontal(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]);
+ StoreAligned32U32(square_sum3, sq3);
+ StoreAligned32U32(square_sum5, sq5);
+ src += 8;
+ sum3 += 8;
+ sum5 += 8;
+ square_sum3 += 8;
+ square_sum5 += 8;
+ sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i row3[2], row5[2], row_sq3[2], row_sq5[2];
+ const __m256i s = LoadUnaligned32Msan(
+ src + 8, sum_width - x + 16 + kOverreadInBytesPass1_256 - width);
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ SumHorizontal(src, sum_width - x + 8 + kOverreadInBytesPass1_256 - width,
+ &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned64(sum3, row3);
+ StoreAligned64(sum5, row5);
+ SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned64(square_sum3 + 0, row_sq3);
+ StoreAligned64(square_sum5 + 0, row_sq5);
+ SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned64(square_sum3 + 16, row_sq3);
+ StoreAligned64(square_sum5 + 16, row_sq5);
+ sq[0] = sq[2];
+ src += 32;
+ sum3 += 32;
+ sum5 += 32;
+ square_sum3 += 32;
+ square_sum5 += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sum3 += sum_stride - sum_width - 8;
+ sum5 += sum_stride - sum_width - 8;
+ square_sum3 += sum_stride - sum_width - 8;
+ square_sum5 += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
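+ // |size| is a template parameter, so these branches and the ones below
+ // resolve at compile time.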
+ int kOverreadInBytes_128, kOverreadInBytes_256;
+ if (size == 3) {
+ kOverreadInBytes_128 = kOverreadInBytesPass2_128;
+ kOverreadInBytes_256 = kOverreadInBytesPass2_256;
+ } else {
+ kOverreadInBytes_128 = kOverreadInBytesPass1_128;
+ kOverreadInBytes_256 = kOverreadInBytesPass1_256;
+ }
+ int y = 2;
+ do {
+ const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytes_128 - width);
+ __m128i ss, sq_128[2], sqs[2];
+ __m256i sq[3];
+ sq_128[0] = SquareLo8(s);
+ sq_128[1] = SquareHi8(s);
+ if (size == 3) {
+ ss = Sum3Horizontal(s);
+ Sum3WHorizontal(sq_128, sqs);
+ } else {
+ ss = Sum5Horizontal(s);
+ Sum5WHorizontal(sq_128, sqs);
+ }
+ StoreAligned16(sums, ss);
+ StoreAligned32U32(square_sums, sqs);
+ src += 8;
+ sums += 8;
+ square_sums += 8;
+ sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i row[2], row_sq[4];
+ const __m256i s = LoadUnaligned32Msan(
+ src + 8, sum_width - x + 16 + kOverreadInBytes_256 - width);
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ if (size == 3) {
+ Sum3Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+ row);
+ Sum3WHorizontal(sq + 0, row_sq + 0);
+ Sum3WHorizontal(sq + 1, row_sq + 2);
+ } else {
+ Sum5Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+ &row[0], &row[1]);
+ Sum5WHorizontal(sq + 0, row_sq + 0);
+ Sum5WHorizontal(sq + 1, row_sq + 2);
+ }
+ StoreAligned64(sums, row);
+ StoreAligned64(square_sums + 0, row_sq + 0);
+ StoreAligned64(square_sums + 16, row_sq + 2);
+ sq[0] = sq[2];
+ src += 32;
+ sums += 32;
+ square_sums += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sums += sum_stride - sum_width - 8;
+ square_sums += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+ // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
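+ // For n == 9: sum_sq + (sum_sq << 3) == 9 * sum_sq. For n == 25 the extra
+ // (sum_sq << 4) below contributes the remaining 16 * sum_sq.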
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
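+// A scalar sketch of the computation above (reference only; the function and
+// variable names are illustrative, not part of the library, and n is the
+// template parameter):
+//   uint32_t CalculateMaScalar(uint32_t sum, uint32_t sum_sq, uint32_t scale) {
+//     const uint32_t d = sum, a = sum_sq;
+//     const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
+//     const uint32_t shift = kSgrProjScaleBits;
+//     return (p * scale + (1 << (shift - 1))) >> shift;  // rounding shift
+//   }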
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m256i dxd = _mm256_madd_epi16(sum, sum);
+ // _mm256_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
+ __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+ const __m256i sub = _mm256_sub_epi32(axn, dxd);
+ const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+ const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m256i sum_lo = _mm256_unpacklo_epi16(sum, _mm256_setzero_si256());
+ const __m256i sum_hi = _mm256_unpackhi_epi16(sum, _mm256_setzero_si256());
+ const __m256i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m256i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm256_packus_epi32(z0, z1);
+}
+
+inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
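+ // one_over_n (164) does not fit in the signed 8-bit multiplier lanes of
+ // _mm_maddubs_epi16(), so multiply by one_over_n_quarter (41) here and
+ // shift right by 2 fewer bits below.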
+ const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+ const __m128i m0 = VmullLo16(m, sum);
+ const __m128i m1 = VmullHi16(m, sum);
+ const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline __m256i CalculateB5(const __m256i sum, const __m256i ma) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
+ const __m256i m =
+ _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter));
+ const __m256i m0 = VmullLo16(m, sum);
+ const __m256i m1 = VmullHi16(m, sum);
+ const __m256i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ const __m256i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+ return _mm256_packus_epi32(b_lo, b_hi);
+}
+
+inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline __m256i CalculateB3(const __m256i sum, const __m256i ma) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+ const __m256i m0 = VmullLo16(ma, sum);
+ const __m256i m1 = VmullHi16(ma, sum);
+ const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+ const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+ const __m256i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m256i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm256_packus_epi32(b_lo, b_hi);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i* const b) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+ // In practice nothing is stored or loaded: the compiler keeps |temp| in a
+ // 64-bit general-purpose register, which is faster than using
+ // _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+ // b = ma * sum * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
+}
+
+// Repeat the first 48 elements in kSgrMaLookup with a period of 16.
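+// _mm256_shuffle_epi8() looks up bytes within each 128-bit lane, so each
+// 16-entry row must appear in both lanes.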
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5};
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction or from the sign bit of the index.
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+ __m256i mask;
+ mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+ mask = _mm256_or_si256(mask, index);
+ return _mm256_shuffle_epi8(table, mask);
+}
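+// For example, an index byte of 3 selects table[3] within its lane. An index
+// of 20 trips the comparison, and an index that has gone negative after the
+// subtractions in the caller carries its own sign bit; both yield 0.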
+
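+// _mm256_cmpgt_epi8() returns 0 or -1 per byte, so the add below decrements
+// |value| by 1 exactly where index > threshold.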
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+ const int threshold) {
+ const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+ const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
+ return _mm256_add_epi8(value, offset);
+}
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+ __m256i ma[3], __m256i b[2]) {
+ static_assert(n == 9 || n == 25, "");
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+ const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+ const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+ const __m256i indices = _mm256_packus_epi16(index[0], index[1]);
+ __m256i idx, mas;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+ // All elements whose indices are less than 48 are set to 0.
+ // Get shuffle results for indices in range [0, 15].
+ mas = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ mas = _mm256_or_si256(mas, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res2 = ShuffleIndex(c2, idx);
+ mas = _mm256_or_si256(mas, res2);
+
+ // For elements whose indices are larger than 47, the table values change
+ // only rarely as the index increases, so comparison and arithmetic
+ // operations are used to calculate them.
+ // Add -128 to apply signed comparison instructions.
+ idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+ // Elements whose indices are larger than 47 (still 0 from the shuffles
+ // above) are set to 5.
+ mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
+ mas = AdjustValue(mas, idx, 55); // 55 is the last index whose value is 5.
+ mas = AdjustValue(mas, idx, 72); // 72 is the last index whose value is 4.
+ mas = AdjustValue(mas, idx, 101); // 101 is the last index whose value is 3.
+ mas = AdjustValue(mas, idx, 169); // 169 is the last index whose value is 2.
+ mas = AdjustValue(mas, idx, 254); // 254 is the last index whose value is 1.
+
+ ma[2] = _mm256_permute4x64_epi64(mas, 0x93); // 32-39 8-15 16-23 24-31
+ ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc); // 0-7 8-15 16-23 24-31
+ ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21);
+
+ // b = ma * sum * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
+ const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
+ if (n == 9) {
+ b[0] = CalculateB3(sum[0], maq0);
+ b[1] = CalculateB3(sum[1], maq1);
+ } else {
+ b[0] = CalculateB5(sum[0], maq0);
+ b[1] = CalculateB5(sum[1], maq1);
+ }
+}
+
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9>(sum, index, ma, b);
+}
+
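+// 444 denotes four times the (1, 1, 1) column sum; 343 is that 444 sum minus
+// the column sum plus one extra center column, i.e. column weights (3, 4, 3).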
+inline void Store343_444(const __m256i b3[2], const ptrdiff_t x,
+ __m256i sum_b343[2], __m256i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m256i b[3], sum_b111[2];
+ Prepare3_16(b3, b);
+ sum_b111[0] = Sum3WLo32(b);
+ sum_b111[1] = Sum3WHi32(b);
+ sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2);
+ sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2);
+ StoreAligned64(b444 + x, sum_b444);
+ sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+ sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+ StoreAligned64(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned32(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned32(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][3], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ s5[0][3] = Sum5Horizontal(s[0][0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal(s[1][0]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint8_t* const src0, const uint8_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width,
+ const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m256i sq[2][3], __m256i ma[3],
+ __m256i b[3]) {
+ const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+ const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+ __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+ sq[0][1] = SquareLo8(s0);
+ sq[0][2] = SquareHi8(s0);
+ sq[1][1] = SquareLo8(s1);
+ sq[1][2] = SquareHi8(s1);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ Sum5Horizontal(src0, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+ Sum5Horizontal(src1, over_read_in_bytes, &s5[0][4], &s5[1][4]);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ Sum5WHorizontal(sq[0] + 1, sq5[3]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+ Sum5WHorizontal(sq[1] + 1, sq5[4]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[5], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ s5[3] = s5[4] = Sum5Horizontal(s);
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ __m256i sq[3], __m256i ma[3], __m256i b[3]) {
+ const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ Sum5Horizontal(src, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ Sum5WHorizontal(sq + 1, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s3[3], sq3[3][2];
+ sq[1] = SquareHi8(s);
+ s3[2] = Sum3Horizontal(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3WHorizontal(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[3],
+ __m256i ma[3], __m256i b[3]) {
+ const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s3[4], sq3[3][2], sum[2], index[2];
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ Sum3Horizontal(src, over_read_in_bytes, s3 + 2);
+ StoreAligned64(sum3[2] + x, s3 + 2);
+ Sum3WHorizontal(sq + 0, sq3[2]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ LoadAligned32x2U16(sum3, x, s3);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ Sum3WHorizontal(sq + 1, sq3[2]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate<9>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i ma3[2],
+ __m128i b3[2], __m128i* const ma5, __m128i* const b5) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0]);
+ sq[1][1] = SquareHi8(s[1]);
+ SumHorizontalLo(s[0], &s3[2], &s5[3]);
+ SumHorizontalLo(s[1], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ // Note: the SSE4_1 version calls CalculateIntermediate() to replace the
+ // slow LookupIntermediate() when calculating 16 intermediate data points.
+ // For AVX2, however, compilers generate even slower code for that approach,
+ // so CalculateIntermediate3() is kept here.
+ CalculateIntermediate3(s3 + 0, sq3 + 0, scales[1], &ma3[0], &b3[0]);
+ CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], &ma3[1], &b3[1]);
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t x,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m256i sq[2][3], __m256i ma3[2][3],
+ __m256i b3[2][5], __m256i ma5[3], __m256i b5[5]) {
+ const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+ const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+ __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2], index_3[2][2],
+ sum_5[2], index_5[2];
+ sq[0][1] = SquareLo8(s0);
+ sq[0][2] = SquareHi8(s0);
+ sq[1][1] = SquareLo8(s1);
+ sq[1][2] = SquareHi8(s1);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ SumHorizontal(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4],
+ &s5[1][4]);
+ StoreAligned32(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned32(sum3[2] + x + 16, s3[1][2]);
+ StoreAligned32(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned32(sum3[3] + x + 16, s3[1][3]);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ StoreAligned64(square_sum3[3] + x, sq3[3]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0],
+ &index_3[1][0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+ StoreAligned64(square_sum3[3] + x + 16, sq3[3]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1],
+ &index_3[1][1]);
+ CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], b3[0] + 1);
+ CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], b3[1] + 1);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+ b3[0][0] = _mm256_permute2x128_si256(b3[0][0], b3[0][2], 0x21);
+ b3[1][0] = _mm256_permute2x128_si256(b3[1][0], b3[1][2], 0x21);
+ b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+ __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ SumHorizontalLo(s, &s3[2], &s5[3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5],
+ __m256i b5[5]) {
+ const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2],
+ sum_5[2], index_5[2];
+ sq[1] = SquareLo8(s0);
+ sq[2] = SquareHi8(s0);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ SumHorizontal(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]);
+ CalculateIntermediate<9>(sum_3, index_3, ma3, b3 + 1);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+ b3[0] = _mm256_permute2x128_si256(b3[0], b3[2], 0x21);
+ b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+ const uint8_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ __m128i ma0, b0, s[2][3], sq_128[2][2];
+ __m256i mas[3], sq[2][3], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0][0]);
+ sq_128[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma5[3], ma[2], b[4];
+ BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned64(ma565, ma);
+ Sum565W(bs + 0, b + 0);
+ Sum565W(bs + 1, b + 2);
+ StoreAligned64(b565, b + 0);
+ StoreAligned64(b565 + 16, b + 2);
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint8_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width);
+ __m128i ma0, sq_128[2], b0;
+ __m256i mas[3], sq[3], bs[3];
+ sq_128[0] = SquareLo8(s);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma3[3];
+ BoxFilterPreProcess3(src + x + 8, x + 8 + kOverreadInBytesPass2_256 - width,
+ x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ Prepare3_8(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 1, 16, ma343, ma444, b343, b444);
+ ma444 += 32;
+ b444 += 32;
+ } else {
+ __m256i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned64(ma343, ma);
+ Sum343W(bs + 0, b + 0);
+ Sum343W(bs + 1, b + 2);
+ StoreAligned64(b343 + 0, b + 0);
+ StoreAligned64(b343 + 16, b + 2);
+ }
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma343 += 32;
+ b343 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+ uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+ uint32_t* b565) {
+ __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+ __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0]);
+ sq_128[1][0] = SquareLo8(s[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_0, &b5_0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+ ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+ b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[2], b[4], ma3x[3], ma5x[3];
+ BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+ sq, ma3, b3, ma5, b5);
+ Prepare3_8(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned64(ma343[0] + x, ma);
+ Sum343W(b3[0], b);
+ StoreAligned64(b343[0] + x, b);
+ Sum565W(b5, b);
+ StoreAligned64(b565, b);
+ Prepare3_8(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444, b343[1], b444);
+ Prepare3_8(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned64(ma565, ma);
+ Sum343W(b3[0] + 1, b);
+ StoreAligned64(b343[0] + x + 16, b);
+ Sum565W(b5 + 1, b);
+ StoreAligned64(b565 + 16, b);
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+ const __m256i b[2]) {
+ const __m256i ma_x_src_lo = VmullLo16(ma, src);
+ const __m256i ma_x_src_hi = VmullHi16(ma, src);
+ const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return _mm256_packs_epi32(dst_lo, dst_hi); // 13 bits
+}
+
+inline __m256i CalculateFilteredOutputPass1(const __m256i src,
+ const __m256i ma[2],
+ const __m256i b[2][2]) {
+ const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+ __m256i b_sum[2];
+ b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src,
+ const __m256i ma[3],
+ const __m256i b[3][2]) {
+ const __m256i ma_sum = Sum3_16(ma);
+ __m256i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+ const __m256i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+ return _mm256_add_epi16(src, vv);
+}
+
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+ const __m256i filter[2], const int w0,
+ const int w2) {
+ __m256i v[2];
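+ // Each 32-bit lane of |w0_w2| holds the 16-bit pair (w0, w2). After
+ // interleaving filter[0] with filter[1], a single _mm256_madd_epi16()
+ // computes filter[0] * w0 + filter[1] * w2 per 32-bit lane.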
+ const __m256i w0_w2 =
+ _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+ const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+ const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+ v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+ const __m256i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m256i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i ma0, b0, s[2][3], sq_128[2][2];
+ __m256i mas[3], sq[2][3], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0][0]);
+ sq_128[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], ma5[3], b[2][2][2];
+ BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ ma[2] = Sum565Hi(ma5);
+ StoreAligned64(ma565[1] + x, ma + 1);
+ Sum565W(bs + 0, b[0][1]);
+ Sum565W(bs + 1, b[1][1]);
+ StoreAligned64(b565[1] + x + 0, b[0][1]);
+ StoreAligned64(b565[1] + x + 16, b[1][1]);
+ const __m256i sr0 = LoadUnaligned32(src + x);
+ const __m256i sr1 = LoadUnaligned32(src + stride + x);
+ const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0][0]);
+ const __m256i p00 = CalculateFilteredOutputPass1(sr0_lo, ma, b[0]);
+ const __m256i p01 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[0][1]);
+ const __m256i d00 = SelfGuidedSingleMultiplier(sr0_lo, p00, w0);
+ const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p01, w0);
+ const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+ ma[1] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[1][0]);
+ const __m256i p10 = CalculateFilteredOutputPass1(sr0_hi, ma + 1, b[1]);
+ const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[2], b[1][1]);
+ const __m256i d01 = SelfGuidedSingleMultiplier(sr0_hi, p10, w0);
+ const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+ StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ __m128i ma0, b0, sq_128[2];
+ __m256i mas[3], sq[3], bs[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcess5LastRowLo(s0, scale, sum5, square_sum5, sq_128, &ma0,
+ &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], ma5[3], b[2][2];
+ BoxFilterPreProcess5LastRow(
+ src0 + x + 8, x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ ma[2] = Sum565Hi(ma5);
+ Sum565W(bs + 0, b[1]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565);
+ LoadAligned64(b565 + 0, b[0]);
+ const __m256i p0 = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[1] = LoadAligned32(ma565 + 16);
+ LoadAligned64(b565 + 16, b[0]);
+ Sum565W(bs + 1, b[1]);
+ const __m256i p1 = CalculateFilteredOutputPass1(sr_hi, ma + 1, b);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass2_128 - width);
+ __m128i ma0, b0, sq_128[2];
+ __m256i mas[3], sq[3], bs[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[4], b[4][2], ma3[3];
+ BoxFilterPreProcess3(src0 + x + 8,
+ x + 8 + kOverreadInBytesPass2_256 - width, x + 8,
+ sum_width, scale, sum3, square_sum3, sq, mas, bs);
+ Prepare3_8(mas, ma3);
+ Store343_444Lo(ma3, bs + 0, x + 0, &ma[2], b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ Store343_444Hi(ma3, bs + 1, x + 16, &ma[3], b[3], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma343[0] + x);
+ ma[1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[0]);
+ LoadAligned64(b444[0] + x, b[1]);
+ const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ ma[1] = LoadAligned32(ma343[0] + x + 16);
+ ma[2] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[1]);
+ LoadAligned64(b444[0] + x + 16, b[2]);
+ const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+ __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0]);
+ sq_128[1][0] = SquareLo8(s[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_0, &b5_0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+ ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+ b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[3][3], mat[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+ BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+ sq, ma3, b3, ma5, b5);
+ Prepare3_8(ma3[0], ma3x[0]);
+ Prepare3_8(ma3[1], ma3x[1]);
+ Prepare3_8(ma5, ma5x);
+ Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565Lo(ma5x);
+ ma[0][2] = Sum565Hi(ma5x);
+ mat[0][1] = ma[0][2];
+ StoreAligned64(ma565[1] + x, ma[0] + 1);
+ Sum565W(b5, b[0][1]);
+ StoreAligned64(b565[1] + x, b[0][1]);
+ const __m256i sr0 = LoadUnaligned32(src + x);
+ const __m256i sr1 = LoadUnaligned32(src + stride + x);
+ const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+ ma[0][0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned32(ma343[0] + x);
+ ma[1][1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[1][0]);
+ LoadAligned64(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ma[2][0] = LoadAligned32(ma343[1] + x);
+ LoadAligned64(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m256i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+ Sum565W(b5 + 1, b[0][1]);
+ StoreAligned64(b565[1] + x + 16, b[0][1]);
+ Store343_444Hi(ma3x[0], b3[0] + 1, x + 16, &mat[1][2], &mat[2][1], b[1][2],
+ b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Hi(ma3x[1], b3[1] + 1, x + 16, &mat[2][2], b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+ mat[0][0] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, mat[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, mat[0][1], b[0][1]);
+ mat[1][0] = LoadAligned32(ma343[0] + x + 16);
+ mat[1][1] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[1][0]);
+ LoadAligned64(b444[0] + x + 16, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], b[1]);
+ const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ mat[2][0] = LoadAligned32(ma343[1] + x + 16);
+ LoadAligned64(b343[1] + x + 16, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], b[2]);
+ const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+ StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ __m128i ma3_0, ma5_0, b3_0, b5_0, sq_128[2];
+ __m256i ma3[3], ma5[3], sq[3], b3[3], b5[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcessLastRowLo(s0, scales, sum3, sum5, square_sum3, square_sum5,
+ sq_128, &ma3_0, &ma5_0, &b3_0, &b5_0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ ma3[0] = SetrM128i(ma3_0, ma3_0);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0] = SetrM128i(b3_0, b3_0);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], mat[3], b[3][2], p[2], ma3x[3], ma5x[3];
+ BoxFilterPreProcessLastRow(src0 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width,
+ sum_width, x + 8, scales, sum3, sum5,
+ square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+ Prepare3_8(ma3, ma3x);
+ Prepare3_8(ma5, ma5x);
+ ma[1] = Sum565Lo(ma5x);
+ Sum565W(b5, b[1]);
+ ma[2] = Sum343Lo(ma3x);
+ Sum343W(b3, b[2]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565 + x);
+ LoadAligned64(b565 + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned32(ma343 + x);
+ ma[1] = LoadAligned32(ma444 + x);
+ LoadAligned64(b343 + x, b[0]);
+ LoadAligned64(b444 + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ mat[1] = Sum565Hi(ma5x);
+ Sum565W(b5 + 1, b[1]);
+ mat[2] = Sum343Hi(ma3x);
+ Sum343W(b3 + 1, b[2]);
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ mat[0] = LoadAligned32(ma565 + x + 16);
+ LoadAligned64(b565 + x + 16, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, mat, b);
+ mat[0] = LoadAligned32(ma343 + x + 16);
+ mat[1] = LoadAligned32(ma444 + x + 16);
+ LoadAligned64(b343 + x + 16, b[0]);
+ LoadAligned64(b444 + x + 16, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, mat, b);
+ const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ ma3[0] = ma3[2];
+ ma5[0] = ma5[2];
+ b3[0] = b3[2];
+ b5[0] = b5[2];
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint8_t* src,
+ const ptrdiff_t stride, const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
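+ // Two rows are filtered per main-loop iteration. The box sums and the
+ // intermediate ma*/b* rows live in ring buffers whose pointers are
+ // rotated (Circulate*PointersBy2()/std::swap()) as the filter walks down
+ // the restoration unit.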
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
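+ // By construction w0 + w1 + w2 == (1 << kSgrProjPrecisionBits), i.e. the
+ // three weights sum to unity at SGR precision.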
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444[0], ma565[0],
+ b343, b444[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+ square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+ b444[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src, const ptrdiff_t stride,
+ const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride,
+ sum5[1], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+ sum_width, scale, w0, sum5, square_sum5, ma565[0],
+ b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src, const ptrdiff_t stride,
+ const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
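+ // Pass 1 is disabled (multiplier[0] == 0 above), so the weight applied
+ // to the pass 2 filter output is the complement of |w1|.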
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, temp_stride,
+ sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint8_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 32, up to 31 extra pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it
+// will not be part of the visible frame.
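+// E.g. with width == 100 the inner loops run for x = 0, 32, 64 and 96, so
+// columns 100..127 of each row are (harmlessly) overwritten.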
+void SelfGuidedFilter_AVX2(
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* top = static_cast<const uint8_t*>(top_border);
+ const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_AVX2(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_AVX2;
+#endif
+#if DSP_ENABLED_8BPP_AVX2(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_AVX2;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void LoopRestorationInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations; see the defines below for specifics.
+// These functions are not thread-safe.
+void LoopRestorationInit_AVX2();
+void LoopRestorationInit10bpp_AVX2();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If avx2 is enabled and the baseline isn't set due to a higher optimization
+// level being enabled, signal that the avx2 implementation should be used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
+#endif
+
+#endif // LIBGAV1_TARGETING_AVX2
+
+#endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+inline void WienerHorizontalClip(const __m128i s[2], const __m128i s_3x128,
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit =
+ (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
+ const __m128i offsets = _mm_set1_epi16(-offset);
+ const __m128i limits = _mm_set1_epi16(limit - offset);
+ // The sum range here is [-128 * 255 + 4, 90 * 255 + 4].
+ const __m128i sum = _mm_add_epi16(s[0], s[1]);
+ const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
+ // Add back scaled down offset correction.
+ const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128);
+ const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets);
+ const __m128i d1 = _mm_min_epi16(d0, limits);
+ StoreAligned16(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7Kernel(const __m128i s[4],
+ const __m128i filter[4],
+ int16_t* const wiener_buffer) {
+ __m128i madds[4];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+ madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
+ madds[3] = _mm_maddubs_epi16(s[3], filter[3]);
+ madds[0] = _mm_add_epi16(madds[0], madds[2]);
+ madds[1] = _mm_add_epi16(madds[1], madds[3]);
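+ // s[1] interleaves pixels 2 and 3; the srli/slli pair below extracts
+ // pixel 3 (the center tap) scaled by 128 >> kInterRoundBitsHorizontal,
+ // restoring the 128 that was subtracted from the center coefficient.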
+ const __m128i s_3x128 =
+ _mm_slli_epi16(_mm_srli_epi16(s[1], 8), 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m128i s[5],
+ const __m128i filter[3],
+ int16_t* const wiener_buffer) {
+ __m128i madds[3];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+ madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
+ madds[0] = _mm_add_epi16(madds[0], madds[2]);
+ const __m128i s_3x128 =
+ _mm_srli_epi16(_mm_slli_epi16(s[1], 8), kInterRoundBitsHorizontal + 1);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m128i s[2],
+ const __m128i filter[2],
+ int16_t* const wiener_buffer) {
+ __m128i madds[2];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+ const __m128i s_3x128 =
+ _mm_slli_epi16(_mm_srli_epi16(s[0], 8), 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+// Loading all pixels and unpacking is about 7% faster than using
+// _mm_alignr_epi8().
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int coefficient0,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+ __m128i filter[4];
+ filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0200));
+ filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
+ filter[2] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0204));
+ filter[3] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient0));
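+ // ss[3] interleaves s[6] with |round|, and the (1 << 8) in filter[3]
+ // multiplies that rounding byte by 1, folding the rounding add into the
+ // final _mm_maddubs_epi16().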
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[7], ss[4];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ s[5] = LoadUnaligned16(src + x + 5);
+ s[6] = LoadUnaligned16(src + x + 6);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ ss[3] = _mm_unpacklo_epi8(s[6], round);
+ WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ ss[2] = _mm_unpackhi_epi8(s[4], s[5]);
+ ss[3] = _mm_unpackhi_epi8(s[6], round);
+ WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 8);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int coefficient1,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+ __m128i filter[3];
+ filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0402));
+ filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0406));
+ filter[2] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient1));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[5], ss[3];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ ss[2] = _mm_unpacklo_epi8(s[4], round);
+ WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ ss[2] = _mm_unpackhi_epi8(s[4], round);
+ WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 8);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int coefficient2,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
+ filter[1] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient2));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[3], ss[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], round);
+ WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], round);
+ WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 8);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m128i s = LoadUnaligned16(src + x);
+ const __m128i s0 = _mm_unpacklo_epi8(s, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi8(s, _mm_setzero_si128());
+ const __m128i d0 = _mm_slli_epi16(s0, 4);
+ const __m128i d1 = _mm_slli_epi16(s1, 4);
+ StoreAligned16(*wiener_buffer + x + 0, d0);
+ StoreAligned16(*wiener_buffer + x + 8, d1);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m128i WienerVertical7(const __m128i a[2], const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i sum0 = _mm_add_epi32(round, madd0);
+ const __m128i sum1 = _mm_add_epi32(sum0, madd1);
+ return _mm_srai_epi32(sum1, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical5(const __m128i a[2], const __m128i filter[2]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i sum = _mm_add_epi32(madd0, madd1);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical3(const __m128i a, const __m128i filter) {
+ const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m128i madd = _mm_madd_epi16(a, filter);
+ const __m128i sum = _mm_add_epi32(round, madd);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVerticalFilter7(const __m128i a[7],
+ const __m128i filter[2]) {
+ __m128i b[2];
+ const __m128i a06 = _mm_add_epi16(a[0], a[6]);
+ const __m128i a15 = _mm_add_epi16(a[1], a[5]);
+ const __m128i a24 = _mm_add_epi16(a[2], a[4]);
+ b[0] = _mm_unpacklo_epi16(a06, a15);
+ b[1] = _mm_unpacklo_epi16(a24, a[3]);
+ const __m128i sum0 = WienerVertical7(b, filter);
+ b[0] = _mm_unpackhi_epi16(a06, a15);
+ b[1] = _mm_unpackhi_epi16(a24, a[3]);
+ const __m128i sum1 = WienerVertical7(b, filter);
+ return _mm_packs_epi32(sum0, sum1);
+}
+
+inline __m128i WienerVerticalFilter5(const __m128i a[5],
+ const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
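+ // |round| is interleaved with a[2] below; the (1 << 16) half of
+ // filter[1] then multiplies it by 1 inside _mm_madd_epi16(), which is
+ // why WienerVertical5() has no separate rounding add.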
+ __m128i b[2];
+ const __m128i a04 = _mm_add_epi16(a[0], a[4]);
+ const __m128i a13 = _mm_add_epi16(a[1], a[3]);
+ b[0] = _mm_unpacklo_epi16(a04, a13);
+ b[1] = _mm_unpacklo_epi16(a[2], round);
+ const __m128i sum0 = WienerVertical5(b, filter);
+ b[0] = _mm_unpackhi_epi16(a04, a13);
+ b[1] = _mm_unpackhi_epi16(a[2], round);
+ const __m128i sum1 = WienerVertical5(b, filter);
+ return _mm_packs_epi32(sum0, sum1);
+}
+
+inline __m128i WienerVerticalFilter3(const __m128i a[3], const __m128i filter) {
+ __m128i b;
+ const __m128i a02 = _mm_add_epi16(a[0], a[2]);
+ b = _mm_unpacklo_epi16(a02, a[1]);
+ const __m128i sum0 = WienerVertical3(b, filter);
+ b = _mm_unpackhi_epi16(a02, a[1]);
+ const __m128i sum1 = WienerVertical3(b, filter);
+ return _mm_packs_epi32(sum0, sum1);
+}
+
+inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[7]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[5]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter, __m128i a[3]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i d[2]) {
+ __m128i a[8];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = LoadAligned16(wiener_buffer + 7 * wiener_stride);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i d[2]) {
+ __m128i a[6];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter, __m128i d[2]) {
+ __m128i a[4];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = LoadLo8(coefficients);
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi32(c, 0x0);
+ filter[1] = _mm_shuffle_epi32(c, 0x55);
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i d[2][2];
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+ StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+ StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[7];
+ const __m128i d0 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m128i d1 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 8, width, filter, a);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = Load4(coefficients);
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi32(c, 0);
+ filter[1] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i d[2][2];
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+ StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+ StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[5];
+ const __m128i d0 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m128i d1 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 8, width, filter, a);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i filter =
+ _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i d[2][2];
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+ StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+ StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[3];
+ const __m128i d0 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m128i d1 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 8, width, filter, a);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint8_t* const dst) {
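+ // Tap 1 only needs to undo the << 4 applied by WienerHorizontalTap1(),
+ // i.e. a rounding shift: (x + 8) >> 4.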
+ const __m128i a0 = LoadAligned16(wiener_buffer + 0);
+ const __m128i a1 = LoadAligned16(wiener_buffer + 8);
+ const __m128i b0 = _mm_add_epi16(a0, _mm_set1_epi16(8));
+ const __m128i b1 = _mm_add_epi16(a1, _mm_set1_epi16(8));
+ const __m128i c0 = _mm_srai_epi16(b0, 4);
+ const __m128i c1 = _mm_srai_epi16(b1, 4);
+ const __m128i d = _mm_packus_epi16(c0, c1);
+ StoreAligned16(dst, d);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_SSE4_1(
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+ // Horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* const top = static_cast<const uint8_t*>(top_border);
+ const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+ const int16_t* const filter_horizontal =
+ restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
+ const __m128i c = LoadLo8(filter_horizontal);
+ // In order to keep the horizontal pass intermediate values within 16 bits we
+ // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
+ const __m128i coefficients_horizontal =
+ _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
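+ // After the offset the center tap lies in [-128, 90], which fits the
+ // signed 8-bit operand of _mm_maddubs_epi16(); this is also the origin
+ // of the sum range noted in WienerHorizontalClip().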
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
+ filter_horizontal[0], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ filter_horizontal[0], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal[0],
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
+ filter_horizontal[1], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ filter_horizontal[1], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal[1],
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
+ filter_horizontal[2], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ filter_horizontal[2], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal[2],
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
+ }
+
+ // Vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+ // Because the top row of |source| is a duplicate of the second row, and
+ // the bottom row of |source| is a duplicate of the row above it, we can
+ // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 10;
+constexpr int kOverreadInBytesPass2 = 12;
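+// In the worst case (width % 16 == 0) this is 16 - 2 * 3 = 10 bytes for
+// Pass 1 and 16 - 2 * 2 = 12 bytes for Pass 2, hence the constants above.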
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+ dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+ const ptrdiff_t border, __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
+ dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+ LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
+ StoreAligned32U32(dst + 0, src + 0);
+ StoreAligned32U32(dst + 8, src + 2);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate very inefficient code that could
+// make the whole decoder 15% slower.
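+// The helpers below widen by unpacking against zero instead; they are named
+// after the NEON vaddl/vaddw/vmull intrinsics they emulate.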
+
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m128i VmullNLo8(const __m128i src0, const int src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullNHi8(const __m128i src0, const int src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrS32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
+inline __m128i SquareLo8(const __m128i src) {
+ const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline __m128i SquareHi8(const __m128i src) {
+ const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+}
+
+template <int offset>
+inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+ dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+ dst[3] = _mm_srli_si128(src, 3);
+ dst[4] = _mm_srli_si128(src, 4);
+}
+
+template <int offset>
+inline void Prepare5_8(const __m128i src[2], __m128i dst[5]) {
+ dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+ dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], offset + 3);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], offset + 4);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+ const __m128i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m128i Sum3WHi16(const __m128i src[3]) {
+ const __m128i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+ const __m128i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
+}
+
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+ const __m128i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
+ const __m128i sum01 = VaddlLo8(src[0], src[1]);
+ const __m128i sum23 = VaddlLo8(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, src[4]);
+}
+
+inline __m128i Sum5WHi16(const __m128i src[5]) {
+ const __m128i sum01 = VaddlHi8(src[0], src[1]);
+ const __m128i sum23 = VaddlHi8(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwHi8(sum, src[4]);
+}
+
+inline __m128i Sum3Horizontal(const __m128i src) {
+ __m128i s[3];
+ Prepare3Lo8(src, s);
+ return Sum3WLo16(s);
+}
+
+template <int offset>
+inline void Sum3Horizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_8<offset>(src, s);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
+}
+
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
+}
+
+inline __m128i Sum5Horizontal(const __m128i src) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ return Sum5WLo16(s);
+}
+
+template <int offset>
+inline void Sum5Horizontal(const __m128i src[2], __m128i* const dst0,
+ __m128i* const dst1) {
+ __m128i s[5];
+ Prepare5_8<offset>(src, s);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
+}
+
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m128i src, __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ const __m128i sum04 = VaddlLo8(s[0], s[4]);
+ *row3 = Sum3WLo16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+template <int offset>
+void SumHorizontal(const __m128i src[2], __m128i* const row3_0,
+ __m128i* const row3_1, __m128i* const row5_0,
+ __m128i* const row5_1) {
+ __m128i s[5];
+ Prepare5_8<offset>(src, s);
+ const __m128i sum04_lo = VaddlLo8(s[0], s[4]);
+ const __m128i sum04_hi = VaddlHi8(s[0], s[4]);
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = _mm_add_epi16(sum04_lo, *row3_0);
+ *row5_1 = _mm_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
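+// Sum343* compute the weighted sum 3 * a[0] + 4 * a[1] + 3 * a[2], expressed
+// as 3 * (a[0] + a[1] + a[2]) + a[1].
+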
+inline __m128i Sum343Lo(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WLo16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343Hi(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WHi16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343WLo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo32(src);
+ const __m128i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwLo16(sum3, src[1]);
+}
+
+inline __m128i Sum343WHi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi32(src);
+ const __m128i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum343WLo(s);
+ dst[1] = Sum343WHi(s);
+}
+
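+// Sum565* compute the weighted sum 5 * a[0] + 6 * a[1] + 5 * a[2], expressed
+// as 5 * (a[0] + a[1] + a[2]) + a[1], with the multiply by 5 done as a shift
+// and an add.
+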
+inline __m128i Sum565Lo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m128i Sum565Hi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m128i Sum565WLo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo32(src);
+ const __m128i sum4 = _mm_slli_epi32(sum, 2);
+ const __m128i sum5 = _mm_add_epi32(sum4, sum);
+ return VaddwLo16(sum5, src[1]);
+}
+
+inline __m128i Sum565WHi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi32(src);
+ const __m128i sum4 = _mm_slli_epi32(sum, 2);
+ const __m128i sum5 = _mm_add_epi32(sum4, sum);
+ return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum565WLo(s);
+ dst[1] = Sum565WHi(s);
+}
+
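+// Computes running horizontal box sums (width 3 and width 5) of pixels and of
+// squared pixels for two consecutive rows, 16 pixels per iteration.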
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ int y = 2;
+ do {
+ __m128i s[2], sq[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
+ x -= 16;
+ src += 16;
+ s[1] = LoadUnaligned16Msan(src,
+ sum_width - x + kOverreadInBytesPass1 - width);
+ sq[1] = SquareHi8(s[0]);
+ sq[2] = SquareLo8(s[1]);
+ SumHorizontal<0>(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned32U16(sum3, row3);
+ StoreAligned32U16(sum5, row5);
+ SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 0, row_sq3);
+ StoreAligned32U32(square_sum5 + 0, row_sq5);
+ SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 8, row_sq3);
+ StoreAligned32U32(square_sum5 + 8, row_sq5);
+ s[0] = s[1];
+ sq[0] = sq[2];
+ sum3 += 16;
+ sum5 += 16;
+ square_sum3 += 16;
+ square_sum5 += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sum3 += sum_stride - sum_width;
+ sum5 += sum_stride - sum_width;
+ square_sum3 += sum_stride - sum_width;
+ square_sum5 += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
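+// As above, but computes the sums for a single box size (3 or 5).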
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ constexpr int kOverreadInBytes =
+ (size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2;
+ int y = 2;
+ do {
+ __m128i s[2], sq[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytes - width);
+ sq[0] = SquareLo8(s[0]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row[2], row_sq[4];
+ x -= 16;
+ src += 16;
+ s[1] = LoadUnaligned16Msan(src, sum_width - x + kOverreadInBytes - width);
+ sq[1] = SquareHi8(s[0]);
+ sq[2] = SquareLo8(s[1]);
+ if (size == 3) {
+ Sum3Horizontal<0>(s, row);
+ Sum3WHorizontal(sq + 0, row_sq + 0);
+ Sum3WHorizontal(sq + 1, row_sq + 2);
+ } else {
+ Sum5Horizontal<0>(s, &row[0], &row[1]);
+ Sum5WHorizontal(sq + 0, row_sq + 0);
+ Sum5WHorizontal(sq + 1, row_sq + 2);
+ }
+ StoreAligned32U16(sums, row);
+ StoreAligned64U32(square_sums, row_sq);
+ s[0] = s[1];
+ sq[0] = sq[2];
+ sums += 16;
+ square_sums += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sums += sum_stride - sum_width;
+ square_sums += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency, so use shifts and additions instead.
+  // Some compilers could do this for us, but we make it explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
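+  // n == 9: sum_sq * 9 == sum_sq + (sum_sq << 3).
+  // n == 25: also add (sum_sq << 4), since 1 + 8 + 16 == 25.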
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm_packus_epi32(z0, z1);
+}
+
+inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
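+  // _mm_maddubs_epi16() saturates to signed 16 bits, and 255 * 164 == 41820
+  // would overflow, so multiply by one_over_n_quarter == 41 here and shift by
+  // 2 fewer bits below.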
+ const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+ const __m128i m0 = VmullLo16(m, sum);
+ const __m128i m1 = VmullHi16(m, sum);
+ const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i* const b) {
+ static_assert(n == 9 || n == 25, "");
+ static_assert(offset == 0 || offset == 8, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+  // The data is not actually stored and reloaded: the compiler keeps |temp|
+  // in a 64-bit general-purpose register, which is faster than using
+  // _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+  // offset == 0 is assumed to be the first call to this function. The value
+  // is mov'd to avoid -Wuninitialized warnings under gcc. mov should be at
+  // least as fast as, if not faster than, pinsrb.
+ if (offset == 0) {
+ *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+ } else {
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+ }
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ __m128i maq;
+ if (offset == 0) {
+ maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ } else {
+ maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ }
+ *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
+}
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction or from the sign bit of the index.
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+ __m128i mask;
+ mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+ mask = _mm_or_si128(mask, index);
+ return _mm_shuffle_epi8(table, mask);
+}
+
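+// Subtracts 1 from |value| (by adding the all-ones compare mask) in lanes
+// where |index| exceeds |threshold|. Chained calls step the value down
+// through the plateaus of kSgrMaLookup.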
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+ const int threshold) {
+ const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+ const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+ return _mm_add_epi8(value, offset);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i* const ma, __m128i* const b0,
+ __m128i* const b1) {
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+ const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+ const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+ const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+ __m128i idx;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+ // All elements whose indices are less than 48 are set to 0.
+ // Get shuffle results for indices in range [0, 15].
+ *ma = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ *ma = _mm_or_si128(*ma, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res2 = ShuffleIndex(c2, idx);
+ *ma = _mm_or_si128(*ma, res2);
+
+  // For elements whose indices are larger than 47, the lookup value changes
+  // only rarely as the index increases, so we use comparison and arithmetic
+  // operations to calculate their values.
+ // Add -128 to apply signed comparison instructions.
+ idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+ // Elements whose indices are larger than 47 (with value 0) are set to 5.
+ *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
+
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ *b0 = CalculateB3(sum[0], maq0);
+ const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ *b1 = CalculateB3(sum[1], maq1);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i ma[2], __m128i b[2]) {
+ __m128i mas;
+ CalculateIntermediate(sum, index, &mas, &b[0], &b[1]);
+ ma[0] = _mm_unpacklo_epi64(ma[0], mas);
+ ma[1] = _mm_srli_si128(mas, 8);
+}
+
+// Note: replacing the slow LookupIntermediate() with CalculateIntermediate()
+// when calculating 16 intermediate data points has been tried, but the
+// compiler generates even slower code.
+template <int offset>
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ static_assert(offset == 0 || offset == 8, "");
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
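+// Computes the running 444 (4 * the 3-tap sum) and 343 (the 444 sum minus the
+// 3-tap sum, plus the middle element) sums of |b3| and stores them at offset
+// |x|.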
+inline void Store343_444(const __m128i b3[2], const ptrdiff_t x,
+ __m128i sum_b343[2], __m128i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m128i b[3], sum_b111[2];
+ Prepare3_16(b3, b);
+ sum_b111[0] = Sum3WLo32(b);
+ sum_b111[1] = Sum3WHi32(b);
+ sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
+ sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
+ StoreAligned32U32(b444 + x, sum_b444);
+ sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+ sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+ StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
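+// The BoxFilterPreProcess* helpers below compute and store the box sums for
+// the incoming row(s), combine them with the previously stored rows, and
+// produce the per-pixel ma (multiplier) and b intermediates.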
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ s5[0][3] = Sum5Horizontal(s[0][0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal(s[1][0]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const __m128i s[2][2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][2] = SquareLo8(s[0][1]);
+ sq[1][2] = SquareLo8(s[1][1]);
+ Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ Sum5WHorizontal(sq[0] + 1, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ Sum5WHorizontal(sq[1] + 1, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
+
+ sq[0][3] = SquareHi8(s[0][1]);
+ sq[1][3] = SquareHi8(s[1][1]);
+ Sum5WHorizontal(sq[0] + 2, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ Sum5WHorizontal(sq[1] + 2, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[5], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ s5[3] = s5[4] = Sum5Horizontal(s);
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[2] = SquareLo8(s[1]);
+ Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5WHorizontal(sq + 1, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
+
+ sq[3] = SquareHi8(s[1]);
+ Sum5WHorizontal(sq + 2, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s3[3], sq3[3][2];
+ sq[1] = SquareHi8(s);
+ s3[2] = Sum3Horizontal(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3WHorizontal(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const __m128i s[2], const ptrdiff_t x, const ptrdiff_t sum_width,
+ const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s3[4], sq3[3][2], sum[2], index[2];
+ sq[2] = SquareLo8(s[1]);
+ Sum3Horizontal<8>(s, s3 + 2);
+ StoreAligned32U16(sum3[2] + x, s3 + 2);
+ Sum3WHorizontal(sq + 1, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+ LoadAligned16x2U16(sum3, x, s3);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ sq[3] = SquareHi8(s[1]);
+ Sum3WHorizontal(sq + 2, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma, b + 1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2][2], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma3[2][2],
+ __m128i b3[2][3], __m128i* const ma5, __m128i* const b5) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ SumHorizontalLo(s[0][0], &s3[2], &s5[3]);
+ SumHorizontalLo(s[1][0], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+ CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, &ma3[0][0], &b3[0][0], &b3[1][0]);
+ ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const __m128i s[2][2], const ptrdiff_t x, const uint16_t scales[2],
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m128i sq[2][4], __m128i ma3[2][2],
+ __m128i b3[2][3], __m128i ma5[2], __m128i b5[3]) {
+ __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
+ SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ StoreAligned16(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned16(sum3[2] + x + 8, s3[1][2]);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+ StoreAligned16(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned16(sum3[3] + x + 8, s3[1][3]);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ sq[0][2] = SquareLo8(s[0][1]);
+ sq[1][2] = SquareLo8(s[1][1]);
+ SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+ &index[1][0]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[1]);
+
+ sq[0][3] = SquareHi8(s[0][1]);
+ sq[1][3] = SquareHi8(s[1][1]);
+ SumHorizontal(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ SumHorizontal(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+ &index[1][1]);
+ CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 1);
+ CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 1);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+ __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ SumHorizontalLo(s, &s3[2], &s5[3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma3[2],
+ __m128i ma5[2], __m128i b3[3], __m128i b5[3]) {
+ __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
+ sq[2] = SquareLo8(s[1]);
+ SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 1);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+ sq[3] = SquareHi8(s[1]);
+ SumHorizontal(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 2);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma3, b3 + 1);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+ const uint8_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ __m128i s[2][2], mas[2], sq[2][4], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ __m128i ma5[3], ma[2], b[4];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned32U16(ma565, ma);
+ Sum565W(bs + 0, b + 0);
+ Sum565W(bs + 1, b + 2);
+ StoreAligned64U32(b565, b);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint8_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass2 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ s[1] = LoadUnaligned16Msan(src + x + 16,
+ x + 16 + kOverreadInBytesPass2 - width);
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 1, 8, ma343, ma444, b343, b444);
+ ma444 += 16;
+ b444 += 16;
+ } else {
+ __m128i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned32U16(ma343, ma);
+ Sum343W(bs + 0, b + 0);
+ Sum343W(bs + 1, b + 2);
+ StoreAligned64U32(b343, b);
+ }
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma343 += 16;
+ b343 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+ uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+ uint32_t* b565) {
+ __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[2], b[4], ma3x[3], ma5x[3];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+
+ Prepare3_8<0>(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned32U16(ma343[0] + x, ma);
+ Sum343W(b3[0] + 0, b + 0);
+ Sum343W(b3[0] + 1, b + 2);
+ StoreAligned64U32(b343[0] + x, b);
+ Sum565W(b5 + 0, b + 0);
+ Sum565W(b5 + 1, b + 2);
+ StoreAligned64U32(b565, b);
+ Prepare3_8<0>(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned32U16(ma565, ma);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const __m128i v = _mm_sub_epi32(b, ma_x_src);
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
+ const __m128i b[2]) {
+ const __m128i ma_x_src_lo = VmullLo16(ma, src);
+ const __m128i ma_x_src_hi = VmullHi16(ma, src);
+ const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits
+}
+
+inline __m128i CalculateFilteredOutputPass1(const __m128i src,
+ const __m128i ma[2],
+ const __m128i b[2][2]) {
+ const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
+ __m128i b_sum[2];
+ b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i CalculateFilteredOutputPass2(const __m128i src,
+ const __m128i ma[3],
+ const __m128i b[3][2]) {
+ const __m128i ma_sum = Sum3_16(ma);
+ __m128i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
+ const __m128i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
+ return _mm_add_epi16(src, vv);
+}
+
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+ const __m128i filter[2], const int w0,
+ const int w2) {
+ __m128i v[2];
+ const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+ const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
+ const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
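+  // Each _mm_madd_epi16() lane computes w0 * filter[0] + w2 * filter[1] on
+  // the interleaved filter outputs.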
+ v[0] = _mm_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
+
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+ const __m128i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m128i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
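+// Filters two rows with the pass 1 (5x5 box) filter: computes and stores the
+// 565 intermediates for the incoming row pair, reloads the previous pair's
+// intermediates, and writes the self-guided output for both rows.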
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2][2], mas[2], sq[2][4], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[2], ma5[3], b[2][2], sr[2], p[2];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ StoreAligned16(ma565[1] + x, ma[1]);
+ Sum565W(bs, b[1]);
+ StoreAligned32U32(b565[1] + x, b[1]);
+ sr[0] = LoadAligned16(src + x);
+ sr[1] = LoadAligned16(src + stride + x);
+ const __m128i sr0_lo = _mm_unpacklo_epi8(sr[0], _mm_setzero_si128());
+ const __m128i sr1_lo = _mm_unpacklo_epi8(sr[1], _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+ const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned16(ma565[1] + x + 8, ma[1]);
+ Sum565W(bs + 1, b[1]);
+ StoreAligned32U32(b565[1] + x + 8, b[1]);
+ const __m128i sr0_hi = _mm_unpackhi_epi8(sr[0], _mm_setzero_si128());
+ const __m128i sr1_hi = _mm_unpackhi_epi8(sr[1], _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
+ const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+ StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint8_t* const dst) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess5LastRowLo(s[0], scale, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[2], ma5[3], b[2][2];
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+ sq, mas, bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ Sum565W(bs, b[1]);
+ ma[0] = LoadAligned16(ma565);
+ LoadAligned32U32(b565, b[0]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
+
+ ma[1] = Sum565Hi(ma5);
+ Sum565W(bs + 1, b[1]);
+ ma[0] = LoadAligned16(ma565 + 8);
+ LoadAligned32U32(b565 + 8, b[0]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint8_t* const dst) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass2 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass2 - width);
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma[3], b[3][2], ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma343[0] + x);
+ ma[1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[0]);
+ LoadAligned32U32(b444[0] + x, b[1]);
+ const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+ Store343_444Hi(ma3, bs + 1, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1]);
+ const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
+ } while (x < width);
+}
+
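+// Runs both SGR passes on two rows at a time and blends the two filter
+// outputs with the source using the w0 and w2 multipliers.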
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+  ma5[1] = _mm_setzero_si128();  // Quiets -Wmaybe-uninitialized with gcc.
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+ Prepare3_8<0>(ma3[0], ma3x[0]);
+ Prepare3_8<0>(ma3[1], ma3x[1]);
+ Prepare3_8<0>(ma5, ma5x);
+ Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565Lo(ma5x);
+ StoreAligned16(ma565[1] + x, ma[0][1]);
+ Sum565W(b5, b[0][1]);
+ StoreAligned32U32(b565[1] + x, b[0][1]);
+ const __m128i sr0 = LoadAligned16(src + x);
+ const __m128i sr1 = LoadAligned16(src + stride + x);
+ const __m128i sr0_lo = _mm_unpacklo_epi8(sr0, _mm_setzero_si128());
+ const __m128i sr1_lo = _mm_unpacklo_epi8(sr1, _mm_setzero_si128());
+ ma[0][0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x);
+ ma[1][1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[1][0]);
+ LoadAligned32U32(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ma[2][0] = LoadAligned16(ma343[1] + x);
+ LoadAligned32U32(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+ Store343_444Hi(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+ b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Hi(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ ma[0][1] = Sum565Hi(ma5x);
+ StoreAligned16(ma565[1] + x + 8, ma[0][1]);
+ Sum565W(b5 + 1, b[0][1]);
+ StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+ const __m128i sr0_hi = _mm_unpackhi_epi8(sr0, _mm_setzero_si128());
+ const __m128i sr1_hi = _mm_unpackhi_epi8(sr1, _mm_setzero_si128());
+ ma[0][0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1][1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
+ ma[2][0] = LoadAligned16(ma343[1] + x + 8);
+ LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+ const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint8_t* const dst) {
+ __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcessLastRowLo(s[0], scales, sum3, sum5, square_sum3,
+ square_sum5, sq, &ma3[0], &ma5[0], &b3[0],
+ &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma3x[3], ma5x[3], p[2];
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+ square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+ Prepare3_8<0>(ma3, ma3x);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[1] = Sum565Lo(ma5x);
+ Sum565W(b5, b[1]);
+ ma[2] = Sum343Lo(ma3x);
+ Sum343W(b3, b[2]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565 + x);
+ LoadAligned32U32(b565 + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned16(ma343 + x);
+ ma[1] = LoadAligned16(ma444 + x);
+ LoadAligned32U32(b343 + x, b[0]);
+ LoadAligned32U32(b444 + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ ma[1] = Sum565Hi(ma5x);
+ Sum565W(b5 + 1, b[1]);
+ ma[2] = Sum343Hi(ma3x);
+ Sum343W(b3 + 1, b[2]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565 + x + 8);
+ LoadAligned32U32(b565 + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ ma[0] = LoadAligned16(ma343 + x + 8);
+ ma[1] = LoadAligned16(ma444 + x + 8);
+ LoadAligned32U32(b343 + x + 8, b[0]);
+ LoadAligned32U32(b444 + x + 8, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ ma3[0] = ma3[1];
+ ma5[0] = ma5[1];
+ b3[0] = b3[2];
+ b5[0] = b5[2];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint8_t* src,
+ const ptrdiff_t stride, const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444[0], ma565[0],
+ b343, b444[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
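+  // Each iteration of this loop filters two rows. The sum*/ma*/b* arrays are
+  // rings of row pointers, rotated by two rows after each pass.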
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+ square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+ b444[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src, const ptrdiff_t stride,
+ const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum5[1], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+ sum_width, scale, w0, sum5, square_sum5, ma565[0],
+ b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src, const ptrdiff_t stride,
+ const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint8_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 16, up to 15 extra pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output because
+// those pixels are not part of the visible frame.
+void SelfGuidedFilter_SSE4_1(
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* top = static_cast<const uint8_t*>(top_border);
+ const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+ static_cast<void>(WienerFilter_SSE4_1);
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+ static_cast<void>(SelfGuidedFilter_SSE4_1);
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations, see the defines below for specifics.
+// These functions are not thread-safe.
+void LoopRestorationInit_SSE4_1();
+void LoopRestorationInit10bpp_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
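+    // Average each 2x2 neighborhood: add the two rows, sum adjacent columns
+    // with maddubs(x, 1), then round: (m00 + m01 + m10 + m11 + 2) >> 2.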
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i mask_val_0 = LoadUnaligned16(mask);
+ const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
+ const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
+ const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+ return RightShiftWithRounding_U16(mask_0, 2);
+ }
+ if (subsampling_x == 1) {
+ const __m128i row_vals = LoadUnaligned16(mask);
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+ const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ return RightShiftWithRounding_U16(subsampled_mask, 1);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const __m128i mask_val = LoadLo8(mask);
+ return _mm_cvtepu8_epi16(mask_val);
+}
+
+// Imitate behavior of ARM vtrn1q_u64.
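+// Transpose1_U64(a, b) returns the low 64-bit lanes: { a[0], b[0] }.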
+inline __m128i Transpose1_U64(const __m128i a, const __m128i b) {
+ return _mm_castps_si128(
+ _mm_movelh_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
+}
+
+// Imitate behavior of ARM vtrn2q_u64.
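+// Transpose2_U64(a, b) returns the high 64-bit lanes as { b[1], a[1] }, so
+// callers pass the operands in swapped order relative to vtrn2q_u64.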
+inline __m128i Transpose2_U64(const __m128i a, const __m128i b) {
+ return _mm_castps_si128(
+ _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
+}
+
+// Width can only be 4 when it is subsampled from a block of width 8, hence
+// subsampling_x is always 1 when this function is called.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask4x2(const uint8_t* mask) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const __m128i mask_val_01 = LoadUnaligned16(mask);
+ // Stride is fixed because this is the smallest block size.
+ const __m128i mask_val_23 = LoadUnaligned16(mask + 16);
+ // Transpose rows to add row 0 to row 1, and row 2 to row 3.
+ const __m128i mask_val_02 = Transpose1_U64(mask_val_01, mask_val_23);
+ const __m128i mask_val_13 = Transpose2_U64(mask_val_23, mask_val_01);
+ const __m128i add_0 = _mm_adds_epu8(mask_val_02, mask_val_13);
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+ return RightShiftWithRounding_U16(mask_0, 2);
+ }
+ return GetMask8<subsampling_x, 0>(mask, 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask4x2(const uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ return GetMask4x2<subsampling_x, subsampling_y>(mask);
+ }
+ // When using intra or difference weighted masks, the function doesn't use
+ // subsampling, so |mask_stride| may be 4 or 8.
+ assert(subsampling_y == 0 && subsampling_x == 0);
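+  // Pack the two 4-byte mask rows into the low 8 bytes and widen to 16 bits.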
+ const __m128i mask_val_0 = Load4(mask);
+ const __m128i mask_val_1 = Load4(mask + mask_stride);
+ return _mm_cvtepu8_epi16(
+ _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
+}
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+// This function returns a 16-bit packed mask to fit in _mm_madd_epi16.
+// 16 bits is also the narrowest packing hadd supports, but without
+// subsampling an unfortunate widening conversion is required.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t stride) {
+ if (subsampling_x == 1) {
+ const __m128i row_vals = LoadUnaligned16(mask);
+
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+ const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+
+ if (subsampling_y == 1) {
+ const __m128i next_row_vals = LoadUnaligned16(mask + stride);
+ const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
+ const __m128i next_mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
+ subsampled_mask = _mm_add_epi16(
+ subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+ }
+ return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const __m128i mask_val = LoadLo8(mask);
+ return _mm_cvtepu8_epi16(mask_val);
+}
+
+inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
+ const int16_t* LIBGAV1_RESTRICT const pred_1,
+ const __m128i pred_mask_0,
+ const __m128i pred_mask_1,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i pred_val_0 = LoadAligned16(pred_0);
+ const __m128i pred_val_1 = LoadAligned16(pred_1);
+ const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
+ const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
+ const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+ const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
+ const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
+ const __m128i compound_pred = _mm_packus_epi32(
+ _mm_srli_epi32(compound_pred_lo, 6), _mm_srli_epi32(compound_pred_hi, 6));
+
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
+ const __m128i result = RightShiftWithRounding_S16(compound_pred, 4);
+ const __m128i res = _mm_packus_epi16(result, result);
+ Store4(dst, res);
+ Store4(dst + dst_stride, _mm_srli_si128(res, 4));
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4x4_SSE4_1(const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT mask,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4xH_SSE4_1(
+ const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const int height,
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
+ assert(subsampling_x == 1);
+ const uint8_t* mask = mask_ptr;
+ constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
+ if (height == 4) {
+ MaskBlending4x4_SSE4_1<subsampling_x, subsampling_y>(pred_0, pred_1, mask,
+ dst, dst_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ int y = 0;
+ do {
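+    // Process eight rows per iteration as four 4x2 blocks; 4xH blocks with
+    // H > 4 have heights that are multiples of 8.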
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+ y += 8;
+ } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const ptrdiff_t /*prediction_stride_1*/,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height, void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dst_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ const ptrdiff_t pred_stride_0 = width;
+ const ptrdiff_t pred_stride_1 = width;
+ if (width == 4) {
+ MaskBlending4xH_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, mask_ptr, height, dst, dst_stride);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ // 64 - mask
+ const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
+ const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
+
+ const __m128i pred_val_0 = LoadAligned16(pred_0 + x);
+ const __m128i pred_val_1 = LoadAligned16(pred_1 + x);
+ const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+ const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
+ const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
+
+ const __m128i res = _mm_packus_epi32(_mm_srli_epi32(compound_pred_lo, 6),
+ _mm_srli_epi32(compound_pred_hi, 6));
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
+ const __m128i result = RightShiftWithRounding_S16(res, 4);
+ StoreLo8(dst + x, _mm_packus_epi16(result, result));
+
+ x += 8;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += pred_stride_0;
+ pred_1 += pred_stride_1;
+ mask += mask_stride << subsampling_y;
+ } while (++y < height);
+}
+
+inline void InterIntraWriteMaskBlendLine8bpp4x2(
+ const uint8_t* LIBGAV1_RESTRICT const pred_0,
+ uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1,
+ const __m128i pred_mask_0, const __m128i pred_mask_1) {
+ const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
+
+ const __m128i pred_val_0 = LoadLo8(pred_0);
+ __m128i pred_val_1 = Load4(pred_1);
+ pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4),
+ pred_val_1);
+ const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
+ // int res = (mask_value * prediction_1[x] +
+ // (64 - mask_value) * prediction_0[x]) >> 6;
+ const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
+ const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
+ const __m128i res = _mm_packus_epi16(result, result);
+
+ Store4(pred_1, res);
+ Store4(pred_1 + pred_stride_1, _mm_srli_si128(res, 4));
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4x4_SSE4_1(
+ const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+ const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
+ const ptrdiff_t mask_stride) {
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const __m128i pred_mask_u16_first =
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ mask += mask_stride << (1 + subsampling_y);
+ const __m128i pred_mask_u16_second =
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ mask += mask_stride << (1 + subsampling_y);
+ __m128i pred_mask_1 =
+ _mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second);
+ __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+ InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+
+ pred_mask_1 = _mm_srli_si128(pred_mask_1, 8);
+ pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+ InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4xH_SSE4_1(
+ const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+ const int height) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ return;
+ }
+ int y = 0;
+ do {
+ InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ pred_0 += 4 << 2;
+ pred_1 += pred_stride_1 << 2;
+ mask += mask_stride << (2 + subsampling_y);
+
+ InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ pred_0 += 4 << 2;
+ pred_1 += pred_stride_1 << 2;
+ mask += mask_stride << (2 + subsampling_y);
+ y += 8;
+ } while (y < height);
+}
+
+// This version returns 8-bit packed values to fit in _mm_maddubs_epi16: when
+// is_inter_intra is true, the prediction values are brought to 8-bit packing
+// as well, so the mask must match that packing.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask8bpp8(const uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t stride) {
+ if (subsampling_x == 1) {
+ const __m128i ret = GetMask8<subsampling_x, subsampling_y>(mask, stride);
+ return _mm_packus_epi16(ret, ret);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+  // Unfortunately there is no 8-bit packed shift operation; otherwise
+  // everything could be returned with 8-bit packing.
+ const __m128i mask_val = LoadLo8(mask);
+ return mask_val;
+}
+
+template <int subsampling_x, int subsampling_y>
+void InterIntraMaskBlend8bpp_SSE4_1(
+ const uint8_t* LIBGAV1_RESTRICT prediction_0,
+ uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+ const int width, const int height) {
+ if (width == 4) {
+ InterIntraMaskBlending8bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
+ prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+ height);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const __m128i pred_mask_1 =
+ GetInterIntraMask8bpp8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ // 64 - mask
+ const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+ const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
+
+ const __m128i pred_val_0 = LoadLo8(prediction_0 + x);
+ const __m128i pred_val_1 = LoadLo8(prediction_1 + x);
+ const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
+ // int res = (mask_value * prediction_1[x] +
+ // (64 - mask_value) * prediction_0[x]) >> 6;
+ const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
+ const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
+ const __m128i res = _mm_packus_epi16(result, result);
+
+ StoreLo8(prediction_1 + x, res);
+
+ x += 8;
+ } while (x < width);
+ prediction_0 += width;
+ prediction_1 += prediction_stride_1;
+ mask += mask_stride << subsampling_y;
+ } while (++y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444)
+ dsp->mask_blend[0][0] = MaskBlend_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422)
+ dsp->mask_blend[1][0] = MaskBlend_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420)
+ dsp->mask_blend[2][0] = MaskBlend_SSE4_1<1, 1>;
+#endif
+  // In 8-bit, the is_inter_intra index of mask_blend[][] is replaced by the
+  // inter_intra_mask_blend_8bpp[] table.
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444)
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422)
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420)
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4_1<1, 1>;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kMax10bppSample = (1 << 10) - 1;
+constexpr int kMaskInverse = 64;
+constexpr int kRoundBitsMaskBlend = 4;
+
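+// Rounding right shift where the caller supplies the rounding bias as a
+// pre-broadcast vector, so it can be hoisted out of the pixel loops.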
+inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
+ const __m128i shift) {
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask4x2(const uint8_t* mask) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const __m128i mask_row_01 = LoadUnaligned16(mask);
+ const __m128i mask_row_23 = LoadUnaligned16(mask + 16);
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01);
+ const __m128i mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8));
+ const __m128i mask_val_2 = _mm_cvtepu8_epi16(mask_row_23);
+ const __m128i mask_val_3 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_23, 8));
+ const __m128i subsampled_mask_02 = _mm_hadd_epi16(mask_val_0, mask_val_2);
+ const __m128i subsampled_mask_13 = _mm_hadd_epi16(mask_val_1, mask_val_3);
+ const __m128i subsampled_mask =
+ _mm_add_epi16(subsampled_mask_02, subsampled_mask_13);
+ return RightShiftWithRounding_U16(subsampled_mask, 2);
+ }
+ if (subsampling_x == 1) {
+ const __m128i mask_row_01 = LoadUnaligned16(mask);
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01);
+ const __m128i mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8));
+ const __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ return RightShiftWithRounding_U16(subsampled_mask, 1);
+ }
+ return _mm_cvtepu8_epi16(LoadLo8(mask));
+}
+
+inline void WriteMaskBlendLine10bpp4x2_SSE4_1(
+ const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1,
+ const __m128i& pred_mask_0, const __m128i& pred_mask_1,
+ const __m128i& offset, const __m128i& max, const __m128i& shift4,
+ uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
+ const __m128i pred_val_0 = LoadUnaligned16(pred_0);
+ const __m128i pred_val_1 = LoadHi8(LoadLo8(pred_1), pred_1 + pred_stride_1);
+
+ // int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6;
+ const __m128i compound_pred_lo_0 = _mm_mullo_epi16(pred_val_0, pred_mask_0);
+ const __m128i compound_pred_hi_0 = _mm_mulhi_epu16(pred_val_0, pred_mask_0);
+ const __m128i compound_pred_lo_1 = _mm_mullo_epi16(pred_val_1, pred_mask_1);
+ const __m128i compound_pred_hi_1 = _mm_mulhi_epu16(pred_val_1, pred_mask_1);
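+  // Interleave the low and high 16-bit halves of each multiply to reassemble
+  // the full 32-bit products mask * pred.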
+ const __m128i pack0_lo =
+ _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0);
+ const __m128i pack0_hi =
+ _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0);
+ const __m128i pack1_lo =
+ _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1);
+ const __m128i pack1_hi =
+ _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1);
+ const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo);
+ const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi);
+ // res -= (bitdepth == 8) ? 0 : kCompoundOffset;
+ const __m128i sub_0 =
+ _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset);
+ const __m128i sub_1 =
+ _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset);
+
+  // dst[x] = static_cast<Pixel>(
+  //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+  //           (1 << kBitdepth10) - 1));
+ const __m128i shift_0 =
+ RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4);
+ const __m128i shift_1 =
+ RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4);
+ const __m128i result = _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max);
+ StoreLo8(dst, result);
+ StoreHi8(dst + dst_stride, result);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT mask,
+ const ptrdiff_t mask_stride,
+ uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+ const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+ const __m128i max = _mm_set1_epi16(kMax10bppSample);
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
+ pred_mask_1, offset, max, shift4, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
+ pred_mask_1, offset, max, shift4, dst,
+ dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp4xH_SSE4_1(
+ const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+ const int height, uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ MaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const uint8_t pred0_stride2 = 4 << 1;
+ const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
+ const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
+ const ptrdiff_t dst_stride2 = dst_stride << 1;
+ const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+ const __m128i max = _mm_set1_epi16(kMax10bppSample);
+ const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+ int y = height;
+ do {
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, offset, max,
+ shift4, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, offset, max,
+ shift4, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, offset, max,
+ shift4, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, offset, max,
+ shift4, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+ y -= 8;
+ } while (y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp_SSE4_1(
+ const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+ const int width, const int height, void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ const ptrdiff_t pred_stride_0 = width;
+ const ptrdiff_t pred_stride_1 = prediction_stride_1;
+ if (width == 4) {
+ MaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst,
+ dst_stride);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
+ const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+ const __m128i max = _mm_set1_epi16(kMax10bppSample);
+ const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
+ const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
+ // 64 - mask
+ const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+ const __m128i compound_pred_lo_0 =
+ _mm_mullo_epi16(pred_val_0, pred_mask_0);
+ const __m128i compound_pred_hi_0 =
+ _mm_mulhi_epu16(pred_val_0, pred_mask_0);
+ const __m128i compound_pred_lo_1 =
+ _mm_mullo_epi16(pred_val_1, pred_mask_1);
+ const __m128i compound_pred_hi_1 =
+ _mm_mulhi_epu16(pred_val_1, pred_mask_1);
+ const __m128i pack0_lo =
+ _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0);
+ const __m128i pack0_hi =
+ _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0);
+ const __m128i pack1_lo =
+ _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1);
+ const __m128i pack1_hi =
+ _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1);
+ const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo);
+ const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi);
+
+ const __m128i sub_0 =
+ _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset);
+ const __m128i sub_1 =
+ _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset);
+ const __m128i shift_0 =
+ RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4);
+ const __m128i shift_1 =
+ RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4);
+ const __m128i result =
+ _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max);
+ StoreUnaligned16(dst + x, result);
+ x += 8;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += pred_stride_0;
+ pred_1 += pred_stride_1;
+ mask += mask_stride_ss;
+ } while (--y != 0);
+}
+
+inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(
+ const uint16_t* LIBGAV1_RESTRICT prediction_0,
+ const uint16_t* LIBGAV1_RESTRICT prediction_1,
+ const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0,
+ const __m128i& pred_mask_1, const __m128i& shift6,
+ uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
+ const __m128i pred_val_0 = LoadUnaligned16(prediction_0);
+ const __m128i pred_val_1 =
+ LoadHi8(LoadLo8(prediction_1), prediction_1 + pred_stride_1);
+
+ const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0);
+ const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0);
+ const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+ const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+ const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0);
+ const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1);
+ const __m128i shift_0 =
+ RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6);
+ const __m128i shift_1 =
+ RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6);
+ const __m128i res = _mm_packus_epi32(shift_0, shift_1);
+ StoreLo8(dst, res);
+ StoreHi8(dst + dst_stride, res);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
+ const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT mask, const ptrdiff_t mask_stride,
+ uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+ __m128i pred_mask_0 =
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, shift6,
+ dst, dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 =
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, shift6,
+ dst, dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
+ const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+ const int height, uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ InterIntraMaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+ const uint8_t pred0_stride2 = 4 << 1;
+ const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
+ const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
+ const ptrdiff_t dst_stride2 = dst_stride << 1;
+ int y = height;
+ do {
+ __m128i pred_mask_0 =
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1,
+ shift6, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1,
+ shift6, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1,
+ shift6, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1,
+ shift6, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+ y -= 8;
+ } while (y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp_SSE4_1(
+ const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+ const int width, const int height, void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ const ptrdiff_t pred_stride_0 = width;
+ const ptrdiff_t pred_stride_1 = prediction_stride_1;
+ if (width == 4) {
+ InterIntraMaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst,
+ dst_stride);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+ const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
+ const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
+ // 64 - mask
+ const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0);
+ const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0);
+ const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+ const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+ const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0);
+ const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1);
+ const __m128i shift_0 =
+ RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6);
+ const __m128i shift_1 =
+ RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6);
+ StoreUnaligned16(dst + x, _mm_packus_epi32(shift_0, shift_1));
+ x += 8;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += pred_stride_0;
+ pred_1 += pred_stride_1;
+ mask += mask_stride_ss;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend444)
+ dsp->mask_blend[0][0] = MaskBlend10bpp_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend422)
+ dsp->mask_blend[1][0] = MaskBlend10bpp_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend420)
+ dsp->mask_blend[2][0] = MaskBlend10bpp_SSE4_1<1, 1>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra444)
+ dsp->mask_blend[0][1] = InterIntraMaskBlend10bpp_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra422)
+ dsp->mask_blend[1][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra420)
+ dsp->mask_blend[2][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 1>;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void MaskBlendInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void MaskBlendInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend. This function is not thread-safe.
+void MaskBlendInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend444
+#define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend422
+#define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend420
+#define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend444
+#define LIBGAV1_Dsp10bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend422
+#define LIBGAV1_Dsp10bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend420
+#define LIBGAV1_Dsp10bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra444
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra422
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra420
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
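+// Each lane of |reference_offset| selects one 16-bit entry of
+// |division_table|: the offset is doubled, each byte is duplicated, and
+// {0, 1} is added so the shuffle fetches both bytes of the entry.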
+inline __m128i LoadDivision(const __m128i division_table,
+ const __m128i reference_offset) {
+ const __m128i kOne = _mm_set1_epi16(0x0100);
+ const __m128i t = _mm_add_epi8(reference_offset, reference_offset);
+ const __m128i tt = _mm_unpacklo_epi8(t, t);
+ const __m128i idx = _mm_add_epi8(tt, kOne);
+ return _mm_shuffle_epi8(division_table, idx);
+}
+
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+ const int numerator) {
+ const __m128i m0 = _mm_madd_epi16(mv, denominator);
+ const __m128i m = _mm_mullo_epi32(m0, _mm_set1_epi32(numerator));
+ // Add the sign (0 or -1) to round towards zero.
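+  // Combined with the (1 << 13) bias below, the final arithmetic shift by 14
+  // becomes a round-to-nearest division by 2^14.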
+ const __m128i sign = _mm_srai_epi32(m, 31);
+ const __m128i add_sign = _mm_add_epi32(m, sign);
+ const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+ return _mm_srai_epi32(sum, 14);
+}
+
+inline __m128i MvProjectionClip(const __m128i mv, const __m128i denominator,
+ const int numerator) {
+ const __m128i mv0 = _mm_unpacklo_epi16(mv, _mm_setzero_si128());
+ const __m128i mv1 = _mm_unpackhi_epi16(mv, _mm_setzero_si128());
+ const __m128i denorm0 = _mm_unpacklo_epi16(denominator, _mm_setzero_si128());
+ const __m128i denorm1 = _mm_unpackhi_epi16(denominator, _mm_setzero_si128());
+ const __m128i s0 = MvProjection(mv0, denorm0, numerator);
+ const __m128i s1 = MvProjection(mv1, denorm1, numerator);
+ const __m128i projection = _mm_packs_epi32(s0, s1);
+ const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+ const __m128i projection_mv_clamp_negative =
+ _mm_set1_epi16(-kProjectionMvClamp);
+ const __m128i clamp = _mm_min_epi16(projection, projection_mv_clamp);
+ return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
+inline __m128i Project_SSE4_1(const __m128i delta, const __m128i dst_sign) {
+ // Add 63 to negative delta so that it shifts towards zero.
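+  // delta_sign is 0 or 0xffff per lane; a logical right shift by 10 turns it
+  // into 0 or 63, the bias needed to truncate the >> 6 towards zero.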
+ const __m128i delta_sign = _mm_srai_epi16(delta, 15);
+ const __m128i delta_sign_63 = _mm_srli_epi16(delta_sign, 10);
+ const __m128i delta_adjust = _mm_add_epi16(delta, delta_sign_63);
+ const __m128i offset0 = _mm_srai_epi16(delta_adjust, 6);
+ const __m128i offset1 = _mm_xor_si128(offset0, dst_sign);
+ return _mm_sub_epi16(offset1, dst_sign);
+}
+
+inline void GetPosition(
+ const __m128i division_table, const MotionVector* const mv,
+ const int numerator, const int x8_start, const int x8_end, const int x8,
+ const __m128i& r_offsets, const __m128i& source_reference_type8,
+ const __m128i& skip_r, const __m128i& y8_floor8, const __m128i& y8_ceiling8,
+ const __m128i& d_sign, const int delta, __m128i* const r,
+ __m128i* const position_xy, int64_t* const skip_64, __m128i mvs[2]) {
+ const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
+ *r = _mm_shuffle_epi8(r_offsets, source_reference_type8);
+ const __m128i denorm = LoadDivision(division_table, source_reference_type8);
+ __m128i projection_mv[2];
+ mvs[0] = LoadUnaligned16(mv_int + 0);
+ mvs[1] = LoadUnaligned16(mv_int + 4);
+  // Deinterleave the x and y components: the shuffle gathers the y components
+  // into the low half of each vector and the x components into the high half.
+ const __m128i kShuffle =
+ _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+ const __m128i mv0 = _mm_shuffle_epi8(mvs[0], kShuffle);
+ const __m128i mv1 = _mm_shuffle_epi8(mvs[1], kShuffle);
+ const __m128i mv_y = _mm_unpacklo_epi64(mv0, mv1);
+ const __m128i mv_x = _mm_unpackhi_epi64(mv0, mv1);
+ // numerator could be 0.
+ projection_mv[0] = MvProjectionClip(mv_y, denorm, numerator);
+ projection_mv[1] = MvProjectionClip(mv_x, denorm, numerator);
+ // Do not update the motion vector if the block position is not valid or
+ // if position_x8 is outside the current range of x8_start and x8_end.
+ // Note that position_y8 will always be within the range of y8_start and
+ // y8_end.
+ // After subtracting the base, valid projections are within 8-bit.
+ const __m128i position_y = Project_SSE4_1(projection_mv[0], d_sign);
+ const __m128i position_x = Project_SSE4_1(projection_mv[1], d_sign);
+ const __m128i positions = _mm_packs_epi16(position_x, position_y);
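+ // |positions| holds the eight x offsets in its low half and the eight y
+ // offsets in its high half, saturated to int8. Adding 0..7 to the x half
+ // turns per-lane deltas into offsets relative to the block's left column.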
+ const __m128i k01234567 =
+ _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
+ *position_xy = _mm_add_epi8(positions, k01234567);
+ const int x8_floor = std::max(
+ x8_start - x8, delta - kProjectionMvMaxHorizontalOffset); // [-8, 8]
+ const int x8_ceiling =
+ std::min(x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset) -
+ 1; // [-1, 15]
+ const __m128i x8_floor8 = _mm_set1_epi8(x8_floor);
+ const __m128i x8_ceiling8 = _mm_set1_epi8(x8_ceiling);
+ const __m128i floor_xy = _mm_unpacklo_epi64(x8_floor8, y8_floor8);
+ const __m128i ceiling_xy = _mm_unpacklo_epi64(x8_ceiling8, y8_ceiling8);
+ const __m128i underflow = _mm_cmplt_epi8(*position_xy, floor_xy);
+ const __m128i overflow = _mm_cmpgt_epi8(*position_xy, ceiling_xy);
+ const __m128i out = _mm_or_si128(underflow, overflow);
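+ // |out| has the x range violations in its low half and the y violations in
+ // its high half; fold both halves and the reference skip mask into a single
+ // per-lane skip byte.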
+ const __m128i skip_low = _mm_or_si128(skip_r, out);
+ const __m128i skip = _mm_or_si128(skip_low, _mm_srli_si128(out, 8));
+ StoreLo8(skip_64, skip);
+}
+
+template <int idx>
+inline void Store(const __m128i position, const __m128i reference_offset,
+ const __m128i mv, int8_t* dst_reference_offset,
+ MotionVector* dst_mv) {
+ const ptrdiff_t offset =
+ static_cast<int16_t>(_mm_extract_epi16(position, idx));
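+ // Lane 0 can be moved out with the cheaper _mm_cvtsi128_si32; the other
+ // lanes need _mm_extract_epi32.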
+ if ((idx & 3) == 0) {
+ dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_cvtsi128_si32(mv));
+ } else {
+ dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_extract_epi32(mv, idx & 3));
+ }
+ dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx);
+}
+
+template <int idx>
+inline void CheckStore(const int8_t* skips, const __m128i position,
+ const __m128i reference_offset, const __m128i mv,
+ int8_t* dst_reference_offset, MotionVector* dst_mv) {
+ if (skips[idx] == 0) {
+ Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
+ }
+}
+
+// 7.9.2.
+void MotionFieldProjectionKernel_SSE4_1(
+ const ReferenceInfo& reference_info,
+ const int reference_to_current_with_sign, const int dst_sign,
+ const int y8_start, const int y8_end, const int x8_start, const int x8_end,
+ TemporalMotionField* const motion_field) {
+ const ptrdiff_t stride = motion_field->mv.columns();
+ // The column range has to be widened by kProjectionMvMaxHorizontalOffset on
+ // each side, since coordinates outside [x8_start, x8_end) can still project
+ // to a position_x8 inside it.
+ const int adjusted_x8_start =
+ std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+ const int adjusted_x8_end = std::min(
+ x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+ const int adjusted_x8_end8 = adjusted_x8_end & ~7;
+ const int leftover = adjusted_x8_end - adjusted_x8_end8;
+ const int8_t* const reference_offsets =
+ reference_info.relative_distance_to.data();
+ const bool* const skip_references = reference_info.skip_references.data();
+ const int16_t* const projection_divisions =
+ reference_info.projection_divisions.data();
+ const ReferenceFrameType* source_reference_types =
+ &reference_info.motion_field_reference_frame[y8_start][0];
+ const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+ int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+ MotionVector* dst_mv = motion_field->mv[y8_start];
+ const __m128i d_sign = _mm_set1_epi16(dst_sign);
+
+ static_assert(sizeof(int8_t) == sizeof(bool), "");
+ static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
+ static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
+ assert(dst_sign == 0 || dst_sign == -1);
+ assert(stride == motion_field->reference_offset.columns());
+ assert((y8_start & 7) == 0);
+ assert((adjusted_x8_start & 7) == 0);
+ // The final position calculation is represented with int16_t. Valid
+ // position_y8 from its base is at most 7. After considering the horizontal
+ // offset, which is at most |stride - 1|, we have the following assertion,
+ // which means this optimization works for frame widths up to 32K (each
+ // position is an 8x8 block).
+ assert(8 * stride <= 32768);
+ const __m128i skip_reference = LoadLo8(skip_references);
+ const __m128i r_offsets = LoadLo8(reference_offsets);
+ const __m128i division_table = LoadUnaligned16(projection_divisions);
+
+ int y8 = y8_start;
+ do {
+ const int y8_floor = (y8 & ~7) - y8; // [-7, 0]
+ const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8) - 1; // [0, 7]
+ const __m128i y8_floor8 = _mm_set1_epi8(y8_floor);
+ const __m128i y8_ceiling8 = _mm_set1_epi8(y8_ceiling);
+ int x8;
+
+ for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
+ const __m128i source_reference_type8 =
+ LoadLo8(source_reference_types + x8);
+ const __m128i skip_r =
+ _mm_shuffle_epi8(skip_reference, source_reference_type8);
+ int64_t early_skip;
+ StoreLo8(&early_skip, skip_r);
+ // Early termination #1 if all are skips. Chance is typically ~30-40%.
+ if (early_skip == -1) continue;
+ int64_t skip_64;
+ __m128i r, position_xy, mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
+ x8_end, x8, r_offsets, source_reference_type8, skip_r,
+ y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_xy, &skip_64,
+ mvs);
+ // Early termination #2 if all are skips.
+ // Chance is typically ~15-25% after Early termination #1.
+ if (skip_64 == -1) continue;
+ const __m128i p_y = _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
+ const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
+ const __m128i p_y_offset = _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
+ const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
+ const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
+ if (skip_64 == 0) {
+ // Store all. Chance is typically ~70-85% after Early termination #2.
+ Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // Chance is typically ~15-30% after Early termination #2.
+ // The compiler is smart enough to not create the local buffer skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ }
+ }
+
+ // The following leftover processing cannot be moved out of the do...while
+ // loop. Doing so may change the order in which results are stored to the
+ // same position.
+ if (leftover > 0) {
+ // Use SIMD only when leftover is at least 4, and there are at least 8
+ // elements in a row.
+ if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
+ // Process the last 8 elements to avoid loading invalid memory. Some
+ // elements may have been processed in the above loop, which is OK.
+ const int delta = 8 - leftover;
+ x8 = adjusted_x8_end - 8;
+ const __m128i source_reference_type8 =
+ LoadLo8(source_reference_types + x8);
+ const __m128i skip_r =
+ _mm_shuffle_epi8(skip_reference, source_reference_type8);
+ int64_t early_skip;
+ StoreLo8(&early_skip, skip_r);
+ // Early termination #1 if all are skips.
+ if (early_skip != -1) {
+ int64_t skip_64;
+ __m128i r, position_xy, mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign,
+ x8_start, x8_end, x8, r_offsets, source_reference_type8,
+ skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
+ &position_xy, &skip_64, mvs);
+ // Early termination #2 if all are skips.
+ if (skip_64 != -1) {
+ const __m128i p_y =
+ _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
+ const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
+ const __m128i p_y_offset =
+ _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
+ const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
+ const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
+ // Store up to 7 elements since leftover is at most 7.
+ if (skip_64 == 0) {
+ // Store all.
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // The compiler is smart enough to not create the local buffer
+ // skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ }
+ }
+ }
+ } else {
+ for (; x8 < adjusted_x8_end; ++x8) {
+ const int source_reference_type = source_reference_types[x8];
+ if (skip_references[source_reference_type]) continue;
+ MotionVector projection_mv;
+ // reference_to_current_with_sign could be 0.
+ GetMvProjection(mv[x8], reference_to_current_with_sign,
+ projection_divisions[source_reference_type],
+ &projection_mv);
+ // Do not update the motion vector if the block position is not valid
+ // or if position_x8 is outside the current range of x8_start and
+ // x8_end. Note that position_y8 will always be within the range of
+ // y8_start and y8_end.
+ const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+ if (position_y8 < y8_floor || position_y8 > y8_ceiling) continue;
+ const int x8_base = x8 & ~7;
+ const int x8_floor =
+ std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+ const int x8_ceiling =
+ std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+ const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+ if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+ dst_mv[position_y8 * stride + position_x8] = mv[x8];
+ dst_reference_offset[position_y8 * stride + position_x8] =
+ reference_offsets[source_reference_type];
+ }
+ }
+ }
+
+ source_reference_types += stride;
+ mv += stride;
+ dst_reference_offset += stride;
+ dst_mv += stride;
+ } while (++y8 < y8_end);
+}
+
+} // namespace
+
+void MotionFieldProjectionInit_SSE4_1() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1;
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void MotionFieldProjectionInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel
+#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kProjectionMvDivisionLookup_32bit[kMaxFrameDistance + 1] = {
+ 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
+ 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780,
+ 744, 712, 682, 655, 630, 606, 585, 564, 546, 528};
+
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+ const __m128i numerator) {
+ const __m128i m0 = _mm_madd_epi16(mv, denominator);
+ const __m128i m = _mm_mullo_epi32(m0, numerator);
+ // Add the sign (0 or -1) to round towards zero.
+ const __m128i sign = _mm_srai_epi32(m, 31);
+ const __m128i add_sign = _mm_add_epi32(m, sign);
+ const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+ return _mm_srai_epi32(sum, 14);
+}
+
+inline __m128i MvProjectionClip(const __m128i mvs[2],
+ const __m128i denominators[2],
+ const __m128i numerator) {
+ const __m128i s0 = MvProjection(mvs[0], denominators[0], numerator);
+ const __m128i s1 = MvProjection(mvs[1], denominators[1], numerator);
+ const __m128i mv = _mm_packs_epi32(s0, s1);
+ const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+ const __m128i projection_mv_clamp_negative =
+ _mm_set1_epi16(-kProjectionMvClamp);
+ const __m128i clamp = _mm_min_epi16(mv, projection_mv_clamp);
+ return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
+inline __m128i MvProjectionCompoundClip(
+ const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+ const int8_t temporal_reference_offsets[2],
+ const int reference_offsets[2]) {
+ const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+ const __m128i temporal_mv = LoadLo8(tmvs);
+ const __m128i temporal_mv_0 = _mm_cvtepu16_epi32(temporal_mv);
+ __m128i mvs[2], denominators[2];
+ mvs[0] = _mm_unpacklo_epi64(temporal_mv_0, temporal_mv_0);
+ mvs[1] = _mm_unpackhi_epi64(temporal_mv_0, temporal_mv_0);
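+ // Each temporal MV is duplicated across a full register, and |numerator|
+ // carries reference_offsets[0] in its low half and reference_offsets[1] in
+ // its high half, so one register projects each MV onto both compound
+ // references at once.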
+ denominators[0] = _mm_set1_epi32(
+ kProjectionMvDivisionLookup[temporal_reference_offsets[0]]);
+ denominators[1] = _mm_set1_epi32(
+ kProjectionMvDivisionLookup[temporal_reference_offsets[1]]);
+ const __m128i offsets = LoadLo8(reference_offsets);
+ const __m128i numerator = _mm_unpacklo_epi32(offsets, offsets);
+ return MvProjectionClip(mvs, denominators, numerator);
+}
+
+inline __m128i MvProjectionSingleClip(
+ const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+ const int reference_offset) {
+ const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
+ const __m128i temporal_mv = LoadAligned16(tmvs);
+ __m128i lookup = _mm_cvtsi32_si128(
+ kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[0]]);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[1]],
+ 1);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[2]],
+ 2);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[3]],
+ 3);
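+ // |lookup| holds one 32-bit division entry per temporal MV. Duplicating
+ // each entry below scales both components of an MV by the same reciprocal
+ // of the temporal distance.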
+ __m128i mvs[2], denominators[2];
+ mvs[0] = _mm_unpacklo_epi16(temporal_mv, _mm_setzero_si128());
+ mvs[1] = _mm_unpackhi_epi16(temporal_mv, _mm_setzero_si128());
+ denominators[0] = _mm_unpacklo_epi32(lookup, lookup);
+ denominators[1] = _mm_unpackhi_epi32(lookup, lookup);
+ const __m128i numerator = _mm_set1_epi32(reference_offset);
+ return MvProjectionClip(mvs, denominators, numerator);
+}
+
+inline void LowPrecision(const __m128i mv, void* const candidate_mvs) {
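+ // Rounds each component towards zero to an even value: negative values get
+ // +1 (mv - (-1)) before the low bit is masked off.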
+ const __m128i kRoundDownMask = _mm_set1_epi16(~1);
+ const __m128i sign = _mm_srai_epi16(mv, 15);
+ const __m128i sub_sign = _mm_sub_epi16(mv, sign);
+ const __m128i d = _mm_and_si128(sub_sign, kRoundDownMask);
+ StoreAligned16(candidate_mvs, d);
+}
+
+inline void ForceInteger(const __m128i mv, void* const candidate_mvs) {
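+ // Rounds each component to a multiple of 8, i.e. an integer MV (units are
+ // 1/8 pel): +3 for positive values, +4 for negative values, then the three
+ // fractional bits are masked off.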
+ const __m128i kRoundDownMask = _mm_set1_epi16(~7);
+ const __m128i sign = _mm_srai_epi16(mv, 15);
+ const __m128i mv1 = _mm_add_epi16(mv, _mm_set1_epi16(3));
+ const __m128i mv2 = _mm_sub_epi16(mv1, sign);
+ const __m128i mv3 = _mm_and_si128(mv2, kRoundDownMask);
+ StoreAligned16(candidate_mvs, mv3);
+}
+
+void MvProjectionCompoundLowPrecision_SSE4_1(
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+ // The |reference_offsets| non-zero check is usually true, so it is skipped
+ // here. A local copy of |reference_offsets| helps the compiler keep the
+ // values in registers.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // Since two MVs are processed per iteration, one element beyond |count| may
+ // be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ LowPrecision(mv, candidate_mvs + i);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionCompoundForceInteger_SSE4_1(
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+ // The |reference_offsets| non-zero check is usually true, so it is skipped
+ // here. A local copy of |reference_offsets| helps the compiler keep the
+ // values in registers.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // Since two MVs are processed per iteration, one element beyond |count| may
+ // be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ ForceInteger(mv, candidate_mvs + i);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionCompoundHighPrecision_SSE4_1(
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+ // The |reference_offsets| non-zero check is usually true, so it is skipped
+ // here. A local copy of |reference_offsets| helps the compiler keep the
+ // values in registers.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // Since two MVs are processed per iteration, one element beyond |count| may
+ // be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ StoreAligned16(candidate_mvs + i, mv);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionSingleLowPrecision_SSE4_1(
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offset, const int count,
+ MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+ // Since four MVs are processed per iteration, up to three elements beyond
+ // |count| may be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ LowPrecision(mv, candidate_mvs + i);
+ i += 4;
+ } while (i < count);
+}
+
+void MvProjectionSingleForceInteger_SSE4_1(
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offset, const int count,
+ MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+ // Since four MVs are processed per iteration, up to three elements beyond
+ // |count| may be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ ForceInteger(mv, candidate_mvs + i);
+ i += 4;
+ } while (i < count);
+}
+
+void MvProjectionSingleHighPrecision_SSE4_1(
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offset, const int count,
+ MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+ // Since four MVs are processed per iteration, up to three elements beyond
+ // |count| may be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ StoreAligned16(candidate_mvs + i, mv);
+ i += 4;
+ } while (i < count);
+}
+
+} // namespace
+
+void MotionVectorSearchInit_SSE4_1() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void MotionVectorSearchInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionVectorSearch
+#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+inline void OverlapBlendFromLeft2xH_SSE4_1(
+ uint8_t* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t prediction_stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 2;
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+ const __m128i mask_val = _mm_shufflelo_epi16(Load4(kObmcMask), 0);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
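+ // With mask and 64 - mask interleaved, a single _mm_maddubs_epi16 computes
+ // pred * mask + obmc_pred * (64 - mask) for each pixel.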
+ int y = height;
+ do {
+ const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
+ const __m128i obmc_pred_val = Load4(obmc_pred);
+
+ const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+ const __m128i packed_result = _mm_packus_epi16(result, result);
+ Store2(pred, packed_result);
+ pred += prediction_stride;
+ const int16_t second_row_result = _mm_extract_epi16(packed_result, 1);
+ memcpy(pred, &second_row_result, sizeof(second_row_result));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft4xH_SSE4_1(
+ uint8_t* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t prediction_stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 4;
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+ const __m128i mask_val = Load4(kObmcMask + 2);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ // Duplicate first half of vector.
+ const __m128i masks =
+ _mm_shuffle_epi32(_mm_unpacklo_epi8(mask_val, obmc_mask_val), 0x44);
+ int y = height;
+ do {
+ const __m128i pred_val0 = Load4(pred);
+ pred += prediction_stride;
+
+ // Place the second row of each source in the second four bytes.
+ const __m128i pred_val =
+ _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+ const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+ const __m128i packed_result = _mm_packus_epi16(result, result);
+ Store4(pred - prediction_stride, packed_result);
+ const int second_row_result = _mm_extract_epi32(packed_result, 1);
+ memcpy(pred, &second_row_result, sizeof(second_row_result));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft8xH_SSE4_1(
+ uint8_t* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t prediction_stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 8;
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const __m128i mask_val = LoadLo8(kObmcMask + 6);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ int y = height;
+ do {
+ const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+
+ const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result_lo =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
+
+ const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+ const __m128i result_hi =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
+
+ const __m128i result = _mm_packus_epi16(result_lo, result_hi);
+ StoreLo8(pred, result);
+ pred += prediction_stride;
+ StoreHi8(pred, result);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+void OverlapBlendFromLeft_SSE4_1(
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint8_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+ assert(width >= 2);
+ assert(height >= 4);
+
+ if (width == 2) {
+ OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
+ return;
+ }
+ if (width == 8) {
+ OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const uint8_t* mask = kObmcMask + width - 2;
+ int x = 0;
+ do {
+ pred = static_cast<uint8_t*>(prediction) + x;
+ obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
+ const __m128i mask_val = LoadUnaligned16(mask + x);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks_lo = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ const __m128i masks_hi = _mm_unpackhi_epi8(mask_val, obmc_mask_val);
+
+ int y = 0;
+ do {
+ const __m128i pred_val = LoadUnaligned16(pred);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+ const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result_lo =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks_lo), 6);
+ const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+ const __m128i result_hi =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks_hi), 6);
+ StoreUnaligned16(pred, _mm_packus_epi16(result_lo, result_hi));
+
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y < height);
+ x += 16;
+ } while (x < width);
+}
+
+inline void OverlapBlendFromTop4xH_SSE4_1(
+ uint8_t* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t prediction_stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 4;
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+ const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
+
+ const uint8_t* mask = kObmcMask + height - 2;
+ const int compute_height = height - (height >> 2);
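+ // Beyond 3/4 of the block height the mask is 64, which leaves the
+ // prediction unchanged, so only the first |compute_height| rows are
+ // blended.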
+ int y = 0;
+ do {
+ // First mask in the first half, second mask in the second half.
+ const __m128i mask_val = _mm_shuffle_epi8(
+ _mm_cvtsi32_si128(*reinterpret_cast<const uint16_t*>(mask + y)),
+ mask_shuffler);
+ const __m128i masks =
+ _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
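+ // _mm_sign_epi8 negates the odd mask bytes, so the subtraction leaves
+ // 64 - mask in the even bytes and mask in the odd bytes, matching the
+ // obmc_pred/pred interleave below.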
+ const __m128i pred_val0 = Load4(pred);
+
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+ pred += prediction_stride;
+ const __m128i pred_val =
+ _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
+ const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+ const __m128i packed_result = _mm_packus_epi16(result, result);
+ Store4(pred - prediction_stride, packed_result);
+ Store4(pred, _mm_srli_si128(packed_result, 4));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ y += 2;
+ } while (y < compute_height);
+}
+
+inline void OverlapBlendFromTop8xH_SSE4_1(
+ uint8_t* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t prediction_stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 8;
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const uint8_t* mask = kObmcMask + height - 2;
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const int compute_height = height - (height >> 2);
+ int y = compute_height;
+ do {
+ const __m128i mask_val0 = _mm_set1_epi8(mask[compute_height - y]);
+ // 64 - mask
+ const __m128i obmc_mask_val0 = _mm_sub_epi8(mask_inverter, mask_val0);
+ const __m128i masks0 = _mm_unpacklo_epi8(mask_val0, obmc_mask_val0);
+
+ const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+
+ const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result_lo =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks0), 6);
+
+ --y;
+ const __m128i mask_val1 = _mm_set1_epi8(mask[compute_height - y]);
+ // 64 - mask
+ const __m128i obmc_mask_val1 = _mm_sub_epi8(mask_inverter, mask_val1);
+ const __m128i masks1 = _mm_unpacklo_epi8(mask_val1, obmc_mask_val1);
+
+ const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+ const __m128i result_hi =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks1), 6);
+
+ const __m128i result = _mm_packus_epi16(result_lo, result_hi);
+ StoreLo8(pred, result);
+ pred += prediction_stride;
+ StoreHi8(pred, result);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ } while (--y > 0);
+}
+
+void OverlapBlendFromTop_SSE4_1(
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint8_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+ assert(width >= 4);
+ assert(height >= 2);
+
+ if (width == 4) {
+ OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
+ return;
+ }
+ if (width == 8) {
+ OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
+ return;
+ }
+
+ // Stop when mask value becomes 64.
+ const int compute_height = height - (height >> 2);
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ int y = 0;
+ const uint8_t* mask = kObmcMask + height - 2;
+ do {
+ const __m128i mask_val = _mm_set1_epi8(mask[y]);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ int x = 0;
+ do {
+ const __m128i pred_val = LoadUnaligned16(pred + x);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
+ const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result_lo =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
+ const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+ const __m128i result_hi =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
+ StoreUnaligned16(pred + x, _mm_packus_epi16(result_lo, result_hi));
+ x += 16;
+ } while (x < width);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y < compute_height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(ObmcVertical)
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(ObmcHorizontal)
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+constexpr int kRoundBitsObmcBlend = 6;
+
+inline void OverlapBlendFromLeft2xH_SSE4_1(
+ uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_pred_stride = 2;
+ uint16_t* pred = prediction;
+ const uint16_t* obmc_pred = obmc_prediction;
+ const ptrdiff_t pred_stride2 = pred_stride << 1;
+ const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+ const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+ const __m128i mask_val = _mm_shufflelo_epi16(Load2(kObmcMask), 0x00);
+ // 64 - mask.
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks =
+ _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val));
+ int y = height;
+ do {
+ const __m128i pred_val = Load4x2(pred, pred + pred_stride);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+ const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+ const __m128i result = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend);
+ const __m128i packed_result = _mm_packus_epi32(result, result);
+ Store4(pred, packed_result);
+ Store4(pred + pred_stride, _mm_srli_si128(packed_result, 4));
+ pred += pred_stride2;
+ obmc_pred += obmc_pred_stride2;
+ y -= 2;
+ } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft4xH_SSE4_1(
+ uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_pred_stride = 4;
+ uint16_t* pred = prediction;
+ const uint16_t* obmc_pred = obmc_prediction;
+ const ptrdiff_t pred_stride2 = pred_stride << 1;
+ const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+ const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+ const __m128i mask_val = Load4(kObmcMask + 2);
+ // 64 - mask.
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks =
+ _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val));
+ int y = height;
+ do {
+ const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+ const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+ const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+ const __m128i result_lo = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_lo, masks), kRoundBitsObmcBlend);
+ const __m128i result_hi = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_hi, masks), kRoundBitsObmcBlend);
+ const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+ StoreLo8(pred, packed_result);
+ StoreHi8(pred + pred_stride, packed_result);
+ pred += pred_stride2;
+ obmc_pred += obmc_pred_stride2;
+ y -= 2;
+ } while (y != 0);
+}
+
+void OverlapBlendFromLeft10bpp_SSE4_1(
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint16_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+ const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
+ const ptrdiff_t obmc_pred_stride =
+ obmc_prediction_stride / sizeof(obmc_pred[0]);
+ assert(width >= 2);
+ assert(height >= 4);
+
+ if (width == 2) {
+ OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const uint8_t* mask = kObmcMask + width - 2;
+ int x = 0;
+ do {
+ pred = static_cast<uint16_t*>(prediction) + x;
+ obmc_pred = static_cast<const uint16_t*>(obmc_prediction) + x;
+ const __m128i mask_val = LoadLo8(mask + x);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+ const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+ int y = height;
+ do {
+ const __m128i pred_val = LoadUnaligned16(pred);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+ const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+ const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+ const __m128i result_lo = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+ const __m128i result_hi = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+ StoreUnaligned16(pred, _mm_packus_epi32(result_lo, result_hi));
+
+ pred += pred_stride;
+ obmc_pred += obmc_pred_stride;
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
+}
+
+inline void OverlapBlendFromTop4xH_SSE4_1(
+ uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_pred_stride = 4;
+ uint16_t* pred = prediction;
+ const uint16_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+ const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
+ const uint8_t* mask = kObmcMask + height - 2;
+ const int compute_height = height - (height >> 2);
+ const ptrdiff_t pred_stride2 = pred_stride << 1;
+ const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+ int y = 0;
+ do {
+ // First mask in the first half, second mask in the second half.
+ const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler);
+ const __m128i masks =
+ _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+ const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+ const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+
+ const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+ const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
+ const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
+ const __m128i result_lo = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+ const __m128i result_hi = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+ const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+
+ StoreLo8(pred, packed_result);
+ StoreHi8(pred + pred_stride, packed_result);
+ pred += pred_stride2;
+ obmc_pred += obmc_pred_stride2;
+ y += 2;
+ } while (y < compute_height);
+}
+
+void OverlapBlendFromTop10bpp_SSE4_1(
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint16_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+ const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
+ const ptrdiff_t obmc_pred_stride =
+ obmc_prediction_stride / sizeof(obmc_pred[0]);
+ assert(width >= 4);
+ assert(height >= 2);
+
+ if (width == 4) {
+ OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
+ return;
+ }
+
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const int compute_height = height - (height >> 2);
+ const uint8_t* mask = kObmcMask + height - 2;
+ pred = static_cast<uint16_t*>(prediction);
+ obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+ int y = 0;
+ do {
+ const __m128i mask_val = _mm_set1_epi8(mask[y]);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+ const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+ int x = 0;
+ do {
+ const __m128i pred_val = LoadUnaligned16(pred + x);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
+ const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+ const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+ const __m128i result_lo = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+ const __m128i result_hi = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+ StoreUnaligned16(pred + x, _mm_packus_epi32(result_lo, result_hi));
+ x += 8;
+ } while (x < width);
+ pred += pred_stride;
+ obmc_pred += obmc_pred_stride;
+ } while (++y < compute_height);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(ObmcVertical)
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop10bpp_SSE4_1;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(ObmcHorizontal)
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft10bpp_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void ObmcInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void ObmcInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend[]. This function is not thread-safe.
+void ObmcInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If SSE4.1 targeting is enabled and the baseline wasn't already claimed by a
+// higher optimization level, signal that the SSE4.1 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_ObmcVertical
+#define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal
+#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcVertical
+#define LIBGAV1_Dsp10bpp_ObmcVertical LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcHorizontal
+#define LIBGAV1_Dsp10bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Upscale_Filter as defined in AV1 Section 7.16
+// The taps are negated so that the largest magnitude (128) fits in int8.
+alignas(16) const int8_t
+ kNegativeUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
+ {0, 0, 0, -128, 0, 0, 0, 0}, {0, 0, 1, -128, -2, 1, 0, 0},
+ {0, -1, 3, -127, -4, 2, -1, 0}, {0, -1, 4, -127, -6, 3, -1, 0},
+ {0, -2, 6, -126, -8, 3, -1, 0}, {0, -2, 7, -125, -11, 4, -1, 0},
+ {1, -2, 8, -125, -13, 5, -2, 0}, {1, -3, 9, -124, -15, 6, -2, 0},
+ {1, -3, 10, -123, -18, 6, -2, 1}, {1, -3, 11, -122, -20, 7, -3, 1},
+ {1, -4, 12, -121, -22, 8, -3, 1}, {1, -4, 13, -120, -25, 9, -3, 1},
+ {1, -4, 14, -118, -28, 9, -3, 1}, {1, -4, 15, -117, -30, 10, -4, 1},
+ {1, -5, 16, -116, -32, 11, -4, 1}, {1, -5, 16, -114, -35, 12, -4, 1},
+ {1, -5, 17, -112, -38, 12, -4, 1}, {1, -5, 18, -111, -40, 13, -5, 1},
+ {1, -5, 18, -109, -43, 14, -5, 1}, {1, -6, 19, -107, -45, 14, -5, 1},
+ {1, -6, 19, -105, -48, 15, -5, 1}, {1, -6, 19, -103, -51, 16, -5, 1},
+ {1, -6, 20, -101, -53, 16, -6, 1}, {1, -6, 20, -99, -56, 17, -6, 1},
+ {1, -6, 20, -97, -58, 17, -6, 1}, {1, -6, 20, -95, -61, 18, -6, 1},
+ {2, -7, 20, -93, -64, 18, -6, 2}, {2, -7, 20, -91, -66, 19, -6, 1},
+ {2, -7, 20, -88, -69, 19, -6, 1}, {2, -7, 20, -86, -71, 19, -6, 1},
+ {2, -7, 20, -84, -74, 20, -7, 2}, {2, -7, 20, -81, -76, 20, -7, 1},
+ {2, -7, 20, -79, -79, 20, -7, 2}, {1, -7, 20, -76, -81, 20, -7, 2},
+ {2, -7, 20, -74, -84, 20, -7, 2}, {1, -6, 19, -71, -86, 20, -7, 2},
+ {1, -6, 19, -69, -88, 20, -7, 2}, {1, -6, 19, -66, -91, 20, -7, 2},
+ {2, -6, 18, -64, -93, 20, -7, 2}, {1, -6, 18, -61, -95, 20, -6, 1},
+ {1, -6, 17, -58, -97, 20, -6, 1}, {1, -6, 17, -56, -99, 20, -6, 1},
+ {1, -6, 16, -53, -101, 20, -6, 1}, {1, -5, 16, -51, -103, 19, -6, 1},
+ {1, -5, 15, -48, -105, 19, -6, 1}, {1, -5, 14, -45, -107, 19, -6, 1},
+ {1, -5, 14, -43, -109, 18, -5, 1}, {1, -5, 13, -40, -111, 18, -5, 1},
+ {1, -4, 12, -38, -112, 17, -5, 1}, {1, -4, 12, -35, -114, 16, -5, 1},
+ {1, -4, 11, -32, -116, 16, -5, 1}, {1, -4, 10, -30, -117, 15, -4, 1},
+ {1, -3, 9, -28, -118, 14, -4, 1}, {1, -3, 9, -25, -120, 13, -4, 1},
+ {1, -3, 8, -22, -121, 12, -4, 1}, {1, -3, 7, -20, -122, 11, -3, 1},
+ {1, -2, 6, -18, -123, 10, -3, 1}, {0, -2, 6, -15, -124, 9, -3, 1},
+ {0, -2, 5, -13, -125, 8, -2, 1}, {0, -1, 4, -11, -125, 7, -2, 0},
+ {0, -1, 3, -8, -126, 6, -2, 0}, {0, -1, 3, -6, -127, 4, -1, 0},
+ {0, -1, 2, -4, -127, 3, -1, 0}, {0, 0, 1, -2, -128, 1, 0, 0},
+};
+
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint8_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
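+ // Each iteration of the outer loop writes the filters for 16 output
+ // pixels: eight 16-byte stores, each packing two 8-tap, 8-bit filters.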
+ int x = RightShiftWithCeiling(upscaled_width, 4);
+ do {
+ for (int i = 0; i < 8; ++i, dst += 16) {
+ int remainder = subpixel_x & kSuperResScaleMask;
+ __m128i filter =
+ LoadLo8(kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+ subpixel_x += step;
+ remainder = subpixel_x & kSuperResScaleMask;
+ filter = LoadHi8(filter,
+ kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+ subpixel_x += step;
+ StoreAligned16(dst, filter);
+ }
+ } while (--x != 0);
+}
+
+void SuperRes_SSE4_1(const void* LIBGAV1_RESTRICT const coefficients,
+ void* LIBGAV1_RESTRICT const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint8_t*>(coefficients);
+ uint8_t* dst_ptr = dst;
+ ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ int subpixel_x = initial_subpixel_x;
+ // The code below calculates up to 15 extra upscaled pixels which will
+ // over-read up to 15 downscaled pixels at the end of each row.
+ // kSuperResHorizontalPadding protects this behavior from segmentation
+ // faults and threading issues.
+ int x = RightShiftWithCeiling(upscaled_width, 4);
+ do {
+ __m128i weighted_src[8];
+ for (int i = 0; i < 8; ++i, filter += 16) {
+ // TODO(b/178652672): Remove Msan loads when hadd bug is resolved.
+ // It's fine to write uninitialized bytes outside the frame, but the
+ // inside-frame pixels are incorrectly labeled uninitialized if
+ // uninitialized values go through the hadd intrinsics.
+ // |src| is offset 4 pixels to the left, and there are 4 extended border
+ // pixels, so a difference of 0 from |downscaled_width| indicates 8 good
+ // bytes. A difference of 1 indicates 7 good bytes.
+ const int msan_bytes_lo =
+ (subpixel_x >> kSuperResScaleBits) - downscaled_width;
+ __m128i s =
+ LoadLo8Msan(&src[subpixel_x >> kSuperResScaleBits], msan_bytes_lo);
+ subpixel_x += step;
+ const int msan_bytes_hi =
+ (subpixel_x >> kSuperResScaleBits) - downscaled_width;
+ s = LoadHi8Msan(s, &src[subpixel_x >> kSuperResScaleBits],
+ msan_bytes_hi);
+ subpixel_x += step;
+ const __m128i f = LoadAligned16(filter);
+ weighted_src[i] = _mm_maddubs_epi16(s, f);
+ }
+
+ __m128i a[4];
+ a[0] = _mm_hadd_epi16(weighted_src[0], weighted_src[1]);
+ a[1] = _mm_hadd_epi16(weighted_src[2], weighted_src[3]);
+ a[2] = _mm_hadd_epi16(weighted_src[4], weighted_src[5]);
+ a[3] = _mm_hadd_epi16(weighted_src[6], weighted_src[7]);
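+ // Each |weighted_src| register holds partial sums for two output pixels;
+ // the hadd pairs and the 2x16 transpose regroup the lanes so that the two
+ // adds below produce one 16-bit sum per pixel, in pixel order.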
+ Transpose2x16_U16(a, a);
+ a[0] = _mm_adds_epi16(a[0], a[1]);
+ a[1] = _mm_adds_epi16(a[2], a[3]);
+ const __m128i rounding = _mm_set1_epi16(1 << (kFilterBits - 1));
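+ // The filter taps are stored negated, so subtracting the accumulated sums
+ // from the rounding constant restores the sign and applies the rounding
+ // offset in one step.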
+ a[0] = _mm_subs_epi16(rounding, a[0]);
+ a[1] = _mm_subs_epi16(rounding, a[1]);
+ a[0] = _mm_srai_epi16(a[0], kFilterBits);
+ a[1] = _mm_srai_epi16(a[1], kFilterBits);
+ StoreAligned16(dst_ptr, _mm_packus_epi16(a[0], a[1]));
+ dst_ptr += 16;
+ } while (--x != 0);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients)
+ dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+#endif // DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients)
+#if DSP_ENABLED_8BPP_SSE4_1(SuperRes)
+ dsp->super_res = SuperRes_SSE4_1;
+#endif // DSP_ENABLED_8BPP_SSE4_1(SuperRes)
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Upscale_Filter as defined in AV1 Section 7.16
+alignas(16) const int16_t
+ kUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
+ {0, 0, 0, 128, 0, 0, 0, 0}, {0, 0, -1, 128, 2, -1, 0, 0},
+ {0, 1, -3, 127, 4, -2, 1, 0}, {0, 1, -4, 127, 6, -3, 1, 0},
+ {0, 2, -6, 126, 8, -3, 1, 0}, {0, 2, -7, 125, 11, -4, 1, 0},
+ {-1, 2, -8, 125, 13, -5, 2, 0}, {-1, 3, -9, 124, 15, -6, 2, 0},
+ {-1, 3, -10, 123, 18, -6, 2, -1}, {-1, 3, -11, 122, 20, -7, 3, -1},
+ {-1, 4, -12, 121, 22, -8, 3, -1}, {-1, 4, -13, 120, 25, -9, 3, -1},
+ {-1, 4, -14, 118, 28, -9, 3, -1}, {-1, 4, -15, 117, 30, -10, 4, -1},
+ {-1, 5, -16, 116, 32, -11, 4, -1}, {-1, 5, -16, 114, 35, -12, 4, -1},
+ {-1, 5, -17, 112, 38, -12, 4, -1}, {-1, 5, -18, 111, 40, -13, 5, -1},
+ {-1, 5, -18, 109, 43, -14, 5, -1}, {-1, 6, -19, 107, 45, -14, 5, -1},
+ {-1, 6, -19, 105, 48, -15, 5, -1}, {-1, 6, -19, 103, 51, -16, 5, -1},
+ {-1, 6, -20, 101, 53, -16, 6, -1}, {-1, 6, -20, 99, 56, -17, 6, -1},
+ {-1, 6, -20, 97, 58, -17, 6, -1}, {-1, 6, -20, 95, 61, -18, 6, -1},
+ {-2, 7, -20, 93, 64, -18, 6, -2}, {-2, 7, -20, 91, 66, -19, 6, -1},
+ {-2, 7, -20, 88, 69, -19, 6, -1}, {-2, 7, -20, 86, 71, -19, 6, -1},
+ {-2, 7, -20, 84, 74, -20, 7, -2}, {-2, 7, -20, 81, 76, -20, 7, -1},
+ {-2, 7, -20, 79, 79, -20, 7, -2}, {-1, 7, -20, 76, 81, -20, 7, -2},
+ {-2, 7, -20, 74, 84, -20, 7, -2}, {-1, 6, -19, 71, 86, -20, 7, -2},
+ {-1, 6, -19, 69, 88, -20, 7, -2}, {-1, 6, -19, 66, 91, -20, 7, -2},
+ {-2, 6, -18, 64, 93, -20, 7, -2}, {-1, 6, -18, 61, 95, -20, 6, -1},
+ {-1, 6, -17, 58, 97, -20, 6, -1}, {-1, 6, -17, 56, 99, -20, 6, -1},
+ {-1, 6, -16, 53, 101, -20, 6, -1}, {-1, 5, -16, 51, 103, -19, 6, -1},
+ {-1, 5, -15, 48, 105, -19, 6, -1}, {-1, 5, -14, 45, 107, -19, 6, -1},
+ {-1, 5, -14, 43, 109, -18, 5, -1}, {-1, 5, -13, 40, 111, -18, 5, -1},
+ {-1, 4, -12, 38, 112, -17, 5, -1}, {-1, 4, -12, 35, 114, -16, 5, -1},
+ {-1, 4, -11, 32, 116, -16, 5, -1}, {-1, 4, -10, 30, 117, -15, 4, -1},
+ {-1, 3, -9, 28, 118, -14, 4, -1}, {-1, 3, -9, 25, 120, -13, 4, -1},
+ {-1, 3, -8, 22, 121, -12, 4, -1}, {-1, 3, -7, 20, 122, -11, 3, -1},
+ {-1, 2, -6, 18, 123, -10, 3, -1}, {0, 2, -6, 15, 124, -9, 3, -1},
+ {0, 2, -5, 13, 125, -8, 2, -1}, {0, 1, -4, 11, 125, -7, 2, 0},
+ {0, 1, -3, 8, 126, -6, 2, 0}, {0, 1, -3, 6, 127, -4, 1, 0},
+ {0, 1, -2, 4, 127, -3, 1, 0}, {0, 0, -1, 2, 128, -1, 0, 0},
+};
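+
+// Each row of taps sums to 128 (1 << kFilterBits); for 0 < i < 64, row i
+// reversed equals row 64 - i, so the table mirrors around the half-pel entry.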
+
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint16_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ for (int i = 0; i < 8; ++i, dst += 8) {
+      const int remainder = subpixel_x & kSuperResScaleMask;
+      const __m128i filter =
+          LoadAligned16(kUpscaleFilter[remainder >> kSuperResExtraBits]);
+ subpixel_x += step;
+ StoreAligned16(dst, filter);
+ }
+ } while (--x != 0);
+}
+
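+// The high-bitdepth coefficients are stored as plain int16_t taps, eight per
+// output pixel, so SuperRes_SSE4_1 below can feed them straight into
+// _mm_madd_epi16.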
+template <int bitdepth>
+void SuperRes_SSE4_1(const void* LIBGAV1_RESTRICT const coefficients,
+ void* LIBGAV1_RESTRICT const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
+ auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint16_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint16_t*>(coefficients);
+ uint16_t* dst_ptr = dst;
+ ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalPadding);
+ int subpixel_x = initial_subpixel_x;
+    // The below code calculates up to 7 extra upscaled pixels, which will
+    // over-read up to 7 downscaled pixels at the end of each row.
+    // kSuperResHorizontalPadding accounts for this.
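+    // Per output pixel this is, in scalar form (a sketch of the equivalent,
+    // with |src_px| standing in for &src[subpixel_x >> kSuperResScaleBits]):
+    //   int sum = 0;
+    //   for (int k = 0; k < 8; ++k) sum += filter[k] * src_px[k];
+    //   dst[x] = Clip3(RightShiftWithRounding(sum, kFilterBits), 0,
+    //                  (1 << bitdepth) - 1);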
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ __m128i weighted_src[8];
+ for (int i = 0; i < 8; ++i, filter += 8) {
+ const __m128i s =
+ LoadUnaligned16(&src[subpixel_x >> kSuperResScaleBits]);
+ subpixel_x += step;
+ const __m128i f = LoadAligned16(filter);
+ weighted_src[i] = _mm_madd_epi16(s, f);
+ }
+
+ __m128i a[4];
+ a[0] = _mm_hadd_epi32(weighted_src[0], weighted_src[1]);
+ a[1] = _mm_hadd_epi32(weighted_src[2], weighted_src[3]);
+ a[2] = _mm_hadd_epi32(weighted_src[4], weighted_src[5]);
+ a[3] = _mm_hadd_epi32(weighted_src[6], weighted_src[7]);
+
+ a[0] = _mm_hadd_epi32(a[0], a[1]);
+ a[1] = _mm_hadd_epi32(a[2], a[3]);
+ a[0] = RightShiftWithRounding_S32(a[0], kFilterBits);
+ a[1] = RightShiftWithRounding_S32(a[1], kFilterBits);
+
+      // Clip the values at (1 << bitdepth) - 1.
+ const __m128i clipped_16 = _mm_min_epi16(
+ _mm_packus_epi32(a[0], a[1]), _mm_set1_epi16((1 << bitdepth) - 1));
+ StoreAligned16(dst_ptr, clipped_16);
+ dst_ptr += 8;
+ } while (--x != 0);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(SuperResCoefficients)
+ dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+#else
+ static_cast<void>(SuperResCoefficients_SSE4_1);
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(SuperRes)
+ dsp->super_res = SuperRes_SSE4_1<10>;
+#else
+ static_cast<void>(SuperRes_SSE4_1);
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void SuperResInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void SuperResInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res and Dsp::super_res_coefficients. This function
+// is not thread-safe.
+void SuperResInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_SuperResCoefficients
+#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SuperRes
+#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SuperResCoefficients
+#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SuperRes
+#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <emmintrin.h>
+
+namespace libgav1 {
+namespace dsp {
+
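+// Transposes a 2-column x 16-row tile of uint16_t values held in four
+// registers. Used, for example, by the 8bpp SuperRes SSE4 path to restore
+// pixel order after its _mm_hadd_epi16 reduction.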
+LIBGAV1_ALWAYS_INLINE void Transpose2x16_U16(const __m128i* const in,
+ __m128i* const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 10 11 20 21 30 31
+ // in[1]: 40 41 50 51 60 61 70 71
+ // in[2]: 80 81 90 91 a0 a1 b0 b1
+ // in[3]: c0 c1 d0 d1 e0 e1 f0 f1
+ // to:
+ // a0: 00 40 01 41 10 50 11 51
+ // a1: 20 60 21 61 30 70 31 71
+ // a2: 80 c0 81 c1 90 d0 91 d1
+ // a3: a0 e0 a1 e1 b0 f0 b1 f1
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a3 = _mm_unpackhi_epi16(in[2], in[3]);
+ // b0: 00 20 40 60 01 21 41 61
+ // b1: 10 30 50 70 11 31 51 71
+ // b2: 80 a0 c0 e0 81 a1 c1 e1
+ // b3: 90 b0 d0 f0 91 b1 d1 f1
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 80 90 a0 b0 c0 d0 e0 f0
+ // out[3]: 81 91 a1 b1 c1 d1 e1 f1
+ out[0] = _mm_unpacklo_epi16(b0, b1);
+ out[1] = _mm_unpackhi_epi16(b0, b1);
+ out[2] = _mm_unpacklo_epi16(b2, b3);
+ out[3] = _mm_unpackhi_epi16(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+  // Unpack 16 bit elements resulting in:
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ return _mm_unpacklo_epi16(a0, a1);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x8To4x16_U8(const __m128i* const in,
+ __m128i* out) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+ // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+ // out[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // out[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // out[2]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // out[3]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi32(b0, b1);
+ out[1] = _mm_unpackhi_epi32(b0, b1);
+ out[2] = _mm_unpacklo_epi32(b2, b3);
+ out[3] = _mm_unpackhi_epi32(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4_U16(const __m128i* in, __m128i* out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // to:
+ // ba: 00 10 01 11 02 12 03 13
+ // dc: 20 30 21 31 22 32 23 33
+ const __m128i ba = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i dc = _mm_unpacklo_epi16(in[2], in[3]);
+ // Unpack 32 bit elements resulting in:
+ // dcba_lo: 00 10 20 30 01 11 21 31
+ // dcba_hi: 02 12 22 32 03 13 23 33
+ const __m128i dcba_lo = _mm_unpacklo_epi32(ba, dc);
+ const __m128i dcba_hi = _mm_unpackhi_epi32(ba, dc);
+ // Assign or shift right by 8 bytes resulting in:
+ // out[0]: 00 10 20 30 01 11 21 31
+ // out[1]: 01 11 21 31 XX XX XX XX
+ // out[2]: 02 12 22 32 03 13 23 33
+ // out[3]: 03 13 23 33 XX XX XX XX
+ out[0] = dcba_lo;
+ out[1] = _mm_srli_si128(dcba_lo, 8);
+ out[2] = dcba_hi;
+ out[3] = _mm_srli_si128(dcba_hi, 8);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4_U16(const __m128i* in,
+ __m128i* out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // in[4]: 40 41 42 43 XX XX XX XX
+ // in[5]: 50 51 52 53 XX XX XX XX
+ // in[6]: 60 61 62 63 XX XX XX XX
+ // in[7]: 70 71 72 73 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 02 12 22 32 03 13 23 33
+ // b3: 42 52 62 72 43 53 63 73
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b2, b3);
+ out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x4To4x8_U16(const __m128i* in,
+ __m128i* out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b2: 04 14 24 34 05 15 25 35
+ // b4: 02 12 22 32 03 13 23 33
+ // b6: 06 16 26 36 07 17 27 37
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 XX XX XX XX
+ // out[1]: 01 11 21 31 XX XX XX XX
+ // out[2]: 02 12 22 32 XX XX XX XX
+ // out[3]: 03 13 23 33 XX XX XX XX
+ // out[4]: 04 14 24 34 XX XX XX XX
+ // out[5]: 05 15 25 35 XX XX XX XX
+ // out[6]: 06 16 26 36 XX XX XX XX
+ // out[7]: 07 17 27 37 XX XX XX XX
+ const __m128i zeros = _mm_setzero_si128();
+ out[0] = _mm_unpacklo_epi64(b0, zeros);
+ out[1] = _mm_unpackhi_epi64(b0, zeros);
+ out[2] = _mm_unpacklo_epi64(b4, zeros);
+ out[3] = _mm_unpackhi_epi64(b4, zeros);
+ out[4] = _mm_unpacklo_epi64(b2, zeros);
+ out[5] = _mm_unpackhi_epi64(b2, zeros);
+ out[6] = _mm_unpacklo_epi64(b6, zeros);
+ out[7] = _mm_unpackhi_epi64(b6, zeros);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x8_U16(const __m128i* const in,
+ __m128i* const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ // a6: 44 54 45 55 46 56 47 57
+ // a7: 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 04 14 24 34 05 15 25 35
+ // b3: 44 54 64 74 45 55 65 75
+ // b4: 02 12 22 32 03 13 23 33
+ // b5: 42 52 62 72 43 53 63 73
+ // b6: 06 16 26 36 07 17 27 37
+ // b7: 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+ const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b4, b5);
+ out[3] = _mm_unpackhi_epi64(b4, b5);
+ out[4] = _mm_unpacklo_epi64(b2, b3);
+ out[5] = _mm_unpackhi_epi64(b2, b3);
+ out[6] = _mm_unpacklo_epi64(b6, b7);
+ out[7] = _mm_unpackhi_epi64(b6, b7);
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+#endif // LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+
+// This assumes the two filters contain filter[x] and filter[x+2].
+inline __m128i AccumulateFilter(const __m128i sum, const __m128i filter_0,
+ const __m128i filter_1,
+ const __m128i& src_window) {
+ const __m128i filter_taps = _mm_unpacklo_epi8(filter_0, filter_1);
+ const __m128i src =
+ _mm_unpacklo_epi8(src_window, _mm_srli_si128(src_window, 2));
+ return _mm_add_epi16(sum, _mm_maddubs_epi16(src, filter_taps));
+}
+
+constexpr int kFirstPassOffset = 1 << 14;
+constexpr int kOffsetRemoval =
+ (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
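+
+// kFirstPassOffset biases the horizontal accumulation negatively so the int16
+// partial sums cannot overflow. Since the eight vertical taps sum to
+// 1 << kFilterBits (128), the residual bias entering each vertical sum is
+// -(kFirstPassOffset >> kInterRoundBitsHorizontal) * 128; WriteVerticalFilter
+// seeds its accumulators with kOffsetRemoval to cancel it.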
+
+// Applies the horizontal filter to one source row and stores the result in
+// |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
+// |intermediate_result| two-dimensional array.
+inline void HorizontalFilter(const int sx4, const int16_t alpha,
+ const __m128i src_row,
+ int16_t intermediate_result_row[8]) {
+ int sx = sx4 - MultiplyBy4(alpha);
+ __m128i filter[8];
+ for (__m128i& f : filter) {
+ const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = LoadLo8(kWarpedFilters8[offset]);
+ sx += alpha;
+ }
+ Transpose8x8To4x16_U8(filter, filter);
+ // |filter| now contains two filters per register.
+ // Staggered combinations allow us to take advantage of _mm_maddubs_epi16
+ // without overflowing the sign bit. The sign bit is hit only where two taps
+ // paired in a single madd add up to more than 128. This is only possible with
+ // two adjacent "inner" taps. Therefore, pairing odd with odd and even with
+ // even guarantees safety. |sum| is given a negative offset to allow for large
+ // intermediate values.
+ // k = 0, 2.
+ __m128i src_row_window = src_row;
+ __m128i sum = _mm_set1_epi16(-kFirstPassOffset);
+ sum = AccumulateFilter(sum, filter[0], filter[1], src_row_window);
+
+ // k = 1, 3.
+ src_row_window = _mm_srli_si128(src_row_window, 1);
+ sum = AccumulateFilter(sum, _mm_srli_si128(filter[0], 8),
+ _mm_srli_si128(filter[1], 8), src_row_window);
+ // k = 4, 6.
+ src_row_window = _mm_srli_si128(src_row_window, 3);
+ sum = AccumulateFilter(sum, filter[2], filter[3], src_row_window);
+
+ // k = 5, 7.
+ src_row_window = _mm_srli_si128(src_row_window, 1);
+ sum = AccumulateFilter(sum, _mm_srli_si128(filter[2], 8),
+ _mm_srli_si128(filter[3], 8), src_row_window);
+
+ sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal);
+ StoreUnaligned16(intermediate_result_row, sum);
+}
+
+template <bool is_compound>
+inline void WriteVerticalFilter(const __m128i filter[8],
+ const int16_t intermediate_result[15][8], int y,
+ void* LIBGAV1_RESTRICT dst_row) {
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+ __m128i sum_low = _mm_set1_epi32(kOffsetRemoval);
+ __m128i sum_high = sum_low;
+ for (int k = 0; k < 8; k += 2) {
+ const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
+ const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
+ const __m128i intermediate_0 = LoadUnaligned16(intermediate_result[y + k]);
+ const __m128i intermediate_1 =
+ LoadUnaligned16(intermediate_result[y + k + 1]);
+ const __m128i intermediate_low =
+ _mm_unpacklo_epi16(intermediate_0, intermediate_1);
+ const __m128i intermediate_high =
+ _mm_unpackhi_epi16(intermediate_0, intermediate_1);
+
+ const __m128i product_low = _mm_madd_epi16(filters_low, intermediate_low);
+ const __m128i product_high =
+ _mm_madd_epi16(filters_high, intermediate_high);
+ sum_low = _mm_add_epi32(sum_low, product_low);
+ sum_high = _mm_add_epi32(sum_high, product_high);
+ }
+ sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
+ sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
+ if (is_compound) {
+ const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
+ StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
+ } else {
+ const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
+ StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
+ }
+}
+
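+// Region 2 variant: every sample in an intermediate row is identical, so each
+// row is represented by a single value in |intermediate_result_column| and is
+// broadcast instead of loaded.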
+template <bool is_compound>
+inline void WriteVerticalFilter(const __m128i filter[8],
+ const int16_t* LIBGAV1_RESTRICT
+ intermediate_result_column,
+ void* LIBGAV1_RESTRICT dst_row) {
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+ __m128i sum_low = _mm_setzero_si128();
+ __m128i sum_high = _mm_setzero_si128();
+ for (int k = 0; k < 8; k += 2) {
+ const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
+ const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
+ // Equivalent to unpacking two vectors made by duplicating int16_t values.
+ const __m128i intermediate =
+        _mm_set1_epi32((intermediate_result_column[k + 1] << 16) |
+                       static_cast<uint16_t>(intermediate_result_column[k]));
+ const __m128i product_low = _mm_madd_epi16(filters_low, intermediate);
+ const __m128i product_high = _mm_madd_epi16(filters_high, intermediate);
+ sum_low = _mm_add_epi32(sum_low, product_low);
+ sum_high = _mm_add_epi32(sum_high, product_high);
+ }
+ sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
+ sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
+ if (is_compound) {
+ const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
+ StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
+ } else {
+ const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
+ StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void VerticalFilter(const int16_t source[15][8], int64_t y4, int gamma,
+ int delta, DestType* LIBGAV1_RESTRICT dest_row,
+ ptrdiff_t dest_stride) {
+ int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ __m128i filter[8];
+ for (__m128i& f : filter) {
+ const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = LoadUnaligned16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8_U16(filter, filter);
+ WriteVerticalFilter<is_compound>(filter, source, y, dest_row);
+ dest_row += dest_stride;
+ sy4 += delta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols,
+ int64_t y4, int gamma, int delta,
+ DestType* LIBGAV1_RESTRICT dest_row,
+ ptrdiff_t dest_stride) {
+ int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ __m128i filter[8];
+ for (__m128i& f : filter) {
+ const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = LoadUnaligned16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8_U16(filter, filter);
+ WriteVerticalFilter<is_compound>(filter, &source_cols[y], dest_row);
+ dest_row += dest_stride;
+ sy4 += delta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion1(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t source_stride, int source_width,
+ int source_height, int ix4, int iy4,
+ DestType* LIBGAV1_RESTRICT dst_row,
+ ptrdiff_t dest_stride) {
+  // Region 1.
+ // Points to the left or right border of the first row of |src|.
+ const uint8_t* first_row_border =
+ (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+  // In region 1, every sample used to calculate the prediction block has the
+  // same value, so the whole prediction block has the same value.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t row_border_pixel = first_row_border[row * source_stride];
+
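+  // For compound output the constant pixel is pre-shifted into the compound
+  // range, matching what the full filter path would produce for a flat block.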
+ if (is_compound) {
+ const __m128i sum =
+ _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical -
+ kInterRoundBitsCompoundVertical));
+ StoreUnaligned16(dst_row, sum);
+ } else {
+ memset(dst_row, row_border_pixel, 8);
+ }
+ const DestType* const first_dst_row = dst_row;
+ dst_row += dest_stride;
+ for (int y = 1; y < 8; ++y) {
+ memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
+ dst_row += dest_stride;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t source_stride, int source_width, int64_t y4,
+ int ix4, int iy4, int gamma, int delta,
+ int16_t intermediate_result_column[15],
+ DestType* LIBGAV1_RESTRICT dst_row,
+ ptrdiff_t dest_stride) {
+ // Region 2.
+ // Points to the left or right border of the first row of |src|.
+ const uint8_t* first_row_border =
+ (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+
+ // Region 2.
+ // Horizontal filter.
+ // The input values in this region are generated by extending the border
+ // which makes them identical in the horizontal direction. This
+ // computation could be inlined in the vertical pass but most
+ // implementations will need a transpose of some sort.
+ // It is not necessary to use the offset values here because the
+ // horizontal pass is a simple shift and the vertical pass will always
+ // require using 32 bits.
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = iy4 + y;
+ int sum = first_row_border[row * source_stride];
+ sum <<= (kFilterBits - kInterRoundBitsHorizontal);
+ intermediate_result_column[y + 7] = sum;
+ }
+ // Region 2 vertical filter.
+ VerticalFilter<is_compound, DestType>(intermediate_result_column, y4, gamma,
+ delta, dst_row, dest_stride);
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t source_stride, int source_height, int alpha,
+ int beta, int64_t x4, int ix4, int iy4,
+ int16_t intermediate_result[15][8]) {
+  // Region 3.
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ // Horizontal filter.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion4(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t source_stride, int alpha, int beta,
+ int64_t x4, int ix4, int iy4,
+ int16_t intermediate_result[15][8]) {
+ // Region 4.
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ // Horizontal filter.
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = iy4 + y;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
+ HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t source_stride, int source_width,
+ int source_height,
+ const int* LIBGAV1_RESTRICT warp_params,
+ int subsampling_x, int subsampling_y, int src_x,
+ int src_y, int16_t alpha, int16_t beta,
+ int16_t gamma, int16_t delta,
+ DestType* LIBGAV1_RESTRICT dst_row,
+ ptrdiff_t dest_stride) {
+ union {
+    // |intermediate_result| is the output of the horizontal filtering and
+ // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
+ // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
+ // type so that we can start with a negative offset and restore it on the
+ // final filter sum.
+ int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
+ // In the simple special cases where the samples in each row are all the
+ // same, store one sample per row in a column vector.
+ int16_t intermediate_result_column[15];
+ };
+
+ const WarpFilterParams filter_params = GetWarpFilterParams(
+ src_x, src_y, subsampling_x, subsampling_y, warp_params);
+ // A prediction block may fall outside the frame's boundaries. If a
+ // prediction block is calculated using only samples outside the frame's
+ // boundary, the filtering can be simplified. We can divide the plane
+ // into several regions and handle them differently.
+ //
+ // | |
+ // 1 | 3 | 1
+ // | |
+ // -------+-----------+-------
+ // |***********|
+ // 2 |*****4*****| 2
+ // |***********|
+ // -------+-----------+-------
+ // | |
+ // 1 | 3 | 1
+ // | |
+ //
+ // At the center, region 4 represents the frame and is the general case.
+ //
+ // In regions 1 and 2, the prediction block is outside the frame's
+ // boundary horizontally. Therefore the horizontal filtering can be
+ // simplified. Furthermore, in the region 1 (at the four corners), the
+ // prediction is outside the frame's boundary both horizontally and
+ // vertically, so we get a constant prediction block.
+ //
+ // In region 3, the prediction block is outside the frame's boundary
+ // vertically. Unfortunately because we apply the horizontal filters
+ // first, by the time we apply the vertical filters, they no longer see
+ // simple inputs. So the only simplification is that all the rows are
+ // the same, but we still need to apply all the horizontal and vertical
+ // filters.
+
+ // Check for two simple special cases, where the horizontal filter can
+ // be significantly simplified.
+ //
+ // In general, for each row, the horizontal filter is calculated as
+ // follows:
+ // for (int x = -4; x < 4; ++x) {
+ // const int offset = ...;
+ // int sum = first_pass_offset;
+ // for (int k = 0; k < 8; ++k) {
+ // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+ // sum += kWarpedFilters[offset][k] * src_row[column];
+ // }
+ // ...
+ // }
+ // The column index before clipping, ix4 + x + k - 3, varies in the range
+ // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+ // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+ // border index (source_width - 1 or 0, respectively). Then for each x,
+ // the inner for loop of the horizontal filter is reduced to multiplying
+ // the border pixel by the sum of the filter coefficients.
+ if (filter_params.ix4 - 7 >= source_width - 1 || filter_params.ix4 + 7 <= 0) {
+ if ((filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0)) {
+ // Outside the frame in both directions. One repeated value.
+ WarpRegion1<is_compound, DestType>(
+ src, source_stride, source_width, source_height, filter_params.ix4,
+ filter_params.iy4, dst_row, dest_stride);
+ return;
+ }
+ // Outside the frame horizontally. Rows repeated.
+ WarpRegion2<is_compound, DestType>(
+ src, source_stride, source_width, filter_params.y4, filter_params.ix4,
+ filter_params.iy4, gamma, delta, intermediate_result_column, dst_row,
+ dest_stride);
+ return;
+ }
+
+ if ((filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0)) {
+ // Outside the frame vertically.
+ WarpRegion3<is_compound, DestType>(
+ src, source_stride, source_height, alpha, beta, filter_params.x4,
+ filter_params.ix4, filter_params.iy4, intermediate_result);
+ } else {
+ // Inside the frame.
+ WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta,
+ filter_params.x4, filter_params.ix4,
+ filter_params.iy4, intermediate_result);
+ }
+ // Region 3 and 4 vertical filter.
+ VerticalFilter<is_compound, DestType>(intermediate_result, filter_params.y4,
+ gamma, delta, dst_row, dest_stride);
+}
+
+template <bool is_compound>
+void Warp_SSE4_1(const void* LIBGAV1_RESTRICT source, ptrdiff_t source_stride,
+ int source_width, int source_height,
+ const int* LIBGAV1_RESTRICT warp_params, int subsampling_x,
+ int subsampling_y, int block_start_x, int block_start_y,
+ int block_width, int block_height, int16_t alpha, int16_t beta,
+ int16_t gamma, int16_t delta, void* LIBGAV1_RESTRICT dest,
+ ptrdiff_t dest_stride) {
+ const auto* const src = static_cast<const uint8_t*>(source);
+ using DestType =
+ typename std::conditional<is_compound, int16_t, uint8_t>::type;
+ auto* dst = static_cast<DestType*>(dest);
+
+  // The warp process is applied to each 8x8 block.
+ assert(block_width >= 8);
+ assert(block_height >= 8);
+ const int block_end_x = block_start_x + block_width;
+ const int block_end_y = block_start_y + block_height;
+
+ const int start_x = block_start_x;
+ const int start_y = block_start_y;
+ int src_x = (start_x + 4) << subsampling_x;
+ int src_y = (start_y + 4) << subsampling_y;
+ const int end_x = (block_end_x + 4) << subsampling_x;
+ const int end_y = (block_end_y + 4) << subsampling_y;
+ do {
+ DestType* dst_row = dst;
+ src_x = (start_x + 4) << subsampling_x;
+ do {
+ HandleWarpBlock<is_compound, DestType>(
+ src, source_stride, source_width, source_height, warp_params,
+ subsampling_x, subsampling_y, src_x, src_y, alpha, beta, gamma, delta,
+ dst_row, dest_stride);
+ src_x += (8 << subsampling_x);
+ dst_row += 8;
+ } while (src_x < end_x);
+ dst += 8 * dest_stride;
+ src_y += (8 << subsampling_y);
+ } while (src_y < end_y);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->warp = Warp_SSE4_1</*is_compound=*/false>;
+ dsp->warp_compound = Warp_SSE4_1</*is_compound=*/true>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void WarpInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void WarpInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::warp and Dsp::warp_compound. This function is not
+// thread-safe.
+void WarpInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_Warp
+#define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WarpCompound
+#define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/weight_mask_sse4.h"
+
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+constexpr int kRoundingBits8bpp = 4;
+
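+// Computes 16 mask values at a time: one row of 16 when |is_store_16| is
+// true, otherwise two rows of 8. Scalar sketch of the per-value math:
+//   diff = RightShiftWithRounding(abs(pred_0 - pred_1), kRoundingBits8bpp);
+//   mask = min(38 + (diff >> 4), 64);  // inverted masks store 64 - mask.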
+template <bool mask_is_inverse, bool is_store_16>
+inline void WeightMask16_SSE4_1(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ const __m128i difference_0 = RightShiftWithRounding_U16(
+ _mm_abs_epi16(_mm_sub_epi16(pred_00, pred_10)), kRoundingBits8bpp);
+ const __m128i scaled_difference_0 = _mm_srli_epi16(difference_0, 4);
+
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ const __m128i difference_1 = RightShiftWithRounding_U16(
+ _mm_abs_epi16(_mm_sub_epi16(pred_01, pred_11)), kRoundingBits8bpp);
+ const __m128i scaled_difference_1 = _mm_srli_epi16(difference_1, 4);
+
+ const __m128i difference_offset = _mm_set1_epi8(38);
+ const __m128i adjusted_difference =
+ _mm_adds_epu8(_mm_packus_epi16(scaled_difference_0, scaled_difference_1),
+ difference_offset);
+ const __m128i mask_ceiling = _mm_set1_epi8(64);
+ const __m128i mask_value = _mm_min_epi8(adjusted_difference, mask_ceiling);
+ if (mask_is_inverse) {
+ const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value);
+ if (is_store_16) {
+ StoreAligned16(mask, inverted_mask_value);
+ } else {
+ StoreLo8(mask, inverted_mask_value);
+ StoreHi8(mask + mask_stride, inverted_mask_value);
+ }
+ } else {
+ if (is_store_16) {
+ StoreAligned16(mask, mask_value);
+ } else {
+ StoreLo8(mask, mask_value);
+ StoreHi8(mask + mask_stride, mask_value);
+ }
+ }
+}
+
+#define WEIGHT8_PAIR_WITHOUT_STRIDE \
+ WeightMask16_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride)
+
+#define WEIGHT8_PAIR_AND_STRIDE \
+ WEIGHT8_PAIR_WITHOUT_STRIDE; \
+ pred_0 += 8 << 1; \
+ pred_1 += 8 << 1; \
+ mask += mask_stride << 1
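+
+// Each WEIGHT8_PAIR step covers two 8-wide rows with one 16-lane call, so the
+// prediction pointers advance by two rows (8 << 1) and the mask by two
+// strides.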
+
+template <bool mask_is_inverse>
+void WeightMask8x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 3;
+ do {
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ } while (--y3 != 0);
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 5;
+ do {
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ } while (--y5 != 0);
+ WEIGHT8_PAIR_WITHOUT_STRIDE;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride)
+
+#define WEIGHT16_AND_STRIDE \
+ WEIGHT16_WITHOUT_STRIDE; \
+ pred_0 += 16; \
+ pred_1 += 16; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask16x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = 7;
+ do {
+ WEIGHT16_AND_STRIDE;
+ } while (--y != 0);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 5;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (--y3 != 0);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 6;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (--y5 != 0);
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 21;
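+  // 21 iterations of 3 strided rows, plus the final row: 21 * 3 + 1 = 64.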
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (--y3 != 0);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride)
+
+#define WEIGHT32_AND_STRIDE \
+ WEIGHT32_WITHOUT_STRIDE; \
+ pred_0 += 32; \
+ pred_1 += 32; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask32x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 5;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (--y3 != 0);
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 6;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (--y5 != 0);
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 21;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (--y3 != 0);
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+ mask + 32, mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+ mask + 48, mask_stride)
+
+#define WEIGHT64_AND_STRIDE \
+ WEIGHT64_WITHOUT_STRIDE; \
+ pred_0 += 64; \
+ pred_1 += 64; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask64x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 42);
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
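+  // Each 128-wide row is processed as two 64-wide halves: advance by 64 to
+  // the second half, then by mask_stride - 64 to the start of the next row.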
+ do {
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (++y3 < 21);
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (++y3 < 42);
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
+ dsp->weight_mask[w_index][h_index][0] = \
+ WeightMask##width##x##height##_SSE4_1<0>; \
+ dsp->weight_mask[w_index][h_index][1] = \
+ WeightMask##width##x##height##_SSE4_1<1>
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ INIT_WEIGHT_MASK_8BPP(8, 8, 0, 0);
+ INIT_WEIGHT_MASK_8BPP(8, 16, 0, 1);
+ INIT_WEIGHT_MASK_8BPP(8, 32, 0, 2);
+ INIT_WEIGHT_MASK_8BPP(16, 8, 1, 0);
+ INIT_WEIGHT_MASK_8BPP(16, 16, 1, 1);
+ INIT_WEIGHT_MASK_8BPP(16, 32, 1, 2);
+ INIT_WEIGHT_MASK_8BPP(16, 64, 1, 3);
+ INIT_WEIGHT_MASK_8BPP(32, 8, 2, 0);
+ INIT_WEIGHT_MASK_8BPP(32, 16, 2, 1);
+ INIT_WEIGHT_MASK_8BPP(32, 32, 2, 2);
+ INIT_WEIGHT_MASK_8BPP(32, 64, 2, 3);
+ INIT_WEIGHT_MASK_8BPP(64, 16, 3, 1);
+ INIT_WEIGHT_MASK_8BPP(64, 32, 3, 2);
+ INIT_WEIGHT_MASK_8BPP(64, 64, 3, 3);
+ INIT_WEIGHT_MASK_8BPP(64, 128, 3, 4);
+ INIT_WEIGHT_MASK_8BPP(128, 64, 4, 3);
+ INIT_WEIGHT_MASK_8BPP(128, 128, 4, 4);
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kRoundingBits10bpp = 6;
+constexpr int kScaledDiffShift = 4;
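+
+// 10bpp compound predictions exceed the int16_t range (see the range note in
+// WeightMask16_10bpp_SSE4_1), so absolute differences are computed in 32 bits
+// before narrowing back to 16.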
+
+template <bool mask_is_inverse, bool is_store_16>
+inline void WeightMask16_10bpp_SSE4_1(
+ const uint16_t* LIBGAV1_RESTRICT prediction_0,
+ const uint16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
+ const __m128i diff_offset = _mm_set1_epi8(38);
+ const __m128i mask_ceiling = _mm_set1_epi8(64);
+ const __m128i zero = _mm_setzero_si128();
+
+ // Range of prediction: [3988, 61532].
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ const __m128i pred_lo_00 = _mm_cvtepu16_epi32(pred_00);
+ const __m128i pred_lo_10 = _mm_cvtepu16_epi32(pred_10);
+ const __m128i diff_lo_0 = RightShiftWithRounding_U32(
+ _mm_abs_epi32(_mm_sub_epi32(pred_lo_00, pred_lo_10)), kRoundingBits10bpp);
+
+ const __m128i pred_hi_00 = _mm_unpackhi_epi16(pred_00, zero);
+ const __m128i pred_hi_10 = _mm_unpackhi_epi16(pred_10, zero);
+ const __m128i diff_hi_0 = RightShiftWithRounding_U32(
+ _mm_abs_epi32(_mm_sub_epi32(pred_hi_00, pred_hi_10)), kRoundingBits10bpp);
+
+ const __m128i diff_0 = _mm_packus_epi32(diff_lo_0, diff_hi_0);
+ const __m128i scaled_diff_0 = _mm_srli_epi16(diff_0, kScaledDiffShift);
+
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ const __m128i pred_lo_01 = _mm_cvtepu16_epi32(pred_01);
+ const __m128i pred_lo_11 = _mm_cvtepu16_epi32(pred_11);
+ const __m128i diff_lo_1 = RightShiftWithRounding_U32(
+ _mm_abs_epi32(_mm_sub_epi32(pred_lo_01, pred_lo_11)), kRoundingBits10bpp);
+
+ const __m128i pred_hi_01 = _mm_unpackhi_epi16(pred_01, zero);
+ const __m128i pred_hi_11 = _mm_unpackhi_epi16(pred_11, zero);
+ const __m128i diff_hi_1 = RightShiftWithRounding_U32(
+ _mm_abs_epi32(_mm_sub_epi32(pred_hi_01, pred_hi_11)), kRoundingBits10bpp);
+
+ const __m128i diff_1 = _mm_packus_epi32(diff_lo_1, diff_hi_1);
+ const __m128i scaled_diff_1 = _mm_srli_epi16(diff_1, kScaledDiffShift);
+
+ const __m128i adjusted_diff = _mm_adds_epu8(
+ _mm_packus_epi16(scaled_diff_0, scaled_diff_1), diff_offset);
+ const __m128i mask_value = _mm_min_epi8(adjusted_diff, mask_ceiling);
+
+ if (mask_is_inverse) {
+ const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value);
+ if (is_store_16) {
+ StoreAligned16(mask, inverted_mask_value);
+ } else {
+ StoreLo8(mask, inverted_mask_value);
+ StoreHi8(mask + mask_stride, inverted_mask_value);
+ }
+ } else {
+ if (is_store_16) {
+ StoreAligned16(mask, mask_value);
+ } else {
+ StoreLo8(mask, mask_value);
+ StoreHi8(mask + mask_stride, mask_value);
+ }
+ }
+}
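+
+// For reference, a scalar sketch of the per-pixel computation performed by
+// the SIMD above (illustration only; the vector code additionally selects
+// between the two store layouts via |is_store_16|):
+//
+// int diff = std::abs(pred_0[i] - pred_1[i]);
+// diff = RightShiftWithRounding(diff, kRoundingBits10bpp) >> kScaledDiffShift;
+// const int value = std::min(diff + 38, 64);
+// mask[i] = mask_is_inverse ? 64 - value : value;
+//
+// Given the stated prediction range of [3988, 61532], diff + 38 is at most
+// 94, so the signed byte minimum (_mm_min_epi8) above is safe.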
+
+#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, \
+ mask_stride)
+
+#define WEIGHT8_PAIR_AND_STRIDE_10BPP \
+ WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; \
+ pred_0 += 8 << 1; \
+ pred_1 += 8 << 1; \
+ mask += mask_stride << 1
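+
+// Each "pair" above covers two 8-wide rows at once: WeightMask16_10bpp with
+// is_store_16 == false writes its low 8 results to |mask| and its high 8 to
+// |mask + mask_stride|, hence the doubled pointer increments.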
+
+template <bool mask_is_inverse>
+void WeightMask8x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 3;
+ do {
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y5 = 5;
+ do {
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ } while (--y5 != 0);
+ WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride)
+
+#define WEIGHT16_AND_STRIDE_10BPP \
+ WEIGHT16_WITHOUT_STRIDE_10BPP; \
+ pred_0 += 16; \
+ pred_1 += 16; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask16x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y = 7;
+ do {
+ WEIGHT16_AND_STRIDE_10BPP;
+ } while (--y != 0);
+ WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 5;
+ do {
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y5 = 6;
+ do {
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ } while (--y5 != 0);
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 21;
+ do {
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride)
+
+#define WEIGHT32_AND_STRIDE_10BPP \
+ WEIGHT32_WITHOUT_STRIDE_10BPP; \
+ pred_0 += 32; \
+ pred_1 += 32; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask32x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 5;
+ do {
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y5 = 6;
+ do {
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ } while (--y5 != 0);
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 21;
+ do {
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+ mask + 32, mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+ mask + 48, mask_stride)
+
+#define WEIGHT64_AND_STRIDE_10BPP \
+ WEIGHT64_WITHOUT_STRIDE_10BPP; \
+ pred_0 += 64; \
+ pred_1 += 64; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask64x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 5;
+ do {
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y5 = 6;
+ do {
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ } while (--y5 != 0);
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 21;
+ do {
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 42;
+ do {
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 21;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
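+ // As in the 8bpp 128-wide functions: |mask| has already advanced by 64
+ // after the left half of a row, so mask_stride - 64 steps to the start of
+ // the next row.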
+ do {
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (--y3 != 0);
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 42;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (--y3 != 0);
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \
+ dsp->weight_mask[w_index][h_index][0] = \
+ WeightMask##width##x##height##_10bpp_SSE4_1<0>; \
+ dsp->weight_mask[w_index][h_index][1] = \
+ WeightMask##width##x##height##_10bpp_SSE4_1<1>
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ INIT_WEIGHT_MASK_10BPP(8, 8, 0, 0);
+ INIT_WEIGHT_MASK_10BPP(8, 16, 0, 1);
+ INIT_WEIGHT_MASK_10BPP(8, 32, 0, 2);
+ INIT_WEIGHT_MASK_10BPP(16, 8, 1, 0);
+ INIT_WEIGHT_MASK_10BPP(16, 16, 1, 1);
+ INIT_WEIGHT_MASK_10BPP(16, 32, 1, 2);
+ INIT_WEIGHT_MASK_10BPP(16, 64, 1, 3);
+ INIT_WEIGHT_MASK_10BPP(32, 8, 2, 0);
+ INIT_WEIGHT_MASK_10BPP(32, 16, 2, 1);
+ INIT_WEIGHT_MASK_10BPP(32, 32, 2, 2);
+ INIT_WEIGHT_MASK_10BPP(32, 64, 2, 3);
+ INIT_WEIGHT_MASK_10BPP(64, 16, 3, 1);
+ INIT_WEIGHT_MASK_10BPP(64, 32, 3, 2);
+ INIT_WEIGHT_MASK_10BPP(64, 64, 3, 3);
+ INIT_WEIGHT_MASK_10BPP(64, 128, 3, 4);
+ INIT_WEIGHT_MASK_10BPP(128, 64, 4, 3);
+ INIT_WEIGHT_MASK_10BPP(128, 128, 4, 4);
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void WeightMaskInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void WeightMaskInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::weight_mask. This function is not thread-safe.
+void WeightMaskInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8
+#define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16
+#define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32
+#define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8
+#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16
+#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32
+#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64
+#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8
+#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16
+#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32
+#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64
+#define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16
+#define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32
+#define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64
+#define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128
+#define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64
+#define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128
+#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x8
+#define LIBGAV1_Dsp10bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x16
+#define LIBGAV1_Dsp10bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x32
+#define LIBGAV1_Dsp10bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x8
+#define LIBGAV1_Dsp10bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x16
+#define LIBGAV1_Dsp10bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x32
+#define LIBGAV1_Dsp10bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x64
+#define LIBGAV1_Dsp10bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x8
+#define LIBGAV1_Dsp10bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x16
+#define LIBGAV1_Dsp10bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x32
+#define LIBGAV1_Dsp10bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x64
+#define LIBGAV1_Dsp10bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x16
+#define LIBGAV1_Dsp10bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x32
+#define LIBGAV1_Dsp10bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x64
+#define LIBGAV1_Dsp10bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x128
+#define LIBGAV1_Dsp10bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x64
+#define LIBGAV1_Dsp10bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x128
+#define LIBGAV1_Dsp10bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/film_grain.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/threadpool.h"
+
+namespace libgav1 {
+
+namespace {
+
+// The kGaussianSequence array contains random samples from a Gaussian
+// distribution with zero mean and standard deviation of about 512 clipped to
+// the range of [-2048, 2047] (representable by a signed integer using 12 bits
+// of precision) and rounded to the nearest multiple of 4.
+//
+// Note: It is important that every element in the kGaussianSequence array be
+// less than 2040, so that RightShiftWithRounding(kGaussianSequence[i], 4) is
+// less than 128 for bitdepth=8 (GrainType=int8_t).
+constexpr int16_t kGaussianSequence[/*2048*/] = {
+ 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820,
+ 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800,
+ 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588,
+ -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368,
+ 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4,
+ 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396,
+ 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740,
+ 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292,
+ 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532,
+ 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704,
+ 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96,
+ -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244,
+ 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136,
+ 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676,
+ -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400,
+ -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844,
+ -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96,
+ -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356,
+ 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280,
+ 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808,
+ 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228,
+ -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136,
+ -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264,
+ -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388,
+ 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500,
+ 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384,
+ 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220,
+ -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148,
+ 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572,
+ -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516,
+ 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916,
+ -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492,
+ 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560,
+ -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108,
+ -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516,
+ -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88,
+ -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196,
+ -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864,
+ 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920,
+ 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564,
+ -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876,
+ -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244,
+ 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184,
+ 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364,
+ -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72,
+ 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24,
+ 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4,
+ -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120,
+ 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108,
+ -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296,
+ 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336,
+ -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164,
+ -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264,
+ 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536,
+ -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296,
+ -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696,
+ 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204,
+ 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212,
+ -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40,
+ 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384,
+ 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8,
+ 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704,
+ -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348,
+ -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592,
+ -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420,
+ 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220,
+ -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208,
+ -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544,
+ -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288,
+ -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240,
+ -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132,
+ 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16,
+ -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044,
+ -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732,
+ 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460,
+ -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52,
+ -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104,
+ -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460,
+ 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716,
+ -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960,
+ 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476,
+ 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692,
+ 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352,
+ -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144,
+ -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44,
+ 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356,
+ 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452,
+ -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552,
+ -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264,
+ -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448,
+ -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588,
+ 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464,
+ 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216,
+ 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132,
+ 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412,
+ 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48,
+ 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196,
+ 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48,
+ -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292,
+ 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32,
+ -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012,
+ -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120,
+ -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56,
+ 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416,
+ -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404,
+ -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92,
+ 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904,
+ 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728,
+ 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584,
+ 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48,
+ 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180,
+ 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528,
+ 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364,
+ -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260,
+ -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324,
+ -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64,
+ 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120,
+ -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168,
+ -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888,
+ 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588,
+ -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484,
+ 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580,
+ 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392,
+ 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80,
+ -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688,
+ 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4,
+ -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300,
+ 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444,
+ 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192,
+ 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160,
+ 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188,
+ -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404,
+ -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400,
+ 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92,
+ -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824,
+ 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620,
+ 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720,
+ 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620,
+ -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508,
+ -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736,
+ 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836,
+ 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180,
+ 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140,
+ -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32,
+ -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916,
+ 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368,
+ -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380,
+ -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572,
+ -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864,
+ 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908,
+ -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84,
+ 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396,
+ -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360,
+ 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928,
+ -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288,
+ 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196,
+ 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504,
+ 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272,
+ 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344,
+ -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208,
+ -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156,
+ -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240,
+ -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432,
+ 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244,
+ 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584,
+ 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24,
+ 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300,
+ -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416,
+ 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380,
+ -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384,
+ 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88,
+ 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876,
+ -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320,
+ -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88,
+ -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196,
+ -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120,
+ 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664,
+ -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0,
+ -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264,
+ -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288,
+ -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56,
+ 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148,
+ 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156,
+ -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144,
+ -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148,
+ 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944,
+ 428, -484};
+static_assert(sizeof(kGaussianSequence) / sizeof(kGaussianSequence[0]) == 2048,
+ "");
+
+// The number of rows in a contiguous group computed by a single worker thread
+// before checking for the next available group.
+constexpr int kFrameChunkHeight = 8;
+
+// |width| and |height| refer to the plane, not the frame, meaning any
+// subsampling should be applied by the caller.
+template <typename Pixel>
+inline void CopyImagePlane(const uint8_t* source_plane, ptrdiff_t source_stride,
+ int width, int height, uint8_t* dest_plane,
+ ptrdiff_t dest_stride) {
+ // If it's the same buffer there's nothing to do.
+ if (source_plane == dest_plane) return;
+
+ int y = 0;
+ do {
+ memcpy(dest_plane, source_plane, width * sizeof(Pixel));
+ source_plane += source_stride;
+ dest_plane += dest_stride;
+ } while (++y < height);
+}
+
+} // namespace
+
+template <int bitdepth>
+FilmGrain<bitdepth>::FilmGrain(const FilmGrainParams& params,
+ bool is_monochrome,
+ bool color_matrix_is_identity, int subsampling_x,
+ int subsampling_y, int width, int height,
+ ThreadPool* thread_pool)
+ : params_(params),
+ is_monochrome_(is_monochrome),
+ color_matrix_is_identity_(color_matrix_is_identity),
+ subsampling_x_(subsampling_x),
+ subsampling_y_(subsampling_y),
+ width_(width),
+ height_(height),
+ template_uv_width_((subsampling_x != 0) ? kMinChromaWidth
+ : kMaxChromaWidth),
+ template_uv_height_((subsampling_y != 0) ? kMinChromaHeight
+ : kMaxChromaHeight),
+ thread_pool_(thread_pool) {}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::Init() {
+ // Section 7.18.3.3. Generate grain process.
+ const dsp::Dsp& dsp = *dsp::GetDspTable(bitdepth);
+ // If params_.num_y_points is 0, luma_grain_ will never be read, so we don't
+ // need to generate it.
+ const bool use_luma = params_.num_y_points > 0;
+ if (use_luma) {
+ GenerateLumaGrain(params_, luma_grain_);
+ // If params_.auto_regression_coeff_lag is 0, the filter is the identity
+ // filter and therefore can be skipped.
+ if (params_.auto_regression_coeff_lag > 0) {
+ dsp.film_grain
+ .luma_auto_regression[params_.auto_regression_coeff_lag - 1](
+ params_, luma_grain_);
+ }
+ } else {
+ // Have AddressSanitizer warn if luma_grain_ is used.
+ ASAN_POISON_MEMORY_REGION(luma_grain_, sizeof(luma_grain_));
+ }
+ if (!is_monochrome_) {
+ GenerateChromaGrains(params_, template_uv_width_, template_uv_height_,
+ u_grain_, v_grain_);
+ if (params_.auto_regression_coeff_lag > 0 || use_luma) {
+ dsp.film_grain.chroma_auto_regression[static_cast<int>(
+ use_luma)][params_.auto_regression_coeff_lag](
+ params_, luma_grain_, subsampling_x_, subsampling_y_, u_grain_,
+ v_grain_);
+ }
+ }
+
+ // Section 7.18.3.4. Scaling lookup initialization process.
+
+ // Initialize scaling_lut_y_. If params_.num_y_points > 0, scaling_lut_y_
+ // is used for the Y plane. If params_.chroma_scaling_from_luma is true,
+ // scaling_lut_u_ and scaling_lut_v_ are the same as scaling_lut_y_ and are
+ // set up as aliases. So we need to initialize scaling_lut_y_ under these
+ // two conditions.
+ //
+ // Note: Although it does not seem to make sense, there are test vectors
+ // with chroma_scaling_from_luma=true and params_.num_y_points=0.
+#if LIBGAV1_MSAN
+ // Quiet film grain / md5 msan warnings.
+ memset(scaling_lut_y_, 0, sizeof(scaling_lut_y_));
+#endif
+ if (use_luma || params_.chroma_scaling_from_luma) {
+ dsp.film_grain.initialize_scaling_lut(
+ params_.num_y_points, params_.point_y_value, params_.point_y_scaling,
+ scaling_lut_y_, kScalingLutLength);
+ } else {
+ ASAN_POISON_MEMORY_REGION(scaling_lut_y_, sizeof(scaling_lut_y_));
+ }
+ if (!is_monochrome_) {
+ if (params_.chroma_scaling_from_luma) {
+ scaling_lut_u_ = scaling_lut_y_;
+ scaling_lut_v_ = scaling_lut_y_;
+ } else if (params_.num_u_points > 0 || params_.num_v_points > 0) {
+ const size_t buffer_size =
+ kScalingLutLength * (static_cast<int>(params_.num_u_points > 0) +
+ static_cast<int>(params_.num_v_points > 0));
+ scaling_lut_chroma_buffer_.reset(new (std::nothrow) int16_t[buffer_size]);
+ if (scaling_lut_chroma_buffer_ == nullptr) return false;
+
+ int16_t* buffer = scaling_lut_chroma_buffer_.get();
+#if LIBGAV1_MSAN
+ // Quiet film grain / md5 msan warnings.
+ memset(buffer, 0, buffer_size * sizeof(buffer[0]));
+#endif
+ if (params_.num_u_points > 0) {
+ scaling_lut_u_ = buffer;
+ dsp.film_grain.initialize_scaling_lut(
+ params_.num_u_points, params_.point_u_value,
+ params_.point_u_scaling, scaling_lut_u_, kScalingLutLength);
+ buffer += kScalingLutLength;
+ }
+ if (params_.num_v_points > 0) {
+ scaling_lut_v_ = buffer;
+ dsp.film_grain.initialize_scaling_lut(
+ params_.num_v_points, params_.point_v_value,
+ params_.point_v_scaling, scaling_lut_v_, kScalingLutLength);
+ }
+ }
+ }
+ return true;
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::GenerateLumaGrain(const FilmGrainParams& params,
+ GrainType* luma_grain) {
+ // If params.num_y_points is equal to 0, Section 7.18.3.3 specifies we set
+ // the luma_grain array to all zeros. But the Note at the end of Section
+ // 7.18.3.3 says luma_grain "will never be read in this case". So we don't
+ // call GenerateLumaGrain if params.num_y_points is equal to 0.
+ assert(params.num_y_points > 0);
+ const int shift = kBitdepth12 - bitdepth + params.grain_scale_shift;
+ uint16_t seed = params.grain_seed;
+ GrainType* luma_grain_row = luma_grain;
+ for (int y = 0; y < kLumaHeight; ++y) {
+ for (int x = 0; x < kLumaWidth; ++x) {
+ luma_grain_row[x] = RightShiftWithRounding(
+ kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+ }
+ luma_grain_row += kLumaWidth;
+ }
+}
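+
+// For example, at bitdepth 8 with params.grain_scale_shift == 0, the shift
+// above is kBitdepth12 - 8 == 4; combined with the kGaussianSequence bound
+// (every entry below 2040), every generated luma grain value fits in int8_t.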
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::GenerateChromaGrains(const FilmGrainParams& params,
+ int chroma_width,
+ int chroma_height,
+ GrainType* u_grain,
+ GrainType* v_grain) {
+ const int shift = kBitdepth12 - bitdepth + params.grain_scale_shift;
+ if (params.num_u_points == 0 && !params.chroma_scaling_from_luma) {
+ memset(u_grain, 0, chroma_height * chroma_width * sizeof(*u_grain));
+ } else {
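+ // The seed is XORed with a spec-defined constant (0xb524 for U, 0x49d8 for
+ // V) so that each chroma plane draws a decorrelated random sequence.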
+ uint16_t seed = params.grain_seed ^ 0xb524;
+ GrainType* u_grain_row = u_grain;
+ assert(chroma_width > 0);
+ assert(chroma_height > 0);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ u_grain_row[x] = RightShiftWithRounding(
+ kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+ } while (++x < chroma_width);
+
+ u_grain_row += chroma_width;
+ } while (++y < chroma_height);
+ }
+ if (params.num_v_points == 0 && !params.chroma_scaling_from_luma) {
+ memset(v_grain, 0, chroma_height * chroma_width * sizeof(*v_grain));
+ } else {
+ GrainType* v_grain_row = v_grain;
+ uint16_t seed = params.grain_seed ^ 0x49d8;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ v_grain_row[x] = RightShiftWithRounding(
+ kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+ } while (++x < chroma_width);
+
+ v_grain_row += chroma_width;
+ } while (++y < chroma_height);
+ }
+}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AllocateNoiseStripes() {
+ const int half_height = DivideBy2(height_ + 1);
+ assert(half_height > 0);
+ // ceil(half_height / 16.0)
+ const int max_luma_num = DivideBy16(half_height + 15);
+ constexpr int kNoiseStripeHeight = 34;
+ size_t noise_buffer_size = kNoiseStripePadding;
+ if (params_.num_y_points > 0) {
+ noise_buffer_size += max_luma_num * kNoiseStripeHeight * width_;
+ }
+ if (!is_monochrome_) {
+ noise_buffer_size += 2 * max_luma_num *
+ (kNoiseStripeHeight >> subsampling_y_) *
+ SubsampledValue(width_, subsampling_x_);
+ }
+ noise_buffer_.reset(new (std::nothrow) GrainType[noise_buffer_size]);
+ if (noise_buffer_ == nullptr) return false;
+ GrainType* noise_buffer = noise_buffer_.get();
+ if (params_.num_y_points > 0) {
+ noise_stripes_[kPlaneY].Reset(max_luma_num, kNoiseStripeHeight * width_,
+ noise_buffer);
+ noise_buffer += max_luma_num * kNoiseStripeHeight * width_;
+ }
+ if (!is_monochrome_) {
+ noise_stripes_[kPlaneU].Reset(max_luma_num,
+ (kNoiseStripeHeight >> subsampling_y_) *
+ SubsampledValue(width_, subsampling_x_),
+ noise_buffer);
+ noise_buffer += max_luma_num * (kNoiseStripeHeight >> subsampling_y_) *
+ SubsampledValue(width_, subsampling_x_);
+ noise_stripes_[kPlaneV].Reset(max_luma_num,
+ (kNoiseStripeHeight >> subsampling_y_) *
+ SubsampledValue(width_, subsampling_x_),
+ noise_buffer);
+ }
+ return true;
+}
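+
+// A worked example of the sizing above: for a 1920x1080 luma plane,
+// half_height == 540 and max_luma_num == DivideBy16(555) == 34, so the luma
+// portion of the buffer holds 34 stripes of 34 * 1920 grain values (plus
+// kNoiseStripePadding).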
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AllocateNoiseImage() {
+ // When LIBGAV1_MSAN is enabled, zero initialize to quiet optimized film grain
+ // msan warnings.
+ constexpr bool zero_initialize = LIBGAV1_MSAN == 1;
+ if (params_.num_y_points > 0 &&
+ !noise_image_[kPlaneY].Reset(height_, width_ + kNoiseImagePadding,
+ zero_initialize)) {
+ return false;
+ }
+ if (!is_monochrome_) {
+ if (!noise_image_[kPlaneU].Reset(
+ (height_ + subsampling_y_) >> subsampling_y_,
+ ((width_ + subsampling_x_) >> subsampling_x_) + kNoiseImagePadding,
+ zero_initialize)) {
+ return false;
+ }
+ if (!noise_image_[kPlaneV].Reset(
+ (height_ + subsampling_y_) >> subsampling_y_,
+ ((width_ + subsampling_x_) >> subsampling_x_) + kNoiseImagePadding,
+ zero_initialize)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// Uses |stripe_start_offset| (derived from the overlap flag) to skip rows
+// that are covered by the overlap computation.
+template <int bitdepth>
+void FilmGrain<bitdepth>::ConstructNoiseImage(
+ const Array2DView<GrainType>* noise_stripes, int width, int height,
+ int subsampling_x, int subsampling_y, int stripe_start_offset,
+ Array2D<GrainType>* noise_image) {
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ const int plane_height = (height + subsampling_y) >> subsampling_y;
+ const int stripe_height = 32 >> subsampling_y;
+ const int stripe_mask = stripe_height - 1;
+ int y = 0;
+ // |luma_num| = y >> (5 - |subsampling_y|). Hence |luma_num| == 0 for all y
+ // below the first stripe height (32, or 16 when |subsampling_y| is 1).
+ const GrainType* first_noise_stripe = (*noise_stripes)[0];
+ do {
+ memcpy((*noise_image)[y], first_noise_stripe + y * plane_width,
+ plane_width * sizeof(first_noise_stripe[0]));
+ } while (++y < std::min(stripe_height, plane_height));
+ // End special iterations for luma_num == 0.
+
+ int luma_num = 1;
+ for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ int i = stripe_start_offset;
+ do {
+ memcpy((*noise_image)[y + i], noise_stripe + i * plane_width,
+ plane_width * sizeof(noise_stripe[0]));
+ } while (++i < stripe_height);
+ }
+
+ // If there is a partial stripe, copy any rows beyond the overlap rows.
+ const int remaining_height = plane_height - y;
+ if (remaining_height > stripe_start_offset) {
+ assert(luma_num < noise_stripes->rows());
+ const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+ int i = stripe_start_offset;
+ do {
+ memcpy((*noise_image)[y + i], noise_stripe + i * plane_width,
+ plane_width * sizeof(noise_stripe[0]));
+ } while (++i < remaining_height);
+ }
+}
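+
+// For example, with subsampling_y == 0 and overlap enabled
+// (stripe_start_offset == 2): stripe 0 fills noise image rows 0..31,
+// stripe 1 fills rows 34..63, and rows 32..33 are written later by
+// construct_noise_image_overlap.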
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::BlendNoiseChromaWorker(
+ const dsp::Dsp& dsp, const Plane* planes, int num_planes,
+ std::atomic<int>* job_counter, int min_value, int max_chroma,
+ const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+ const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+ ptrdiff_t source_stride_uv, uint8_t* dest_plane_u, uint8_t* dest_plane_v,
+ ptrdiff_t dest_stride_uv) {
+ assert(num_planes > 0);
+ const int full_jobs_per_plane = height_ / kFrameChunkHeight;
+ const int remainder_job_height = height_ & (kFrameChunkHeight - 1);
+ const int total_full_jobs = full_jobs_per_plane * num_planes;
+ // If the frame height is not a multiple of kFrameChunkHeight, one job with
+ // a smaller number of rows is necessary at the end of each plane.
+ const int total_jobs =
+ total_full_jobs + ((remainder_job_height == 0) ? 0 : num_planes);
+ int job_index;
+ // Each job corresponds to a slice of kFrameChunkHeight rows in the luma
+ // plane. dsp->blend_noise_chroma handles subsampling.
+ // This loop body handles a slice of one plane or the other, depending on
+ // which are active. That way, threads working on consecutive jobs will keep
+ // the same region of luma source in working memory.
+ while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) <
+ total_jobs) {
+ const Plane plane = planes[job_index % num_planes];
+ const int slice_index = job_index / num_planes;
+ const int start_height = slice_index * kFrameChunkHeight;
+ const int job_height = std::min(height_ - start_height, kFrameChunkHeight);
+
+ const auto* source_cursor_y = reinterpret_cast<const Pixel*>(
+ source_plane_y + start_height * source_stride_y);
+ const int16_t* scaling_lut_uv;
+ const uint8_t* source_plane_uv;
+ uint8_t* dest_plane_uv;
+
+ if (plane == kPlaneU) {
+ scaling_lut_uv = scaling_lut_u_;
+ source_plane_uv = source_plane_u;
+ dest_plane_uv = dest_plane_u;
+ } else {
+ assert(plane == kPlaneV);
+ scaling_lut_uv = scaling_lut_v_;
+ source_plane_uv = source_plane_v;
+ dest_plane_uv = dest_plane_v;
+ }
+ const auto* source_cursor_uv = reinterpret_cast<const Pixel*>(
+ source_plane_uv + (start_height >> subsampling_y_) * source_stride_uv);
+ auto* dest_cursor_uv = reinterpret_cast<Pixel*>(
+ dest_plane_uv + (start_height >> subsampling_y_) * dest_stride_uv);
+ dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+ plane, params_, noise_image_, min_value, max_chroma, width_, job_height,
+ start_height, subsampling_x_, subsampling_y_, scaling_lut_uv,
+ source_cursor_y, source_stride_y, source_cursor_uv, source_stride_uv,
+ dest_cursor_uv, dest_stride_uv);
+ }
+}
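+
+// For example, with height_ == 100 and both chroma planes active
+// (num_planes == 2): full_jobs_per_plane == 12 and remainder_job_height == 4,
+// so total_jobs == 26, and consecutive job indices alternate between the U
+// and V planes within each slice.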
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::BlendNoiseLumaWorker(
+ const dsp::Dsp& dsp, std::atomic<int>* job_counter, int min_value,
+ int max_luma, const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+ uint8_t* dest_plane_y, ptrdiff_t dest_stride_y) {
+ const int total_full_jobs = height_ / kFrameChunkHeight;
+ const int remainder_job_height = height_ & (kFrameChunkHeight - 1);
+ const int total_jobs =
+ total_full_jobs + static_cast<int>(remainder_job_height > 0);
+ int job_index;
+ // Each job is some number of rows in a plane.
+ while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) <
+ total_jobs) {
+ const int start_height = job_index * kFrameChunkHeight;
+ const int job_height = std::min(height_ - start_height, kFrameChunkHeight);
+
+ const auto* source_cursor_y = reinterpret_cast<const Pixel*>(
+ source_plane_y + start_height * source_stride_y);
+ auto* dest_cursor_y =
+ reinterpret_cast<Pixel*>(dest_plane_y + start_height * dest_stride_y);
+ dsp.film_grain.blend_noise_luma(
+ noise_image_, min_value, max_luma, params_.chroma_scaling, width_,
+ job_height, start_height, scaling_lut_y_, source_cursor_y,
+ source_stride_y, dest_cursor_y, dest_stride_y);
+ }
+}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AddNoise(
+ const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+ const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+ ptrdiff_t source_stride_uv, uint8_t* dest_plane_y, ptrdiff_t dest_stride_y,
+ uint8_t* dest_plane_u, uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv) {
+ if (!Init()) {
+ LIBGAV1_DLOG(ERROR, "Init() failed.");
+ return false;
+ }
+ if (!AllocateNoiseStripes()) {
+ LIBGAV1_DLOG(ERROR, "AllocateNoiseStripes() failed.");
+ return false;
+ }
+
+ const dsp::Dsp& dsp = *dsp::GetDspTable(bitdepth);
+ const bool use_luma = params_.num_y_points > 0;
+
+ // Construct noise stripes.
+ if (use_luma) {
+ // The luma plane is never subsampled.
+ dsp.film_grain
+ .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+ luma_grain_, params_.grain_seed, width_, height_,
+ /*subsampling_x=*/0, /*subsampling_y=*/0, &noise_stripes_[kPlaneY]);
+ }
+ if (!is_monochrome_) {
+ dsp.film_grain
+ .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+ u_grain_, params_.grain_seed, width_, height_, subsampling_x_,
+ subsampling_y_, &noise_stripes_[kPlaneU]);
+ dsp.film_grain
+ .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+ v_grain_, params_.grain_seed, width_, height_, subsampling_x_,
+ subsampling_y_, &noise_stripes_[kPlaneV]);
+ }
+
+ if (!AllocateNoiseImage()) {
+ LIBGAV1_DLOG(ERROR, "AllocateNoiseImage() failed.");
+ return false;
+ }
+
+ // Construct noise image.
+ if (use_luma) {
+ ConstructNoiseImage(
+ &noise_stripes_[kPlaneY], width_, height_, /*subsampling_x=*/0,
+ /*subsampling_y=*/0, static_cast<int>(params_.overlap_flag) << 1,
+ &noise_image_[kPlaneY]);
+ if (params_.overlap_flag) {
+ dsp.film_grain.construct_noise_image_overlap(
+ &noise_stripes_[kPlaneY], width_, height_, /*subsampling_x=*/0,
+ /*subsampling_y=*/0, &noise_image_[kPlaneY]);
+ }
+ }
+ if (!is_monochrome_) {
+ ConstructNoiseImage(&noise_stripes_[kPlaneU], width_, height_,
+ subsampling_x_, subsampling_y_,
+ static_cast<int>(params_.overlap_flag)
+ << (1 - subsampling_y_),
+ &noise_image_[kPlaneU]);
+ ConstructNoiseImage(&noise_stripes_[kPlaneV], width_, height_,
+ subsampling_x_, subsampling_y_,
+ static_cast<int>(params_.overlap_flag)
+ << (1 - subsampling_y_),
+ &noise_image_[kPlaneV]);
+ if (params_.overlap_flag) {
+ dsp.film_grain.construct_noise_image_overlap(
+ &noise_stripes_[kPlaneU], width_, height_, subsampling_x_,
+ subsampling_y_, &noise_image_[kPlaneU]);
+ dsp.film_grain.construct_noise_image_overlap(
+ &noise_stripes_[kPlaneV], width_, height_, subsampling_x_,
+ subsampling_y_, &noise_image_[kPlaneV]);
+ }
+ }
+
+ // Blend noise image.
+ int min_value;
+ int max_luma;
+ int max_chroma;
+ if (params_.clip_to_restricted_range) {
+ min_value = 16 << (bitdepth - kBitdepth8);
+ max_luma = 235 << (bitdepth - kBitdepth8);
+ if (color_matrix_is_identity_) {
+ max_chroma = max_luma;
+ } else {
+ max_chroma = 240 << (bitdepth - kBitdepth8);
+ }
+ } else {
+ min_value = 0;
+ max_luma = (256 << (bitdepth - kBitdepth8)) - 1;
+ max_chroma = max_luma;
+ }
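+ // For example, at bitdepth 10 with clip_to_restricted_range set:
+ // min_value == 16 << 2 == 64, max_luma == 235 << 2 == 940, and (for a
+ // non-identity color matrix) max_chroma == 240 << 2 == 960.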
+
+ // Handle all chroma planes first because luma source may be altered in place.
+ if (!is_monochrome_) {
+ // A plain array is used here because a Vector cannot be captured by copy
+ // in the lambda that is scheduled on the thread pool.
+ Plane planes_to_blend[2];
+ int num_planes = 0;
+ if (params_.chroma_scaling_from_luma) {
+ // Both noise planes are computed from the luma scaling lookup table.
+ planes_to_blend[num_planes++] = kPlaneU;
+ planes_to_blend[num_planes++] = kPlaneV;
+ } else {
+ const int height_uv = SubsampledValue(height_, subsampling_y_);
+ const int width_uv = SubsampledValue(width_, subsampling_x_);
+
+ // Noise is applied according to a lookup table defined by piecewise
+ // linear "points." If the lookup table is empty, that corresponds to
+ // outputting zero noise.
+ if (params_.num_u_points == 0) {
+ CopyImagePlane<Pixel>(source_plane_u, source_stride_uv, width_uv,
+ height_uv, dest_plane_u, dest_stride_uv);
+ } else {
+ planes_to_blend[num_planes++] = kPlaneU;
+ }
+ if (params_.num_v_points == 0) {
+ CopyImagePlane<Pixel>(source_plane_v, source_stride_uv, width_uv,
+ height_uv, dest_plane_v, dest_stride_uv);
+ } else {
+ planes_to_blend[num_planes++] = kPlaneV;
+ }
+ }
+ if (thread_pool_ != nullptr && num_planes > 0) {
+ const int num_workers = thread_pool_->num_threads();
+ BlockingCounter pending_workers(num_workers);
+ std::atomic<int> job_counter(0);
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool_->Schedule([this, dsp, &pending_workers, &planes_to_blend,
+ num_planes, &job_counter, min_value, max_chroma,
+ source_plane_y, source_stride_y, source_plane_u,
+ source_plane_v, source_stride_uv, dest_plane_u,
+ dest_plane_v, dest_stride_uv]() {
+ BlendNoiseChromaWorker(dsp, planes_to_blend, num_planes, &job_counter,
+ min_value, max_chroma, source_plane_y,
+ source_stride_y, source_plane_u,
+ source_plane_v, source_stride_uv, dest_plane_u,
+ dest_plane_v, dest_stride_uv);
+ pending_workers.Decrement();
+ });
+ }
+ BlendNoiseChromaWorker(
+ dsp, planes_to_blend, num_planes, &job_counter, min_value, max_chroma,
+ source_plane_y, source_stride_y, source_plane_u, source_plane_v,
+ source_stride_uv, dest_plane_u, dest_plane_v, dest_stride_uv);
+
+ pending_workers.Wait();
+ } else {
+ // Single threaded.
+ if (params_.num_u_points > 0 || params_.chroma_scaling_from_luma) {
+ dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+ kPlaneU, params_, noise_image_, min_value, max_chroma, width_,
+ height_, /*start_height=*/0, subsampling_x_, subsampling_y_,
+ scaling_lut_u_, source_plane_y, source_stride_y, source_plane_u,
+ source_stride_uv, dest_plane_u, dest_stride_uv);
+ }
+ if (params_.num_v_points > 0 || params_.chroma_scaling_from_luma) {
+ dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+ kPlaneV, params_, noise_image_, min_value, max_chroma, width_,
+ height_, /*start_height=*/0, subsampling_x_, subsampling_y_,
+ scaling_lut_v_, source_plane_y, source_stride_y, source_plane_v,
+ source_stride_uv, dest_plane_v, dest_stride_uv);
+ }
+ }
+ }
+ if (use_luma) {
+ if (thread_pool_ != nullptr) {
+ const int num_workers = thread_pool_->num_threads();
+ BlockingCounter pending_workers(num_workers);
+ std::atomic<int> job_counter(0);
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool_->Schedule(
+ [this, dsp, &pending_workers, &job_counter, min_value, max_luma,
+ source_plane_y, source_stride_y, dest_plane_y, dest_stride_y]() {
+ BlendNoiseLumaWorker(dsp, &job_counter, min_value, max_luma,
+ source_plane_y, source_stride_y,
+ dest_plane_y, dest_stride_y);
+ pending_workers.Decrement();
+ });
+ }
+
+ BlendNoiseLumaWorker(dsp, &job_counter, min_value, max_luma,
+ source_plane_y, source_stride_y, dest_plane_y,
+ dest_stride_y);
+ pending_workers.Wait();
+ } else {
+ dsp.film_grain.blend_noise_luma(
+ noise_image_, min_value, max_luma, params_.chroma_scaling, width_,
+ height_, /*start_height=*/0, scaling_lut_y_, source_plane_y,
+ source_stride_y, dest_plane_y, dest_stride_y);
+ }
+ } else {
+ CopyImagePlane<Pixel>(source_plane_y, source_stride_y, width_, height_,
+ dest_plane_y, dest_stride_y);
+ }
+
+ return true;
+}
+
+// Explicit instantiations.
+template class FilmGrain<kBitdepth8>;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template class FilmGrain<kBitdepth10>;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+template class FilmGrain<kBitdepth12>;
+#endif
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FILM_GRAIN_H_
+#define LIBGAV1_SRC_FILM_GRAIN_H_
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+// Film grain synthesis function signature. Section 7.18.3.
+// This function generates film grain noise and blends the noise with the
+// decoded frame.
+// |source_plane_y|, |source_plane_u|, and |source_plane_v| are the plane
+// buffers of the decoded frame. They are blended with the film grain noise and
+// written to |dest_plane_y|, |dest_plane_u|, and |dest_plane_v| as final
+// output for display. |source_plane_p| and |dest_plane_p| (where p is y, u, or
+// v) may point to the same buffer, in which case the film grain noise is added
+// in place.
+// |film_grain_params| are the parameters read from the frame header.
+// If |is_monochrome| is true, only the Y plane needs to be processed.
+// |color_matrix_is_identity| is true if the matrix_coefficients field in the
+// sequence header's color config is MC_IDENTITY.
+// |width| is the upscaled width of the frame.
+// |height| is the frame height.
+// |subsampling_x| and |subsampling_y| are subsamplings for UV planes, not used
+// if |is_monochrome| is true.
+// Returns true on success, or false on failure (e.g., out of memory).
+using FilmGrainSynthesisFunc = bool (*)(
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_u, ptrdiff_t source_stride_u,
+ const void* source_plane_v, ptrdiff_t source_stride_v,
+ const FilmGrainParams& film_grain_params, bool is_monochrome,
+ bool color_matrix_is_identity, int width, int height, int subsampling_x,
+ int subsampling_y, void* dest_plane_y, ptrdiff_t dest_stride_y,
+ void* dest_plane_u, ptrdiff_t dest_stride_u, void* dest_plane_v,
+ ptrdiff_t dest_stride_v);
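+
+// A minimal caller sketch (hypothetical: the |frame| and |display| names and
+// the function-pointer lookup are assumptions, not part of this header):
+//   FilmGrainSynthesisFunc synthesis = dsp->film_grain_synthesis;
+//   if (!synthesis(frame.y, frame.y_stride, frame.u, frame.u_stride, frame.v,
+//                  frame.v_stride, frame_header.film_grain_params,
+//                  /*is_monochrome=*/false,
+//                  /*color_matrix_is_identity=*/false, upscaled_width,
+//                  frame_height, subsampling_x, subsampling_y, display.y,
+//                  display.y_stride, display.u, display.u_stride, display.v,
+//                  display.v_stride)) {
+//     // Handle out-of-memory failure.
+//   }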
+
+// Section 7.18.3.5. Add noise synthesis process.
+template <int bitdepth>
+class FilmGrain {
+ public:
+ using GrainType =
+ typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+ FilmGrain(const FilmGrainParams& params, bool is_monochrome,
+ bool color_matrix_is_identity, int subsampling_x, int subsampling_y,
+ int width, int height, ThreadPool* thread_pool);
+
+ // Note: These static methods are declared public so that the unit tests can
+ // call them.
+
+ static void GenerateLumaGrain(const FilmGrainParams& params,
+ GrainType* luma_grain);
+
+ // Generates the white noise arrays |u_grain| and |v_grain|, each
+ // |chroma_width| samples wide and |chroma_height| samples high.
+ static void GenerateChromaGrains(const FilmGrainParams& params,
+ int chroma_width, int chroma_height,
+ GrainType* u_grain, GrainType* v_grain);
+
+ // Copies rows from |noise_stripes| to |noise_image|, skipping rows that are
+ // subject to overlap.
+ static void ConstructNoiseImage(const Array2DView<GrainType>* noise_stripes,
+ int width, int height, int subsampling_x,
+ int subsampling_y, int stripe_start_offset,
+ Array2D<GrainType>* noise_image);
+
+ // Combines the film grain with the image data.
+ bool AddNoise(const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+ const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+ ptrdiff_t source_stride_uv, uint8_t* dest_plane_y,
+ ptrdiff_t dest_stride_y, uint8_t* dest_plane_u,
+ uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv);
+
+ private:
+ using Pixel =
+ typename std::conditional<bitdepth == 8, uint8_t, uint16_t>::type;
+ static constexpr int kScalingLutLength =
+ (bitdepth == 10)
+ ? (kScalingLookupTableSize + kScalingLookupTablePadding) << 2
+ : kScalingLookupTableSize + kScalingLookupTablePadding;
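+ // At 10bpp the lookup table is expanded 4x, presumably so a 10-bit pixel
+ // value can index it directly; 8bpp and 12bpp use the base-size table.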
+
+ bool Init();
+
+ // Allocates noise_stripes_.
+ bool AllocateNoiseStripes();
+
+ bool AllocateNoiseImage();
+
+ void BlendNoiseChromaWorker(const dsp::Dsp& dsp, const Plane* planes,
+ int num_planes, std::atomic<int>* job_counter,
+ int min_value, int max_chroma,
+ const uint8_t* source_plane_y,
+ ptrdiff_t source_stride_y,
+ const uint8_t* source_plane_u,
+ const uint8_t* source_plane_v,
+ ptrdiff_t source_stride_uv, uint8_t* dest_plane_u,
+ uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv);
+
+ void BlendNoiseLumaWorker(const dsp::Dsp& dsp, std::atomic<int>* job_counter,
+ int min_value, int max_luma,
+ const uint8_t* source_plane_y,
+ ptrdiff_t source_stride_y, uint8_t* dest_plane_y,
+ ptrdiff_t dest_stride_y);
+
+ const FilmGrainParams& params_;
+ const bool is_monochrome_;
+ const bool color_matrix_is_identity_;
+ const int subsampling_x_;
+ const int subsampling_y_;
+ // Frame width and height.
+ const int width_;
+ const int height_;
+ // Section 7.18.3.3. Dimensions of the noise templates for chroma, which are
+ // known as CbGrain and CrGrain.
+ // These templates are used to construct the noise image for each plane by
+ // copying 32x32 blocks, at pseudorandom offsets, into "noise stripes."
+ // The luma noise template, known as the LumaGrain array, is an 82x73 block.
+ // Under subsampling, the width and height of the chroma templates become 44
+ // and 38, respectively.
+ // For more details see:
+ // A. Norkin and N. Birkbeck, "Film Grain Synthesis for AV1 Video Codec," 2018
+ // Data Compression Conference, Snowbird, UT, 2018, pp. 3-12.
+ const int template_uv_width_;
+ const int template_uv_height_;
+ // LumaGrain. The luma_grain array contains white noise generated for luma.
+ // The array size is fixed but subject to further optimization for SIMD.
+ GrainType luma_grain_[kLumaHeight * kLumaWidth];
+ // CbGrain and CrGrain. The maximum size of the u_grain and v_grain arrays is
+ // kMaxChromaHeight * kMaxChromaWidth. The actual size is
+ // template_uv_height_ * template_uv_width_.
+ GrainType u_grain_[kMaxChromaHeight * kMaxChromaWidth];
+ GrainType v_grain_[kMaxChromaHeight * kMaxChromaWidth];
+ // Scaling lookup tables.
+ int16_t scaling_lut_y_[kScalingLutLength];
+ int16_t* scaling_lut_u_ = nullptr;
+ int16_t* scaling_lut_v_ = nullptr;
+ // If allocated, this buffer holds 2 * kScalingLutLength values, and
+ // scaling_lut_u_ and scaling_lut_v_ point into this buffer. Otherwise,
+ // scaling_lut_u_ and scaling_lut_v_ point to scaling_lut_y_.
+ std::unique_ptr<int16_t[]> scaling_lut_chroma_buffer_;
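+ // (Assumed layout when the buffer is allocated: scaling_lut_u_ points at
+ // offset 0 and scaling_lut_v_ at offset kScalingLutLength.)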
+
+ // A two-dimensional array of noise data for each plane, generated for each
+ // stripe of 32 luma rows in the image. The first dimension, called luma_num,
+ // indexes the stripes; the second dimension is the size of one noise stripe.
+ //
+ // Each row of the Array2DView noise_stripes_[plane] is a conceptually
+ // two-dimensional array of |GrainType|s. The two-dimensional array of
+ // |GrainType|s is flattened into a one-dimensional buffer in this
+ // implementation.
+ //
+ // noise_stripes_[kPlaneY][luma_num] is an array that has 34 rows and
+ // |width_| columns and contains noise for the luma component.
+ //
+ // noise_stripes_[kPlaneU][luma_num] or noise_stripes_[kPlaneV][luma_num]
+ // is an array that has (34 >> subsampling_y_) rows and
+ // SubsampledValue(width_, subsampling_x_) columns and contains noise for the
+ // chroma components.
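+ //
+ // For example, with subsampling_x = subsampling_y = 1, each chroma noise
+ // stripe has (34 >> 1) = 17 rows and (width_ + 1) / 2 columns.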
+ Array2DView<GrainType> noise_stripes_[kMaxPlanes];
+ // Owns the memory that the elements of noise_stripes_ point to.
+ std::unique_ptr<GrainType[]> noise_buffer_;
+
+ Array2D<GrainType> noise_image_[kMaxPlanes];
+ ThreadPool* const thread_pool_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_FILM_GRAIN_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <string>
+#include <tuple>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/film_grain.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+constexpr int kNumSpeedTests = 50;
+constexpr int kNumFilmGrainTestParams = 10;
+constexpr size_t kLumaBlockSize = kLumaWidth * kLumaHeight;
+constexpr size_t kChromaBlockSize = kMaxChromaWidth * kMaxChromaHeight;
+// Dimensions used by the unit tests that apply grain to the whole frame.
+constexpr size_t kNumTestStripes = 64;
+constexpr int kNoiseStripeHeight = 34;
+constexpr size_t kFrameWidth = 1921;
+constexpr size_t kFrameHeight = (kNumTestStripes - 1) * 32 + 1;
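+// kFrameWidth is odd, which exercises subsampled-width rounding, and
+// kFrameHeight works out to 2017 rows: 63 full 32-row stripes plus a final
+// 1-row stripe, exercising the bottom-edge stripe handling.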
+
+/*
+ The film grain parameters for 10 frames were generated with the following
+ command line:
+ aomenc --end-usage=q --cq-level=20 --cpu-used=8 -w 1920 -h 1080 \
+ --denoise-noise-level=50 --ivf breaking_bad_21m23s_10frames.1920_1080.yuv \
+ -o breaking_bad_21m23s_10frames.1920_1080.noise50.ivf
+*/
+constexpr FilmGrainParams kFilmGrainParams[10] = {
+ {/*apply_grain=*/true,
+ /*update_grain=*/true,
+ /*chroma_scaling_from_luma=*/false,
+ /*overlap_flag=*/true,
+ /*clip_to_restricted_range=*/false,
+ /*num_y_points=*/7,
+ /*num_u_points=*/8,
+ /*num_v_points=*/8,
+ /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0},
+ /*point_y_scaling=*/{71, 71, 91, 99, 98, 100, 100, 0, 0, 0, 0, 0, 0, 0},
+ /*point_u_value=*/{0, 13, 27, 40, 54, 67, 94, 255, 0, 0},
+ /*point_u_scaling=*/{37, 37, 43, 48, 48, 50, 51, 51, 0, 0},
+ /*point_v_value=*/{0, 13, 27, 40, 54, 67, 107, 255, 0, 0},
+ /*point_v_scaling=*/{48, 48, 43, 33, 32, 33, 34, 34, 0, 0},
+ /*chroma_scaling=*/11,
+ /*auto_regression_coeff_lag=*/3,
+ /*auto_regression_coeff_y=*/{2, -2, -2, 10, 3, -2, 1, -4,
+ 5, -1, -25, -13, 3, -1, 0, 7,
+ -20, 103, 26, -2, 1, 14, -49, 117},
+ /*auto_regression_coeff_u=*/{-2, 1, -3, 4, -4, 0, 3, 5, -5,
+ -17, 17, 0, -10, -5, -3, -30, 14, 70,
+ 29, 9, -2, -10, 50, 71, -11},
+ /*auto_regression_coeff_v=*/{3, -2, -7, 6, -7, -8, 3, 1, -12,
+ -15, 28, 5, -11, -2, -7, -27, 32, 62,
+ 31, 18, -2, -6, 61, 43, 2},
+ /*auto_regression_shift=*/8,
+ /*grain_seed=*/7391,
+ /*reference_index=*/0,
+ /*grain_scale_shift=*/0,
+ /*u_multiplier=*/0,
+ /*u_luma_multiplier=*/64,
+ /*u_offset=*/0,
+ /*v_multiplier=*/0,
+ /*v_luma_multiplier=*/64,
+ /*v_offset=*/0},
+ {/*apply_grain=*/true,
+ /*update_grain=*/true,
+ /*chroma_scaling_from_luma=*/false,
+ /*overlap_flag=*/true,
+ /*clip_to_restricted_range=*/false,
+ /*num_y_points=*/8,
+ /*num_u_points=*/7,
+ /*num_v_points=*/8,
+ /*point_y_value=*/{0, 13, 27, 40, 54, 94, 134, 255, 0, 0, 0, 0, 0, 0},
+ /*point_y_scaling=*/{72, 72, 91, 99, 97, 100, 102, 102, 0, 0, 0, 0, 0, 0},
+ /*point_u_value=*/{0, 13, 40, 54, 67, 134, 255, 0, 0, 0},
+ /*point_u_scaling=*/{38, 38, 50, 49, 51, 53, 53, 0, 0, 0},
+ /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 255, 0, 0},
+ /*point_v_scaling=*/{50, 50, 45, 34, 33, 35, 37, 37, 0, 0},
+ /*chroma_scaling=*/11,
+ /*auto_regression_coeff_lag=*/3,
+ /*auto_regression_coeff_y=*/{2, -2, -2, 10, 3, -1, 1, -3,
+ 3, 1, -27, -12, 2, -1, 1, 7,
+ -17, 100, 27, 0, -1, 13, -50, 116},
+ /*auto_regression_coeff_u=*/{-3, 1, -2, 3, -3, -1, 2, 5, -3,
+ -16, 16, -2, -10, -2, -1, -31, 14, 70,
+ 29, 9, -1, -10, 47, 70, -11},
+ /*auto_regression_coeff_v=*/{1, 0, -5, 5, -6, -6, 2, 1, -10,
+ -14, 26, 4, -10, -3, -5, -26, 29, 63,
+ 31, 17, -1, -6, 55, 47, 2},
+ /*auto_regression_shift=*/8,
+ /*grain_seed=*/10772,
+ /*reference_index=*/0,
+ /*grain_scale_shift=*/0,
+ /*u_multiplier=*/0,
+ /*u_luma_multiplier=*/64,
+ /*u_offset=*/0,
+ /*v_multiplier=*/0,
+ /*v_luma_multiplier=*/64,
+ /*v_offset=*/0},
+ {/*apply_grain=*/true,
+ /*update_grain=*/true,
+ /*chroma_scaling_from_luma=*/false,
+ /*overlap_flag=*/true,
+ /*clip_to_restricted_range=*/false,
+ /*num_y_points=*/8,
+ /*num_u_points=*/7,
+ /*num_v_points=*/8,
+ /*point_y_value=*/{0, 13, 27, 40, 54, 94, 134, 255, 0, 0, 0, 0, 0, 0},
+ /*point_y_scaling=*/{71, 71, 91, 99, 98, 101, 103, 103, 0, 0, 0, 0, 0, 0},
+ /*point_u_value=*/{0, 13, 40, 54, 81, 107, 255, 0, 0, 0},
+ /*point_u_scaling=*/{37, 37, 49, 48, 51, 52, 52, 0, 0, 0},
+ /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 255, 0, 0},
+ /*point_v_scaling=*/{49, 49, 44, 34, 32, 34, 36, 36, 0, 0},
+ /*chroma_scaling=*/11,
+ /*auto_regression_coeff_lag=*/3,
+ /*auto_regression_coeff_y=*/{1, -2, -2, 10, 3, -1, 1, -4,
+ 4, 1, -26, -12, 2, -1, 1, 7,
+ -18, 101, 26, -1, 0, 13, -49, 116},
+ /*auto_regression_coeff_u=*/{-3, 1, -3, 4, -3, -1, 2, 5, -4,
+ -16, 17, -2, -10, -3, -2, -31, 15, 70,
+ 28, 9, -1, -10, 48, 70, -11},
+ /*auto_regression_coeff_v=*/{1, -1, -6, 5, -6, -7, 2, 2, -11,
+ -14, 27, 5, -11, -3, -6, -26, 30, 62,
+ 30, 18, -2, -6, 58, 45, 2},
+ /*auto_regression_shift=*/8,
+ /*grain_seed=*/14153,
+ /*reference_index=*/0,
+ /*grain_scale_shift=*/0,
+ /*u_multiplier=*/0,
+ /*u_luma_multiplier=*/64,
+ /*u_offset=*/0,
+ /*v_multiplier=*/0,
+ /*v_luma_multiplier=*/64,
+ /*v_offset=*/0},
+ {/*apply_grain=*/true,
+ /*update_grain=*/true,
+ /*chroma_scaling_from_luma=*/false,
+ /*overlap_flag=*/true,
+ /*clip_to_restricted_range=*/false,
+ /*num_y_points=*/7,
+ /*num_u_points=*/5,
+ /*num_v_points=*/7,
+ /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0},
+ /*point_y_scaling=*/{71, 71, 90, 99, 98, 100, 100, 0, 0, 0, 0, 0, 0, 0},
+ /*point_u_value=*/{0, 13, 40, 107, 255, 0, 0, 0, 0, 0},
+ /*point_u_scaling=*/{37, 37, 48, 51, 51, 0, 0, 0, 0, 0},
+ /*point_v_value=*/{0, 13, 27, 40, 54, 94, 255, 0, 0, 0},
+ /*point_v_scaling=*/{49, 49, 43, 33, 32, 34, 34, 0, 0, 0},
+ /*chroma_scaling=*/11,
+ /*auto_regression_coeff_lag=*/3,
+ /*auto_regression_coeff_y=*/{2, -2, -2, 10, 3, -1, 1, -4,
+ 6, 0, -26, -13, 3, -1, 1, 6,
+ -20, 103, 26, -2, 1, 13, -48, 117},
+ /*auto_regression_coeff_u=*/{-3, 1, -2, 4, -4, -1, 2, 5, -5,
+ -16, 18, -1, -10, -3, -2, -30, 16, 69,
+ 28, 9, -2, -10, 50, 68, -11},
+ /*auto_regression_coeff_v=*/{2, -1, -6, 5, -6, -7, 2, 2, -11,
+ -15, 29, 4, -10, -3, -6, -26, 30, 62,
+ 31, 18, -3, -6, 59, 45, 3},
+ /*auto_regression_shift=*/8,
+ /*grain_seed=*/17534,
+ /*reference_index=*/0,
+ /*grain_scale_shift=*/0,
+ /*u_multiplier=*/0,
+ /*u_luma_multiplier=*/64,
+ /*u_offset=*/0,
+ /*v_multiplier=*/0,
+ /*v_luma_multiplier=*/64,
+ /*v_offset=*/0},
+ {/*apply_grain=*/true,
+ /*update_grain=*/true,
+ /*chroma_scaling_from_luma=*/false,
+ /*overlap_flag=*/true,
+ /*clip_to_restricted_range=*/false,
+ /*num_y_points=*/8,
+ /*num_u_points=*/7,
+ /*num_v_points=*/7,
+ /*point_y_value=*/{0, 13, 27, 40, 54, 94, 134, 255, 0, 0, 0, 0, 0, 0},
+ /*point_y_scaling=*/{71, 71, 91, 99, 98, 101, 103, 103, 0, 0, 0, 0, 0, 0},
+ /*point_u_value=*/{0, 13, 40, 54, 81, 107, 255, 0, 0, 0},
+ /*point_u_scaling=*/{37, 37, 49, 49, 52, 53, 53, 0, 0, 0},
+ /*point_v_value=*/{0, 13, 27, 40, 54, 94, 255, 0, 0, 0},
+ /*point_v_scaling=*/{50, 50, 44, 34, 33, 36, 37, 0, 0, 0},
+ /*chroma_scaling=*/11,
+ /*auto_regression_coeff_lag=*/3,
+ /*auto_regression_coeff_y=*/{2, -2, -2, 10, 3, -1, 1, -4,
+ 3, 1, -26, -12, 2, -1, 1, 7,
+ -17, 101, 26, 0, 0, 13, -50, 116},
+ /*auto_regression_coeff_u=*/{-2, 1, -2, 3, -3, -1, 2, 5, -4,
+ -16, 16, -2, -10, -3, -1, -31, 14, 70,
+ 28, 9, -1, -10, 48, 70, -11},
+ /*auto_regression_coeff_v=*/{1, 0, -5, 5, -6, -6, 2, 2, -10,
+ -14, 26, 4, -10, -3, -5, -26, 29, 63,
+ 30, 17, -1, -6, 56, 47, 3},
+ /*auto_regression_shift=*/8,
+ /*grain_seed=*/20915,
+ /*reference_index=*/0,
+ /*grain_scale_shift=*/0,
+ /*u_multiplier=*/0,
+ /*u_luma_multiplier=*/64,
+ /*u_offset=*/0,
+ /*v_multiplier=*/0,
+ /*v_luma_multiplier=*/64,
+ /*v_offset=*/0},
+ {/*apply_grain=*/true,
+ /*update_grain=*/true,
+ /*chroma_scaling_from_luma=*/false,
+ /*overlap_flag=*/true,
+ /*clip_to_restricted_range=*/false,
+ /*num_y_points=*/7,
+ /*num_u_points=*/7,
+ /*num_v_points=*/7,
+ /*point_y_value=*/{0, 13, 27, 40, 54, 134, 255, 0, 0, 0, 0, 0, 0, 0},
+ /*point_y_scaling=*/{72, 72, 91, 99, 97, 101, 101, 0, 0, 0, 0, 0, 0, 0},
+ /*point_u_value=*/{0, 13, 40, 54, 67, 107, 255, 0, 0, 0},
+ /*point_u_scaling=*/{38, 38, 51, 50, 52, 53, 54, 0, 0, 0},
+ /*point_v_value=*/{0, 13, 27, 40, 54, 94, 255, 0, 0, 0},
+ /*point_v_scaling=*/{51, 51, 45, 35, 33, 36, 36, 0, 0, 0},
+ /*chroma_scaling=*/11,
+ /*auto_regression_coeff_lag=*/3,
+ /*auto_regression_coeff_y=*/{2, -2, -2, 9, 3, -1, 1, -3,
+ 2, 2, -27, -12, 2, 0, 1, 7,
+ -16, 100, 27, 0, -1, 13, -51, 116},
+ /*auto_regression_coeff_u=*/{-3, 1, -2, 3, -3, -1, 1, 4, -2,
+ -17, 14, -3, -10, -2, 0, -31, 14, 71,
+ 29, 8, -2, -10, 45, 71, -11},
+ /*auto_regression_coeff_v=*/{0, -1, -5, 4, -6, -5, 2, 1, -9,
+ -14, 24, 3, -10, -3, -4, -25, 29, 63,
+ 31, 16, -1, -7, 54, 48, 2},
+ /*auto_regression_shift=*/8,
+ /*grain_seed=*/24296,
+ /*reference_index=*/0,
+ /*grain_scale_shift=*/0,
+ /*u_multiplier=*/0,
+ /*u_luma_multiplier=*/64,
+ /*u_offset=*/0,
+ /*v_multiplier=*/0,
+ /*v_luma_multiplier=*/64,
+ /*v_offset=*/0},
+ {/*apply_grain=*/true,
+ /*update_grain=*/true,
+ /*chroma_scaling_from_luma=*/false,
+ /*overlap_flag=*/true,
+ /*clip_to_restricted_range=*/false,
+ /*num_y_points=*/7,
+ /*num_u_points=*/7,
+ /*num_v_points=*/8,
+ /*point_y_value=*/{0, 13, 27, 40, 54, 134, 255, 0, 0, 0, 0, 0, 0, 0},
+ /*point_y_scaling=*/{72, 72, 91, 99, 97, 101, 101, 0, 0, 0, 0, 0, 0, 0},
+ /*point_u_value=*/{0, 13, 40, 54, 67, 134, 255, 0, 0, 0},
+ /*point_u_scaling=*/{38, 38, 50, 50, 51, 53, 53, 0, 0, 0},
+ /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 255, 0, 0},
+ /*point_v_scaling=*/{50, 50, 45, 34, 33, 35, 36, 36, 0, 0},
+ /*chroma_scaling=*/11,
+ /*auto_regression_coeff_lag=*/3,
+ /*auto_regression_coeff_y=*/{2, -2, -2, 10, 3, -1, 1, -3,
+ 3, 2, -27, -12, 2, 0, 1, 7,
+ -17, 100, 27, 0, -1, 13, -51, 116},
+ /*auto_regression_coeff_u=*/{-3, 1, -2, 3, -3, -1, 1, 5, -3,
+ -16, 15, -2, -10, -2, -1, -31, 14, 70,
+ 29, 8, -1, -10, 46, 71, -11},
+ /*auto_regression_coeff_v=*/{1, 0, -5, 5, -6, -5, 2, 1, -9,
+ -14, 25, 4, -10, -3, -5, -25, 29, 63,
+ 31, 17, -1, -7, 55, 47, 2},
+ /*auto_regression_shift=*/8,
+ /*grain_seed=*/27677,
+ /*reference_index=*/0,
+ /*grain_scale_shift=*/0,
+ /*u_multiplier=*/0,
+ /*u_luma_multiplier=*/64,
+ /*u_offset=*/0,
+ /*v_multiplier=*/0,
+ /*v_luma_multiplier=*/64,
+ /*v_offset=*/0},
+ {/*apply_grain=*/true,
+ /*update_grain=*/true,
+ /*chroma_scaling_from_luma=*/false,
+ /*overlap_flag=*/true,
+ /*clip_to_restricted_range=*/false,
+ /*num_y_points=*/7,
+ /*num_u_points=*/7,
+ /*num_v_points=*/8,
+ /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0},
+ /*point_y_scaling=*/{72, 72, 92, 99, 97, 101, 101, 0, 0, 0, 0, 0, 0, 0},
+ /*point_u_value=*/{0, 13, 40, 54, 67, 174, 255, 0, 0, 0},
+ /*point_u_scaling=*/{38, 38, 51, 50, 52, 54, 54, 0, 0, 0},
+ /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 255, 0, 0},
+ /*point_v_scaling=*/{51, 51, 46, 35, 33, 35, 37, 37, 0, 0},
+ /*chroma_scaling=*/11,
+ /*auto_regression_coeff_lag=*/3,
+ /*auto_regression_coeff_y=*/{1, -1, -2, 9, 3, -1, 1, -3,
+ 2, 2, -28, -12, 2, 0, 1, 8,
+ -16, 99, 27, 0, -1, 13, -51, 116},
+ /*auto_regression_coeff_u=*/{-3, 1, -2, 3, -3, -1, 2, 4, -2,
+ -16, 14, -3, -10, -2, 0, -31, 13, 71,
+ 29, 8, -2, -11, 44, 72, -11},
+ /*auto_regression_coeff_v=*/{0, -1, -5, 4, -6, -4, 2, 1, -9,
+ -13, 23, 3, -10, -3, -4, -25, 28, 63,
+ 32, 16, -1, -7, 54, 49, 2},
+ /*auto_regression_shift=*/8,
+ /*grain_seed=*/31058,
+ /*reference_index=*/0,
+ /*grain_scale_shift=*/0,
+ /*u_multiplier=*/0,
+ /*u_luma_multiplier=*/64,
+ /*u_offset=*/0,
+ /*v_multiplier=*/0,
+ /*v_luma_multiplier=*/64,
+ /*v_offset=*/0},
+ {/*apply_grain=*/true,
+ /*update_grain=*/true,
+ /*chroma_scaling_from_luma=*/false,
+ /*overlap_flag=*/true,
+ /*clip_to_restricted_range=*/false,
+ /*num_y_points=*/7,
+ /*num_u_points=*/7,
+ /*num_v_points=*/9,
+ /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0},
+ /*point_y_scaling=*/{72, 72, 92, 99, 98, 100, 98, 0, 0, 0, 0, 0, 0, 0},
+ /*point_u_value=*/{0, 13, 40, 54, 67, 228, 255, 0, 0, 0},
+ /*point_u_scaling=*/{38, 38, 51, 51, 52, 54, 54, 0, 0, 0},
+ /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 201, 255, 0},
+ /*point_v_scaling=*/{51, 51, 46, 35, 34, 35, 37, 37, 37, 0},
+ /*chroma_scaling=*/11,
+ /*auto_regression_coeff_lag=*/3,
+ /*auto_regression_coeff_y=*/{1, -1, -2, 9, 3, -1, 1, -3,
+ 2, 2, -28, -12, 2, 0, 1, 8,
+ -16, 99, 27, 0, -1, 13, -52, 116},
+ /*auto_regression_coeff_u=*/{-3, 1, -2, 3, -3, -1, 1, 4, -2,
+ -16, 13, -3, -10, -2, 0, -31, 13, 71,
+ 29, 8, -2, -11, 44, 72, -11},
+ /*auto_regression_coeff_v=*/{0, -1, -5, 4, -6, -4, 2, 2, -8,
+ -13, 23, 3, -10, -3, -4, -25, 28, 63,
+ 32, 16, -1, -7, 54, 49, 2},
+ /*auto_regression_shift=*/8,
+ /*grain_seed=*/34439,
+ /*reference_index=*/0,
+ /*grain_scale_shift=*/0,
+ /*u_multiplier=*/0,
+ /*u_luma_multiplier=*/64,
+ /*u_offset=*/0,
+ /*v_multiplier=*/0,
+ /*v_luma_multiplier=*/64,
+ /*v_offset=*/0},
+ {/*apply_grain=*/true,
+ /*update_grain=*/true,
+ /*chroma_scaling_from_luma=*/false,
+ /*overlap_flag=*/true,
+ /*clip_to_restricted_range=*/false,
+ /*num_y_points=*/7,
+ /*num_u_points=*/7,
+ /*num_v_points=*/9,
+ /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0},
+ /*point_y_scaling=*/{72, 72, 92, 99, 98, 99, 95, 0, 0, 0, 0, 0, 0, 0},
+ /*point_u_value=*/{0, 13, 40, 54, 67, 228, 255, 0, 0, 0},
+ /*point_u_scaling=*/{39, 39, 51, 51, 52, 54, 54, 0, 0, 0},
+ /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 201, 255, 0},
+ /*point_v_scaling=*/{51, 51, 46, 35, 34, 35, 36, 35, 35, 0},
+ /*chroma_scaling=*/11,
+ /*auto_regression_coeff_lag=*/3,
+ /*auto_regression_coeff_y=*/{1, -1, -2, 9, 3, -1, 1, -3,
+ 2, 2, -28, -11, 2, 0, 1, 8,
+ -16, 99, 27, 0, -1, 13, -52, 116},
+ /*auto_regression_coeff_u=*/{-3, 1, -2, 3, -3, -1, 1, 4, -2,
+ -16, 13, -3, -10, -2, 0, -30, 13, 71,
+ 29, 8, -2, -10, 43, 72, -11},
+ /*auto_regression_coeff_v=*/{0, -1, -5, 3, -6, -4, 2, 2, -8,
+ -13, 23, 3, -10, -3, -4, -25, 28, 64,
+ 32, 16, -1, -7, 53, 49, 2},
+ /*auto_regression_shift=*/8,
+ /*grain_seed=*/37820,
+ /*reference_index=*/0,
+ /*grain_scale_shift=*/0,
+ /*u_multiplier=*/0,
+ /*u_luma_multiplier=*/64,
+ /*u_offset=*/0,
+ /*v_multiplier=*/0,
+ /*v_luma_multiplier=*/64,
+ /*v_offset=*/0}};
+
+const char* GetTestDigestLuma(int bitdepth, int param_index) {
+ static const char* const kTestDigestsLuma8bpp[10] = {
+ "80da8e849110a10c0a73f9dec0d9a2fb", "54352f02aeda541e17a4c2d208897e2b",
+ "2ad9021124c82aca3e7c9517d00d1236", "f6c5f64513925b09ceba31e92511f8a1",
+ "46c6006578c68c3c8619f7a389c7de45", "fcddbd27545254dc50f1c333c8b7e313",
+ "c6d4dc181bf7f2f93ae099b836685151", "2949ef836748271195914fef9acf4e46",
+ "524e79bb87ed550e123d00a61df94381", "182222470d7b7a80017521d0261e4474",
+ };
+ static const char* const kTestDigestsLuma10bpp[10] = {
+ "27a49a2131fb6d4dd4b8c34da1b7642e", "4ea9134f6831dd398545c85b2a68e31f",
+ "4e12232a18a2b06e958d7ab6b953faad", "0ede12864ddaced2d8062ffa4225ce24",
+ "5fee492c4a430b2417a64aa4920b69e9", "39af842a3f9370d796e8ef047c0c42a8",
+ "0efbad5f9dc07391ad243232b8df1787", "2bd41882cd82960019aa2b87d5fb1fbc",
+ "1c66629c0c4e7b6f9b0a7a6944fbad50", "2c633a50ead62f8e844a409545f46244",
+ };
+ static const char* const kTestDigestsLuma12bpp[10] = {
+ "1dc9b38a93454a85eb924f25346ae369", "5f9d311ee5384a5a902f8e2d1297319e",
+ "cf1a35878720564c7a741f91eef66565", "47a0608fe0f6f7ccae42a5ca05783cbf",
+ "dbc28da0178e3c18a036c3f2203c300f", "04911d2074e3252119ee2d80426b8c01",
+ "df19ab8103c40b726c842ccf7772208b", "39276967eb16710d98f82068c3eeba41",
+ "b83100f18abb2062d9c9969f07182b86", "b39a69515491329698cf66f6d4fa371f",
+ };
+
+ switch (bitdepth) {
+ case 8:
+ return kTestDigestsLuma8bpp[param_index];
+ case 10:
+ return kTestDigestsLuma10bpp[param_index];
+ case 12:
+ return kTestDigestsLuma12bpp[param_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
+ }
+}
+
+const char* GetTestDigestChromaU(int bitdepth, int param_index) {
+ static const char* const kTestDigestsChromaU8bpp[10] = {
+ "e56b7bbe9f39bf987770b18aeca59514", "d0b3fd3cf2901dae31b73f20c510d83e",
+ "800c01d58d9fb72136d21ec2bb07899a", "4cd0badba679e8edbcd60a931fce49a1",
+ "cabec236cc17f91f3f08d8cde867aa72", "380a2205cf2d40c6a27152585f61a3b0",
+ "3813526234dc7f90f80f6684772c729a", "97a43a73066d88f9cbd915d56fc9c196",
+ "5b70b27a43dd63b03e23aecd3a935071", "d5cc98685582ffd47a41a97d2e377ac8",
+ };
+ static const char* const kTestDigestsChromaU10bpp[10] = {
+ "9a6d0369ba86317598e65913276dae6d", "2512bdc4c88f21f8185b040b7752d1db",
+ "1e86b779ce6555fcf5bd0ade2af67e73", "5ad463a354ffce522c52b616fb122024",
+ "290d53c22c2143b0882acb887da3fdf1", "54622407d865371d7e70bbf29fdda626",
+ "be306c6a94c55dbd9ef514f0ad4a0011", "904602329b0dec352b3b177b0a2554d2",
+ "58afc9497d968c67fdf2c0cf23b33aa3", "74fee7be6f62724bf901fdd04a733b46",
+ };
+ static const char* const kTestDigestsChromaU12bpp[10] = {
+ "846d608050fe7c19d6cabe2d53cb7821", "2caf4665a26aad50f68497e4b1326417",
+ "ce40f0f8f8c207c7c985464c812fea33", "820de51d07a21da5c00833bab546f1fa",
+ "5e7bedd8933cd274af03babb4dbb94dd", "d137cf584eabea86387460a6d3f62bfe",
+ "f206e0c6ed35b3ab35c6ff37e151e963", "55d87981b7044df225b3b5935185449b",
+ "6a655c8bf4df6af0e80ae6d004a73a25", "6234ae36076cc77161af6e6e3c04449a",
+ };
+
+ switch (bitdepth) {
+ case 8:
+ return kTestDigestsChromaU8bpp[param_index];
+ case 10:
+ return kTestDigestsChromaU10bpp[param_index];
+ case 12:
+ return kTestDigestsChromaU12bpp[param_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
+ }
+}
+
+const char* GetTestDigestChromaV(int bitdepth, int param_index) {
+ static const char* const kTestDigestsChromaV8bpp[10] = {
+ "7205ed6c07ed27b7b52d871e0559b8fa", "fad033b1482dba0ed2d450b461fa310e",
+ "6bb39798ec6a0f7bda0b0fcb0a555734", "08c19856e10123ae520ccfc63e2fbe7b",
+ "a7695a6b69fba740a50310dfa6cf1c00", "ac2eac2d13fc5b21c4f2995d5abe14b9",
+ "be35cb30062db628a9e1304fca8b75dc", "f5bfc7a910c76bcd5b32c40772170879",
+ "aca07b37d63f978d76df5cd75d0cea5e", "107c7c56d4ec21f346a1a02206301b0d",
+ };
+ static const char* const kTestDigestsChromaV10bpp[10] = {
+ "910724a77710996c90e272f1c1e9ff8e", "d293f861580770a89f1e266931a012ad",
+ "9e4f0c85fb533e51238586f9c3e68b6e", "a5ff4478d9eeb2168262c2e955e17a4f",
+ "fba6b1e8f28e4e90c836d41f28a0c154", "50b9a93f9a1f3845e6903bff9270a3e6",
+ "7b1624c3543badf5fadaee4d1e602e6b", "3be074e4ca0eec5770748b15661aaadd",
+ "639197401032f272d6c30666a2d08f43", "28075dd34246bf9d5e6197b1944f646a",
+ };
+ static const char* const kTestDigestsChromaV12bpp[10] = {
+ "4957ec919c20707d594fa5c2138c2550", "3f07c65bfb42c81768b1f5ad9611d1ce",
+ "665d9547171c99faba95ac81a35c9a0c", "1b5d032e0cefdb4041ad51796de8a45e",
+ "18fa974579a4f1ff8cd7df664fc339d5", "2ffaa4f143495ff73c06a580a97b6321",
+ "4fd1f562bc47a68dbfaf7c566c7c4da6", "4d37c80c9caf110c1d3d20bd1a1875b3",
+ "8ea29759640962613166dc5154837d14", "5ca4c10f42d0906c72ebee90fae6ce7d",
+ };
+
+ switch (bitdepth) {
+ case 8:
+ return kTestDigestsChromaV8bpp[param_index];
+ case 10:
+ return kTestDigestsChromaV10bpp[param_index];
+ case 12:
+ return kTestDigestsChromaV12bpp[param_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
+ }
+}
+
+const char* GetARTestDigestLuma(int bitdepth, int coeff_lag, int param_index) {
+ static const char* const kTestDigestsLuma8bpp[3][kNumFilmGrainTestParams] = {
+ {"a835127918f93478b45f1ba4d20d81bd", "a835127918f93478b45f1ba4d20d81bd",
+ "e5db4da626e214bb17bcc7ecffa76303", "a835127918f93478b45f1ba4d20d81bd",
+ "a835127918f93478b45f1ba4d20d81bd", "e5db4da626e214bb17bcc7ecffa76303",
+ "a835127918f93478b45f1ba4d20d81bd", "1da62b7233de502123a18546b6c97da2",
+ "1da62b7233de502123a18546b6c97da2", "1da62b7233de502123a18546b6c97da2"},
+ {"11464b880de3ecd6e6189c5c4e7f9b28", "dfe411762e283b5f49bece02ec200951",
+ "5c534d92afdf0a5b53dbe4fe7271929c", "2e1a68a18aca96c31320ba7ceab59be9",
+ "584c0323e6b276cb9acb1a294d462d58", "9571eb8f1cbaa96ea3bf64a820a8d9f0",
+ "305285ff0df87aba3c59e3fc0818697d", "0066d35c8818cf20230114dcd3765a4d",
+ "0066d35c8818cf20230114dcd3765a4d", "16d61b046084ef2636eedc5a737cb6f6"},
+ {"0c9e2cf1b6c3cad0f7668026e8ea0516", "7d094855292d0eded9e0d1b5bab1990b",
+ "fbf28860a5f1285dcc6725a45256a86a", "dccb906904160ccabbd2c9a7797a4bf9",
+ "46f645e17f08a3260b1ae70284e5c5b8", "124fdc90bed11a7320a0cbdee8b94400",
+ "8d2978651dddeaef6282191fa146f0a0", "28b4d5aa33f05b3fb7f9323a11936bdc",
+ "6a8ea684f6736a069e3612d1af6391a8", "2781ea40a63704dbfeb3a1ac5db6f2fc"},
+ };
+
+ static const char* const kTestDigestsLuma10bpp[3][kNumFilmGrainTestParams] = {
+ {"5e6bc8444ece2d38420f51d82238d812", "5e6bc8444ece2d38420f51d82238d812",
+ "2bfaec768794af33d60a9771f971f68d", "5e6bc8444ece2d38420f51d82238d812",
+ "5e6bc8444ece2d38420f51d82238d812", "c880807a368c4e82c23bea6f035ad23f",
+ "5e6bc8444ece2d38420f51d82238d812", "c576667da5286183ec3aab9a76f53a2e",
+ "c576667da5286183ec3aab9a76f53a2e", "c576667da5286183ec3aab9a76f53a2e"},
+ {"095c2dd4d4d52aff9696df9bfdb70062", "983d14afa497060792d472a449a380c7",
+ "c5fdc0f7c594b2b36132cec6f45a79bd", "acff232ac5597c1712213150552281d1",
+ "4dd7341923b1d260092853553b6b6246", "0ca8afd71a4f564ea1ce69c4af14e9ab",
+ "9bc7565e5359d09194fcee28e4bf7b94", "6fea7805458b9d149f238a30e2dc3f13",
+ "6fea7805458b9d149f238a30e2dc3f13", "681dff5fc7a7244ba4e4a582ca7ecb14"},
+ {"cb99352c9c6300e7e825188bb4adaee0", "7e40674de0209bd72f8e9c6e39ee6f7c",
+ "3e475572f6b4ecbb2730fd16751ad7ed", "e6e4c63abc9cb112d9d1f23886cd1415",
+ "1a1c953b175c105c604902877e2bab18", "380a53072530223d4ee622e014ee4bdb",
+ "6137394ea1172fb7ea0cbac237ff1703", "85ab0c813e46f97cb9f42542f44c01ad",
+ "68c8ac462f0e28cb35402c538bee32f1", "0038502ffa4760c8feb6f9abd4de7250"},
+ };
+
+ static const char* const kTestDigestsLuma12bpp[3][kNumFilmGrainTestParams] = {
+ {"d618bbb0e337969c91b1805f39561520", "d618bbb0e337969c91b1805f39561520",
+ "678f6e911591daf9eca4e305dabdb2b3", "d618bbb0e337969c91b1805f39561520",
+ "d618bbb0e337969c91b1805f39561520", "3b26f49612fd587c7360790d40adb5de",
+ "d618bbb0e337969c91b1805f39561520", "33f77d3ff50cfc64c6bc9a896b567377",
+ "33f77d3ff50cfc64c6bc9a896b567377", "33f77d3ff50cfc64c6bc9a896b567377"},
+ {"362fd67050fb7abaf57c43a92d993423", "e014ae0eb9e697281015c38905cc46ef",
+ "82b867e57151dc08afba31eccf5ccf69", "a94ba736cdce7bfa0b550285f59e47a9",
+ "3f1b0b7dd3b10e322254d35e4e185b7c", "7929708e5f017d58c53513cb79b35fda",
+ "6d26d31a091cbe642a7070933bd7de5a", "dc29ac40a994c0a760bfbad0bfc15b3a",
+ "dc29ac40a994c0a760bfbad0bfc15b3a", "399b919db5190a5311ce8d166580827b"},
+ {"6116d1f569f5b568eca4dc1fbf255086", "7e9cf31ea74e8ea99ffd12094ce6cd05",
+ "bb982c4c39e82a333d744defd16f4388", "7c6e584b082dc6b97ed0d967def3993f",
+ "fb234695353058f03c8e128f2f8de130", "9218c6ca67bf6a9237f98aa1ce7acdfd",
+ "d1fb834bbb388ed066c5cbc1c79b5bdf", "d6f630daedc08216fcea12012e7408b5",
+ "dd7fe49299e6f113a98debc7411c8db8", "8b89e45a5101a28c24209ae119eafeb8"},
+ };
+
+ switch (bitdepth) {
+ case 8:
+ return kTestDigestsLuma8bpp[coeff_lag - 1][param_index];
+ case 10:
+ return kTestDigestsLuma10bpp[coeff_lag - 1][param_index];
+ case 12:
+ return kTestDigestsLuma12bpp[coeff_lag - 1][param_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
+ }
+}
+
+const char* GetARTestDigestChromaU(int bitdepth, int coeff_lag,
+ int subsampling_x, int subsampling_y) {
+ static const char* const kTestDigestsChromaU8bpp[12] = {
+ "11ced66de0eaf55c1ff9bad18d7b8ed7", "0c3b77345dd4ab0915ef53693ab93ce4",
+ "b0645044ba080b3ceb8f299e269377d6", "50590ad5d895f0b4bc6694d878e9cd32",
+ "85e1bf3741100135062f5b4abfe7639b", "76955b70dde61ca5c7d079c501b90906",
+ "3f0995e1397fd9efd9fc46b67f7796b3", "0a0d6c3e4e1649eb101395bc97943a07",
+ "1878855ed8db600ccae1d39abac52ec6", "13ab2b28320ed3ac2b820f08fdfd424d",
+ "f3e95544a86ead5387e3dc4e043fd0f0", "ff8f5d2d97a6689e16a7e4f482f69f0b",
+ };
+
+ static const char* const kTestDigestsChromaU10bpp[12] = {
+ "707f2aa5aa7e77bc6e83ab08287d748d", "0bcf40c7fead9ac3a5d71b4cc1e21549",
+ "0c1df27053e5da7cf1276a122a8f4e8b", "782962f7425eb38923a4f87e7ab319d9",
+ "b4a709ae5967afef55530b9ea8ef0062", "70a971a0b9bf06212d510b396f0f9095",
+ "d033b89d6e31f8b13c83d94c840b7d54", "40bbe804bf3f90cee667d3b275e3c964",
+ "90bb2b9d518b945adcfd1b1807f7d170", "4bc34aa157fe5ad4270c611afa75e878",
+ "e2688d7286cd43fe0a3ea734d2ad0f77", "853193c4981bd882912171061327bdf2",
+ };
+
+ static const char* const kTestDigestsChromaU12bpp[12] = {
+ "04c23b01d01c0e3f3247f3741581b383", "9f8ea1d66e44f6fe93d765ce56b2b0f3",
+ "5dda44b128d6c244963f1e8e17cc1d22", "9dd0a79dd2f772310a95762d445bface",
+ "0dbd40d930e4873d72ea72b9e3d62440", "d7d83c207c6b435a164206d5f457931f",
+ "e8d04f6e63ed63838adff965275a1ff1", "fc09a903e941fcff8bad67a84f705775",
+ "9cd706606a2aa40d0957547756f7abd9", "258b37e7b8f48db77dac7ea24073fe69",
+ "80149b8bb05308da09c1383d8b79d3da", "e993f3bffae53204a1942feb1af42074",
+ };
+
+ assert(!(subsampling_x == 0 && subsampling_y == 1));
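+ // The 12 digests are laid out as 3 * coeff_lag + a subsampling offset,
+ // where (sub_x, sub_y) of (0, 0), (1, 0), and (1, 1) map to 0, 1, and 2.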
+ const int base_index = 3 * coeff_lag + subsampling_x + subsampling_y;
+ switch (bitdepth) {
+ case 8:
+ return kTestDigestsChromaU8bpp[base_index];
+ case 10:
+ return kTestDigestsChromaU10bpp[base_index];
+ case 12:
+ return kTestDigestsChromaU12bpp[base_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
+ }
+}
+
+const char* GetARTestDigestChromaV(int bitdepth, int coeff_lag,
+ int subsampling_x, int subsampling_y) {
+ static const char* const kTestDigestsChromaV8bpp[12] = {
+ "5c2179f3d93be0a0da75d2bb90347c2f", "79b883847d7eaa7890e1d633b8e34353",
+ "90ade818e55808e8cf58c11debb5ddd1", "1d0f2a14bc4df2b2a1abaf8137029f92",
+ "ac753a57ade140dccb50c14f941ae1fc", "d24ab497558f6896f08dc17bcc3c50c1",
+ "3d74436c63920022a95c85b234db4e33", "061c2d53ed84c830f454e395c362cb16",
+ "05d24869d7fb952e332457a114c8b9b7", "fcee31b87a2ada8028c2a975e094856a",
+ "c019e2c475737abcf9c2b2a52845c646", "9cd994baa7021f8bdf1d1c468c1c8e9c",
+ };
+
+ static const char* const kTestDigestsChromaV10bpp[12] = {
+ "bc9e44454a05cac8571c15af5b720e79", "f0374436698d94e879c03331b1f30df4",
+ "4580dd009abd6eeed59485057c55f63e", "7d1f7aecd45302bb461f4467f2770f72",
+ "1f0d003fce6c5fedc147c6112813f43b", "4771a45c2c1a04c375400619d5536035",
+ "df9cf619a78907c0f6e58bc13d7d5546", "dd3715ce65d905f30070a36977c818e0",
+ "32de5800f76e34c128a1d89146b4010b", "db9d7c70c3f69feb68fae04398efc773",
+ "d3d0912e3fdb956fef416a010bd7b4c2", "a2fca8abd9fd38d2eef3c4495d9eff78",
+ };
+
+ static const char* const kTestDigestsChromaV12bpp[12] = {
+ "0d1890335f4464167de22353678ca9c6", "9e6830aba73139407196f1c811f910bc",
+ "6018f2fb76bd648bef0262471cfeba5c", "78e1ae1b790d709cdb8997621cf0fde3",
+ "5b44ae281d7f9db2f17aa3c24b4741dd", "f931d16991669cb16721de87da9b8067",
+ "5580f2aed349d9cabdafb9fc25a57b1c", "86918cd78bf95e6d4405dd050f5890b8",
+ "13c8b314eeebe35fa60b703d94e1b2c1", "13c6fb75cab3f42e0d4ca31e4d068b0e",
+ "bb9ca0bd6f8cd67e44c8ac2803abf5a5", "0da4ea711ffe557bb66577392b6f148b",
+ };
+
+ assert(!(subsampling_x == 0 && subsampling_y == 1));
+ const int base_index = 3 * coeff_lag + subsampling_x + subsampling_y;
+ switch (bitdepth) {
+ case 8:
+ return kTestDigestsChromaV8bpp[base_index];
+ case 10:
+ return kTestDigestsChromaV10bpp[base_index];
+ case 12:
+ return kTestDigestsChromaV12bpp[base_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
+ }
+}
+
+const char* GetGrainGenerationTestDigestLuma(int bitdepth, int param_index) {
+ static const char* const kTestDigestsLuma8bpp[kNumFilmGrainTestParams] = {
+ "c48babd99e5cfcbaa13d8b6e0c12e644", "da4b971d2de19b709e2bc98d2e50caf3",
+ "96c72faac19a79c138afeea8b8ae8c7a", "90a2b9c8304a44d14e83ca51bfd2fe8a",
+ "72bd3aa85c17850acb430afb4183bf1a", "a0acf76349b9efbc9181fc31153d9ef6",
+ "6da74dd631a4ec8b9372c0bbec22e246", "6e11fa230f0e5fbb13084255c22cabf9",
+ "be1d257b762f9880d81680e9325932a2", "37e302075af8130b371de4430e8a22cf",
+ };
+
+ static const char* const kTestDigestsLuma10bpp[kNumFilmGrainTestParams] = {
+ "0a40fd2f261095a6154584a531328142", "9d0c8173a94a0514c769e94b6f254030",
+ "7894e959fdd5545895412e1512c9352d", "6802cad2748cf6db7f66f53807ee46ab",
+ "ea24e962b98351c3d929a8ae41e320e2", "b333dc944274a3a094073889ca6e11d6",
+ "7211d7ac0ff7d11b5ef1538c0d98f43d", "ef9f9cbc101a07da7bfa62637130e331",
+ "85a122e32648fde84b883a1f98947c60", "dee656e3791138285bc5b71e3491a177",
+ };
+
+ static const char* const kTestDigestsLuma12bpp[kNumFilmGrainTestParams] = {
+ "ae359794b5340d073d597117046886ac", "4d4ad3908b4fb0f248a0086537dd6b1e",
+ "672a97e15180cbeeaf76d763992c9f23", "739124d10d16e00a158e833ea92107bc",
+ "4c38c738ff7ffc50adaa4474584d3aae", "ca05ba7e51000a7d10e5cbb2101bbd86",
+ "e207022b916bf03a76ac8742af29853d", "7454bf1859149237ff74f1161156c857",
+ "10fc2a16e663bbc305255b0883cfcd45", "4228abff6899bb33839b579288ab29fe",
+ };
+
+ switch (bitdepth) {
+ case 8:
+ return kTestDigestsLuma8bpp[param_index];
+ case 10:
+ return kTestDigestsLuma10bpp[param_index];
+ case 12:
+ return kTestDigestsLuma12bpp[param_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
+ }
+}
+
+const char* GetConstructStripesTestDigest(int bitdepth, int overlap_flag,
+ int subsampling_x,
+ int subsampling_y) {
+ static const char* const kTestDigests8bpp[6] = {
+ "cd14aaa6fc1728290fa75772730a2155", "13ad4551feadccc3a3a9bd5e25878d2a",
+ "ed6ad9532c96ef0d79ff3228c89a429f", "82f307a7f5fc3308c3ebe268b5169e70",
+ "aed793d525b85349a8c2eb6d40e93969", "311c3deb727621a7d4f18e8defb65de7",
+ };
+
+ static const char* const kTestDigests10bpp[6] = {
+ "4fe2fa1e428737de3595be3a097d0203", "80568c3c3b53bdbbd03b820179092dcd",
+ "bc7b73099961a0739c36e027d6d09ea1", "e5331364e5146a6327fd94e1467f59a3",
+ "125bf18b7787e8f0792ea12f9210de0d", "21cf98cbce17eca77dc150cc9be0e0a0",
+ };
+
+ static const char* const kTestDigests12bpp[6] = {
+ "57f8e17078b6e8935252e918a2562636", "556a7b294a99bf1163b7166b4f68357e",
+ "249bee5572cd7d1cc07182c97adc4ba7", "9bf43ae1998c2a5b2e5f4d8236b58747",
+ "477c08fa26499936e5bb03bde097633e", "fe64b7166ff87ea0711ae4f519cadd59",
+ };
+
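+ // The six digests cover overlap_flag in {0, 1} crossed with the three valid
+ // subsampling combinations ((0, 0), (1, 0), and (1, 1) map to 0, 1, and 2).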
+ const int base_index = 3 * overlap_flag + subsampling_x + subsampling_y;
+ switch (bitdepth) {
+ case 8:
+ return kTestDigests8bpp[base_index];
+ case 10:
+ return kTestDigests10bpp[base_index];
+ case 12:
+ return kTestDigests12bpp[base_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
+ }
+}
+
+const char* GetConstructImageTestDigest(int bitdepth, int overlap_flag,
+ int subsampling_x, int subsampling_y) {
+ static const char* const kTestDigests8bpp[6] = {
+ "17030fc692e685557a3717f9334af7e8", "d16ea46147183cd7bc36bcfc2f936a5b",
+ "68152958540dbec885f71e3bcd7aa088", "bb43b420f05a122eb4780aca06055ab1",
+ "87567b04fbdf64f391258c0742de266b", "ce87d556048b3de32570faf6729f4010",
+ };
+
+ static const char* const kTestDigests10bpp[6] = {
+ "5b31b29a5e22126a9bf8cd6a01645777", "2bb94a25164117f2ab18dae18e2c6577",
+ "27e57a4ed6f0c9fe0a763a03f44805e8", "481642ab0b07437b76b169aa4eb82123",
+ "656a9ef056b04565bec9ca7e0873c408", "a70fff81ab28d02d99dd4f142699ba39",
+ };
+
+ static const char* const kTestDigests12bpp[6] = {
+ "146f7ceadaf77e7a3c41e191a58c1d3c", "de18526db39630936733e687cdca189e",
+ "165c96ff63bf3136505ab1d239f7ceae", "a102636662547f84e5f6fb6c3e4ef959",
+ "4cb073fcc783c158a95c0b1ce0d27e9f", "3a734c71d4325a7da53e2a6e00f81647",
+ };
+
+ const int base_index = 3 * overlap_flag + subsampling_x + subsampling_y;
+ switch (bitdepth) {
+ case 8:
+ return kTestDigests8bpp[base_index];
+ case 10:
+ return kTestDigests10bpp[base_index];
+ case 12:
+ return kTestDigests12bpp[base_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
+ }
+}
+
+const char* GetScalingInitTestDigest(int param_index, int bitdepth) {
+ static const char* const kTestDigests8bpp[kNumFilmGrainTestParams] = {
+ "315202ca3bf9c46eac8605e89baffd2a", "640f6408702b07ab7e832e7326cce56f",
+ "f75ee83e3912a3f25949e852d67326cf", "211223f5d6a4b42a8e3c662f921b71c0",
+ "f75ee83e3912a3f25949e852d67326cf", "e7a1de8c5a2cac2145c586ecf1f9051c",
+ "e7a1de8c5a2cac2145c586ecf1f9051c", "276fe5e3b30b2db2a9ff798eb6cb8e00",
+ "ac67f1c3aff2f50ed4b1975bde67ffe3", "8db6145a60d506cc94f07cef8b27c681",
+ };
+
+ static const char* const kTestDigests10bpp[kNumFilmGrainTestParams] = {
+ "c50be59c62b634ff45ddfbe5b978adfc", "7626286109a2a1eaf0a26f6b2bbab9aa",
+ "f2302988140c47a0724fc55ff523b6ec", "5318e33d8a59a526347ffa6a72ba6ebd",
+ "f2302988140c47a0724fc55ff523b6ec", "f435b5fe98e9d8b6c61fa6f457601c2c",
+ "f435b5fe98e9d8b6c61fa6f457601c2c", "ff07a2944dbe094d01e199098764941c",
+ "11b3e256c74cee2b5679f7457793869a", "89fab5c1db09e242d0494d1c696a774a",
+ };
+
+ static const char* const kTestDigests12bpp[kNumFilmGrainTestParams] = {
+ "1554df49a863a851d146213e09d311a4", "84808c3ed3b5495a62c9d2dd9a08cb26",
+ "bb31f083a3bd9ef26587478b8752f280", "34fdfe61d6871e4882e38062a0725c5c",
+ "bb31f083a3bd9ef26587478b8752f280", "e7b8c3e4508ceabe89b78f10a9e160b8",
+ "e7b8c3e4508ceabe89b78f10a9e160b8", "a0ccc9e3d0f0c9d1f08f1249264d92f5",
+ "7992a96883c8a9a35d6ca8961bc4515b", "de906ce2c0fceed6f168215447b21b16",
+ };
+
+ switch (bitdepth) {
+ case 8:
+ return kTestDigests8bpp[param_index];
+ case 10:
+ return kTestDigests10bpp[param_index];
+ case 12:
+ return kTestDigests12bpp[param_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
+ }
+}
+
+const char* GetBlendLumaTestDigest(int bitdepth) {
+ static const char* const kTestDigests[] = {
+ "de35b16c702690b1d311cdd0973835d7",
+ "60e9f24dcaaa0207a8db5ab5f3c66608",
+ "8e7d44b620bb7768459074be6bfbca7b",
+ };
+
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return kTestDigests[(bitdepth - 8) / 2];
+}
+
+const char* GetBlendChromaUTestDigest(int bitdepth,
+ int chroma_scaling_from_luma,
+ int subsampling_x, int subsampling_y) {
+ static const char* const kTestDigests8bpp[6] = {
+ "36ca194734d45e75079baba1f3ec9e9e", "182b388061f59fd3e24ef4581c536e67",
+ "2e7843b4c624f03316c3cbe1cc835859", "39e6d9606915da6a41168fbb006b55e4",
+ "3f44a4e252d4823544ac66a900dc7983", "1860f0831841f262d66b23f6a6b5833b",
+ };
+
+ static const char* const kTestDigests10bpp[6] = {
+ "2054665564f55750c9588b505eb01ac0", "4d8b0e248f8a6bfc72516aa164e76b0b",
+ "7e549800a4f9fff6833bb7738e272baf", "8de6f30dcda99a37b359fd815e62d2f7",
+ "9b7958a2278a16bce2b7bc31fdd811f5", "c5c3c8cccf6a2b4e40b4a412a5bf4f08",
+ };
+
+ static const char* const kTestDigests12bpp[6] = {
+ "8fad0cc641da35e0d2d8f178c7ce8394", "793eb9d2e6b4ea2e3bb08e7068236155",
+ "9156bd85ab9493d8867a174f920bb1e6", "6834319b4c88e3e0c96b6f8d7efd08dd",
+ "c40e492790d3803a734efbc6feca46e2", "d884c3b1e2c21d98844ca7639e0599a5",
+ };
+
+ const int base_index =
+ 3 * chroma_scaling_from_luma + subsampling_x + subsampling_y;
+ switch (bitdepth) {
+ case 8:
+ return kTestDigests8bpp[base_index];
+ case 10:
+ return kTestDigests10bpp[base_index];
+ case 12:
+ return kTestDigests12bpp[base_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
+ }
+}
+
+const char* GetBlendChromaVTestDigest(int bitdepth,
+ int chroma_scaling_from_luma,
+ int subsampling_x, int subsampling_y) {
+ static const char* const kTestDigests8bpp[6] = {
+ "9a353e4f86d7ebaa980f7f6cfc0995ad", "17589b4039ed49ba16f32db9fae724b7",
+ "76ae8bed48a173b548993b6e1824ff67", "c1458ac9bdfbf0b4d6a175343b17b27b",
+ "fa76d1c8e48957537f26af6a5b54ec14", "313fe3c34568b7f9c5ecb09d419d4ba4",
+ };
+
+ static const char* const kTestDigests10bpp[6] = {
+ "8ab5a8e03f07547260033d6a0b689e3c", "275ede58d311e2f5fd76f222f45a64fc",
+ "ce13916e0f7b02087fd0356534d32770", "165bfc8cda0266936a67fa4ec9b215cb",
+ "ed4382caa936acf1158ff8049d18ffac", "942bdd1344c9182dd7572099fb9372db",
+ };
+
+ static const char* const kTestDigests12bpp[6] = {
+ "70704a1e171a3a70d40b7d0037a75fbc", "62549e2afbf36a1ed405a6574d39c542",
+ "e93889927ab77c6e0767ff071d980c02", "a0c1f6ed78874137710fee7418d80959",
+ "f6283e36a25cb867e30bdf0bfdb2124b", "741c2d48898835b9d9e3bd0b6ac6269a",
+ };
+
+ const int base_index =
+ 3 * chroma_scaling_from_luma + subsampling_x + subsampling_y;
+ switch (bitdepth) {
+ case 8:
+ return kTestDigests8bpp[base_index];
+ case 10:
+ return kTestDigests10bpp[base_index];
+ case 12:
+ return kTestDigests12bpp[base_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
+ }
+}
+
+// GetFilmGrainRandomNumber() is only invoked with |bits| equal to 11 or 8. Test
+// both values of |bits|.
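+// A sketch of the recurrence (consistent with the expected values below,
+// assuming the standard AV1 16-bit LFSR):
+//   bit = (seed ^ (seed >> 1) ^ (seed >> 3) ^ (seed >> 12)) & 1;
+//   seed = (seed >> 1) | (bit << 15);
+//   rand = seed >> (16 - bits);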
+TEST(FilmGrainTest, GetFilmGrainRandomNumber) {
+ uint16_t seed = 51968;
+ const struct {
+ int rand;
+ uint16_t seed;
+ } kExpected11[5] = {
+ {812, 25984}, {406, 12992}, {1227, 39264}, {1637, 52400}, {818, 26200},
+ };
+ for (int i = 0; i < 5; ++i) {
+ int rand = GetFilmGrainRandomNumber(11, &seed);
+ EXPECT_EQ(rand, kExpected11[i].rand) << "i = " << i;
+ EXPECT_EQ(seed, kExpected11[i].seed) << "i = " << i;
+ }
+ const struct {
+ int rand;
+ uint16_t seed;
+ } kExpected8[5] = {
+ {179, 45868}, {89, 22934}, {44, 11467}, {150, 38501}, {75, 19250},
+ };
+ for (int i = 0; i < 5; ++i) {
+ int rand = GetFilmGrainRandomNumber(8, &seed);
+ EXPECT_EQ(rand, kExpected8[i].rand) << "i = " << i;
+ EXPECT_EQ(seed, kExpected8[i].seed) << "i = " << i;
+ }
+}
+
+// In FilmGrainParams, if num_u_points and num_v_points are both 0 and
+// chroma_scaling_from_luma is false, GenerateChromaGrains() should set both
+// the u_grain and v_grain arrays to all zeros.
+TEST(FilmGrainTest, GenerateZeroChromaGrains) {
+ FilmGrainParams film_grain_params = {};
+ film_grain_params.apply_grain = true;
+ film_grain_params.update_grain = true;
+ film_grain_params.chroma_scaling = 8;
+ film_grain_params.auto_regression_shift = 6;
+ film_grain_params.grain_seed = 51968;
+
+ int8_t u_grain[73 * 82];
+ int8_t v_grain[73 * 82];
+ const int chroma_width = 44;
+ const int chroma_height = 38;
+
+ // Initialize u_grain and v_grain with arbitrary nonzero values.
+ memset(u_grain, 1, sizeof(u_grain));
+ memset(v_grain, 2, sizeof(v_grain));
+ for (int y = 0; y < chroma_height; ++y) {
+ for (int x = 0; x < chroma_width; ++x) {
+ EXPECT_NE(u_grain[y * chroma_width + x], 0);
+ EXPECT_NE(v_grain[y * chroma_width + x], 0);
+ }
+ }
+
+ FilmGrain<8>::GenerateChromaGrains(film_grain_params, chroma_width,
+ chroma_height, u_grain, v_grain);
+
+ for (int y = 0; y < chroma_height; ++y) {
+ for (int x = 0; x < chroma_width; ++x) {
+ EXPECT_EQ(u_grain[y * chroma_width + x], 0);
+ EXPECT_EQ(v_grain[y * chroma_width + x], 0);
+ }
+ }
+}
+
+// First parameter is coefficient lag. Second parameter is the index into
+// |kFilmGrainParams|.
+template <int bitdepth>
+class AutoRegressionTestLuma
+ : public testing::TestWithParam<std::tuple<int, int>> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ using GrainType =
+ typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+ AutoRegressionTestLuma() {
+ FilmGrainInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ const int index = std::get<0>(GetParam()) - 1;
+ base_luma_auto_regression_func_ =
+ dsp->film_grain.luma_auto_regression[index];
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_luma_auto_regression_func_ = nullptr;
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+ FilmGrainInit_NEON();
+#endif
+ }
+ luma_auto_regression_func_ = dsp->film_grain.luma_auto_regression[index];
+ }
+
+ protected:
+ // |compare| determines whether to compare the output blocks from the SIMD
+ // implementation, if used, and the C implementation.
+ // |saturate| determines whether to set the inputs to maximum values. This is
+ // intended primarily as a way to simplify differences in output when
+ // debugging.
+ void TestAutoRegressiveFilterLuma(int coeff_lag, int param_index,
+ int num_runs, bool saturate, bool compare);
+ LumaAutoRegressionFunc luma_auto_regression_func_;
+ LumaAutoRegressionFunc base_luma_auto_regression_func_;
+ GrainType luma_block_buffer_[kLumaBlockSize];
+ GrainType base_luma_block_buffer_[kLumaBlockSize];
+};
+
+template <int bitdepth>
+void AutoRegressionTestLuma<bitdepth>::TestAutoRegressiveFilterLuma(
+ int coeff_lag, int param_index, int num_runs, bool saturate, bool compare) {
+ if (luma_auto_regression_func_ == nullptr) return;
+ // Compare is only needed for NEON tests to compare with C output.
+ if (base_luma_auto_regression_func_ == nullptr && compare) return;
+ FilmGrainParams params = kFilmGrainParams[param_index];
+ params.auto_regression_coeff_lag = coeff_lag;
+ const int grain_max = GetGrainMax<bitdepth>();
+ for (int y = 0; y < kLumaHeight; ++y) {
+ for (int x = 0; x < kLumaWidth; ++x) {
+ if (saturate) {
+ luma_block_buffer_[y * kLumaWidth + x] = grain_max;
+ } else {
+ luma_block_buffer_[y * kLumaWidth + x] =
+ std::min(x - (kLumaWidth >> 1), y - (kLumaHeight >> 1)) *
+ (1 << (bitdepth - 8));
+ }
+ }
+ }
+
+ if (saturate) {
+ memset(params.auto_regression_coeff_y, 127,
+ sizeof(params.auto_regression_coeff_y));
+ }
+ if (compare) {
+ memcpy(base_luma_block_buffer_, luma_block_buffer_,
+ sizeof(luma_block_buffer_));
+ }
+
+ const absl::Time start = absl::Now();
+ for (int i = 0; i < num_runs; ++i) {
+ luma_auto_regression_func_(params, luma_block_buffer_);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ if (num_runs > 1) {
+ printf("AutoRegressionLuma lag=%d, param_index=%d: %d us\n", coeff_lag,
+ param_index,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ return;
+ }
+ if (compare) {
+ base_luma_auto_regression_func_(params, base_luma_block_buffer_);
+ EXPECT_TRUE(test_utils::CompareBlocks(
+ luma_block_buffer_, base_luma_block_buffer_, kLumaWidth, kLumaHeight,
+ kLumaWidth, kLumaWidth, false));
+ } else {
+ test_utils::CheckMd5Digest(
+ "FilmGrain",
+ absl::StrFormat("AutoRegressionLuma lag=%d, param_index=%d", coeff_lag,
+ param_index)
+ .c_str(),
+ GetARTestDigestLuma(bitdepth, coeff_lag, param_index),
+ luma_block_buffer_, sizeof(luma_block_buffer_), elapsed_time);
+ }
+}
+
+using AutoRegressionTestLuma8bpp = AutoRegressionTestLuma<8>;
+
+TEST_P(AutoRegressionTestLuma8bpp, AutoRegressiveFilterLuma) {
+ TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ 1, /*saturate=*/false,
+ /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestLuma8bpp, AutoRegressiveFilterLumaSaturated) {
+ TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ 1, /*saturate=*/true,
+ /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestLuma8bpp, DISABLED_Speed) {
+ TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ 1e5,
+ /*saturate=*/false, /*compare=*/false);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using AutoRegressionTestLuma10bpp = AutoRegressionTestLuma<10>;
+
+TEST_P(AutoRegressionTestLuma10bpp, AutoRegressiveFilterLuma) {
+ TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ 1, /*saturate=*/false,
+ /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestLuma10bpp, AutoRegressiveFilterLumaSaturated) {
+ TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ 1, /*saturate=*/true,
+ /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestLuma10bpp, DISABLED_Speed) {
+ TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ 1e5,
+ /*saturate=*/false, /*compare=*/false);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using AutoRegressionTestLuma12bpp = AutoRegressionTestLuma<12>;
+
+TEST_P(AutoRegressionTestLuma12bpp, AutoRegressiveFilterLuma) {
+ TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ 1, /*saturate=*/false,
+ /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestLuma12bpp, AutoRegressiveFilterLumaSaturated) {
+ TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ 1, /*saturate=*/true,
+ /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestLuma12bpp, DISABLED_Speed) {
+ TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ 1e5,
+ /*saturate=*/false, /*compare=*/false);
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AutoRegressionTestLuma8bpp,
+ testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+ testing::Range(0, 10) /* param_index */));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AutoRegressionTestLuma8bpp,
+ testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+ testing::Range(0, 10) /* param_index */));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(
+ C, AutoRegressionTestLuma10bpp,
+ testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+ testing::Range(0, 10) /* param_index */));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AutoRegressionTestLuma10bpp,
+ testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+ testing::Range(0, 10) /* param_index */));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(
+ C, AutoRegressionTestLuma12bpp,
+ testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+ testing::Range(0, 10) /* param_index */));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+struct AutoRegressionChromaTestParam {
+ explicit AutoRegressionChromaTestParam(const std::tuple<int, int>& in)
+ : coeff_lag(std::get<0>(in)) {
+ switch (std::get<1>(in)) {
+ case 0:
+ subsampling_x = 0;
+ subsampling_y = 0;
+ break;
+ case 1:
+ subsampling_x = 1;
+ subsampling_y = 0;
+ break;
+ default:
+ assert(std::get<1>(in) == 2);
+ subsampling_x = 1;
+ subsampling_y = 1;
+ }
+ }
+ const int coeff_lag;
+ int subsampling_x;
+ int subsampling_y;
+};
+
+template <int bitdepth>
+class AutoRegressionTestChroma
+ : public testing::TestWithParam<std::tuple<int, int>> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ using GrainType =
+ typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+ AutoRegressionTestChroma() {
+ AutoRegressionChromaTestParam test_param(GetParam());
+ FilmGrainInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ // This test suite does not cover num_y_points == 0. This should be covered
+ // in the test of the full synthesis process.
+ base_chroma_auto_regression_func_ =
+ dsp->film_grain.chroma_auto_regression[1][test_param.coeff_lag];
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_chroma_auto_regression_func_ = nullptr;
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+ FilmGrainInit_NEON();
+#endif
+ }
+ chroma_auto_regression_func_ =
+ dsp->film_grain.chroma_auto_regression[1][test_param.coeff_lag];
+ }
+
+ ~AutoRegressionTestChroma() override = default;
+
+ protected:
+ // |compare| determines whether to compare the output blocks from the SIMD
+ // implementation, if used, and the C implementation.
+ // |saturate| determines whether to set the inputs to maximum values. This is
+ // intended primarily as a way to simplify differences in output when
+ // debugging.
+ void TestAutoRegressiveFilterChroma(int coeff_lag, int subsampling_x,
+ int subsampling_y, int num_runs,
+ bool saturate, bool compare);
+ ChromaAutoRegressionFunc chroma_auto_regression_func_;
+ ChromaAutoRegressionFunc base_chroma_auto_regression_func_;
+ GrainType luma_block_buffer_[kLumaBlockSize];
+ GrainType u_block_buffer_[kChromaBlockSize];
+ GrainType v_block_buffer_[kChromaBlockSize];
+ GrainType base_u_block_buffer_[kChromaBlockSize];
+ GrainType base_v_block_buffer_[kChromaBlockSize];
+};
+
+template <int bitdepth>
+void AutoRegressionTestChroma<bitdepth>::TestAutoRegressiveFilterChroma(
+ int coeff_lag, int subsampling_x, int subsampling_y, int num_runs,
+ bool saturate, bool compare) {
+ if (chroma_auto_regression_func_ == nullptr) return;
+ // Compare is only needed for NEON tests to compare with C output.
+ if (base_chroma_auto_regression_func_ == nullptr && compare) return;
+
+ // This function uses the first set of captured film grain params as a
+ // baseline; the test param generators provide coverage of |coeff_lag| and
+ // the subsampling modes.
+ FilmGrainParams params = kFilmGrainParams[0];
+ params.auto_regression_coeff_lag = coeff_lag;
+ const int grain_max = GetGrainMax<bitdepth>();
+ const int grain_min = GetGrainMin<bitdepth>();
+ const int chroma_width =
+ (subsampling_x != 0) ? kMinChromaWidth : kMaxChromaWidth;
+ const int chroma_height =
+ (subsampling_y != 0) ? kMinChromaHeight : kMaxChromaHeight;
+ if (saturate) {
+ memset(params.auto_regression_coeff_u, 127,
+ sizeof(params.auto_regression_coeff_u));
+ memset(params.auto_regression_coeff_v, 127,
+ sizeof(params.auto_regression_coeff_v));
+ for (int y = 0; y < kLumaHeight; ++y) {
+ for (int x = 0; x < kLumaWidth; ++x) {
+ // This loop relies on the fact that kMaxChromaWidth == kLumaWidth.
+ luma_block_buffer_[y * kLumaWidth + x] = grain_max;
+ u_block_buffer_[y * kLumaWidth + x] = grain_max;
+ v_block_buffer_[y * kLumaWidth + x] = grain_max;
+ }
+ }
+ } else {
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ // Allow any valid grain values.
+ const int random_range = grain_max - grain_min + 1;
+ for (int y = 0; y < kLumaHeight; ++y) {
+ for (int x = 0; x < kLumaWidth; ++x) {
+ // This loop relies on the fact that kMaxChromaWidth == kLumaWidth.
+ const int random_y = rnd(random_range);
+ luma_block_buffer_[y * kLumaWidth + x] = random_y + grain_min;
+ const int random_u = rnd(random_range);
+ u_block_buffer_[y * kLumaWidth + x] = random_u + grain_min;
+ const int random_v = rnd(random_range);
+ v_block_buffer_[y * kLumaWidth + x] = random_v + grain_min;
+ }
+ }
+ }
+ if (compare) {
+ memcpy(base_u_block_buffer_, u_block_buffer_, sizeof(u_block_buffer_));
+ memcpy(base_v_block_buffer_, v_block_buffer_, sizeof(v_block_buffer_));
+ }
+
+ const absl::Time start = absl::Now();
+ for (int i = 0; i < num_runs; ++i) {
+ chroma_auto_regression_func_(params, luma_block_buffer_, subsampling_x,
+ subsampling_y, u_block_buffer_,
+ v_block_buffer_);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ if (num_runs > 1) {
+ printf("AutoRegressionChroma lag=%d, sub_x=%d, sub_y=%d: %d us\n",
+ coeff_lag, subsampling_x, subsampling_y,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ return;
+ }
+ if (compare) {
+ base_chroma_auto_regression_func_(params, luma_block_buffer_, subsampling_x,
+ subsampling_y, base_u_block_buffer_,
+ base_v_block_buffer_);
+ EXPECT_TRUE(test_utils::CompareBlocks(u_block_buffer_, base_u_block_buffer_,
+ chroma_width, chroma_height,
+ chroma_width, chroma_width, false));
+ EXPECT_TRUE(test_utils::CompareBlocks(v_block_buffer_, base_v_block_buffer_,
+ chroma_width, chroma_height,
+ chroma_width, chroma_width, false));
+ } else {
+ test_utils::CheckMd5Digest(
+ "FilmGrain",
+ absl::StrFormat("AutoRegressionChromaU lag=%d, sub_x=%d, sub_y=%d",
+ coeff_lag, subsampling_x, subsampling_y)
+ .c_str(),
+ GetARTestDigestChromaU(bitdepth, coeff_lag, subsampling_x,
+ subsampling_y),
+ u_block_buffer_, sizeof(u_block_buffer_), elapsed_time);
+ test_utils::CheckMd5Digest(
+ "FilmGrain",
+ absl::StrFormat("AutoRegressionChromaV lag=%d, sub_x=%d, sub_y=%d",
+ coeff_lag, subsampling_x, subsampling_y)
+ .c_str(),
+ GetARTestDigestChromaV(bitdepth, coeff_lag, subsampling_x,
+ subsampling_y),
+ v_block_buffer_, sizeof(v_block_buffer_), elapsed_time);
+ }
+}
+
+using AutoRegressionTestChroma8bpp = AutoRegressionTestChroma<8>;
+
+TEST_P(AutoRegressionTestChroma8bpp, AutoRegressiveFilterChroma) {
+ AutoRegressionChromaTestParam test_param(GetParam());
+ TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+ test_param.subsampling_y, 1,
+ /*saturate=*/false,
+ /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestChroma8bpp, AutoRegressiveFilterChromaSaturated) {
+ AutoRegressionChromaTestParam test_param(GetParam());
+ TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+ test_param.subsampling_y, 1, /*saturate=*/true,
+ /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestChroma8bpp, DISABLED_Speed) {
+ AutoRegressionChromaTestParam test_param(GetParam());
+ TestAutoRegressiveFilterChroma(
+ test_param.coeff_lag, test_param.subsampling_x, test_param.subsampling_y,
+      // Subsampling halves each subsampled dimension of the chroma blocks, so
+      // double the run count per subsampled dimension to compensate.
+ 1e5 * (1 << (test_param.subsampling_y + test_param.subsampling_x)),
+ /*saturate=*/false, /*compare=*/false);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using AutoRegressionTestChroma10bpp = AutoRegressionTestChroma<10>;
+
+TEST_P(AutoRegressionTestChroma10bpp, AutoRegressiveFilterChroma) {
+ AutoRegressionChromaTestParam test_param(GetParam());
+ TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+ test_param.subsampling_y, 1,
+ /*saturate=*/false,
+ /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestChroma10bpp, AutoRegressiveFilterChromaSaturated) {
+ AutoRegressionChromaTestParam test_param(GetParam());
+ TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+ test_param.subsampling_y, 1, /*saturate=*/true,
+ /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestChroma10bpp, DISABLED_Speed) {
+ AutoRegressionChromaTestParam test_param(GetParam());
+ TestAutoRegressiveFilterChroma(
+ test_param.coeff_lag, test_param.subsampling_x, test_param.subsampling_y,
+      // Subsampling halves each subsampled dimension of the chroma blocks, so
+      // double the run count per subsampled dimension to compensate.
+ 1e5 * (1 << (test_param.subsampling_y + test_param.subsampling_x)),
+ /*saturate=*/false, /*compare=*/false);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using AutoRegressionTestChroma12bpp = AutoRegressionTestChroma<12>;
+
+TEST_P(AutoRegressionTestChroma12bpp, AutoRegressiveFilterChroma) {
+ AutoRegressionChromaTestParam test_param(GetParam());
+ TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+ test_param.subsampling_y, 1,
+ /*saturate=*/false,
+ /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestChroma12bpp, AutoRegressiveFilterChromaSaturated) {
+ AutoRegressionChromaTestParam test_param(GetParam());
+ TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+ test_param.subsampling_y, 1, /*saturate=*/true,
+ /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestChroma12bpp, DISABLED_Speed) {
+ AutoRegressionChromaTestParam test_param(GetParam());
+ TestAutoRegressiveFilterChroma(
+ test_param.coeff_lag, test_param.subsampling_x, test_param.subsampling_y,
+      // Subsampling halves each subsampled dimension of the chroma blocks, so
+      // double the run count per subsampled dimension to compensate.
+ 1e5 * (1 << (test_param.subsampling_y + test_param.subsampling_x)),
+ /*saturate=*/false, /*compare=*/false);
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AutoRegressionTestChroma8bpp,
+    testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+                     testing::Range(0, 3) /* subsampling */));
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(
+    C, AutoRegressionTestChroma10bpp,
+    testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+                     testing::Range(0, 3) /* subsampling */));
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(
+    C, AutoRegressionTestChroma12bpp,
+    testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+                     testing::Range(0, 3) /* subsampling */));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AutoRegressionTestChroma8bpp,
+    testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+                     testing::Range(0, 3) /* subsampling */));
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AutoRegressionTestChroma10bpp,
+    testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+                     testing::Range(0, 3) /* subsampling */));
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#endif // LIBGAV1_ENABLE_NEON
+
+template <int bitdepth>
+class GrainGenerationTest : public testing::TestWithParam<int> {
+ protected:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ using GrainType =
+ typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+ void TestGenerateGrainLuma(int param_index, int num_runs);
+
+ GrainType luma_block_buffer_[kLumaBlockSize];
+};
+
+template <int bitdepth>
+void GrainGenerationTest<bitdepth>::TestGenerateGrainLuma(int param_index,
+ int num_runs) {
+ FilmGrainParams params = kFilmGrainParams[param_index];
+
+ const absl::Time start = absl::Now();
+ for (int i = 0; i < num_runs; ++i) {
+ FilmGrain<bitdepth>::GenerateLumaGrain(params, luma_block_buffer_);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ if (num_runs == 1) {
+ test_utils::CheckMd5Digest(
+ "FilmGrain",
+ absl::StrFormat("GenerateGrainLuma param_index=%d", param_index)
+ .c_str(),
+ GetGrainGenerationTestDigestLuma(bitdepth, param_index),
+ luma_block_buffer_, sizeof(luma_block_buffer_), elapsed_time);
+ } else {
+ printf("GenerateGrainLuma param_index=%d: %d us\n", param_index,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+}
+
+using GrainGenerationTest8bpp = GrainGenerationTest<8>;
+
+TEST_P(GrainGenerationTest8bpp, GenerateGrainLuma) {
+ TestGenerateGrainLuma(GetParam(), 1);
+}
+
+TEST_P(GrainGenerationTest8bpp, DISABLED_LumaSpeed) {
+ TestGenerateGrainLuma(GetParam(), 1e5);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using GrainGenerationTest10bpp = GrainGenerationTest<10>;
+
+TEST_P(GrainGenerationTest10bpp, GenerateGrainLuma) {
+ TestGenerateGrainLuma(GetParam(), 1);
+}
+
+TEST_P(GrainGenerationTest10bpp, DISABLED_LumaSpeed) {
+ TestGenerateGrainLuma(GetParam(), 1e5);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using GrainGenerationTest12bpp = GrainGenerationTest<12>;
+
+TEST_P(GrainGenerationTest12bpp, GenerateGrainLuma) {
+ TestGenerateGrainLuma(GetParam(), 1);
+}
+
+TEST_P(GrainGenerationTest12bpp, DISABLED_LumaSpeed) {
+ TestGenerateGrainLuma(GetParam(), 1e5);
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest8bpp,
+ testing::Range(0, 10) /* param_index */);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest10bpp,
+ testing::Range(0, 10) /* param_index */);
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest12bpp,
+ testing::Range(0, 10) /* param_index */);
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+// This param type is used for both ConstructStripesTest and
+// ConstructImageTest.
+struct ConstructNoiseTestParam {
+ explicit ConstructNoiseTestParam(const std::tuple<int, int>& in)
+ : overlap_flag(std::get<0>(in)) {
+ switch (std::get<1>(in)) {
+ case 0:
+ subsampling_x = 0;
+ subsampling_y = 0;
+ break;
+ case 1:
+ subsampling_x = 1;
+ subsampling_y = 0;
+ break;
+ default:
+ assert(std::get<1>(in) == 2);
+ subsampling_x = 1;
+ subsampling_y = 1;
+ }
+ }
+ const int overlap_flag;
+ int subsampling_x;
+ int subsampling_y;
+};
+
+template <int bitdepth>
+class ConstructStripesTest
+ : public testing::TestWithParam<std::tuple<int, int>> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ using GrainType =
+ typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+ ConstructStripesTest() {
+ FilmGrainInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ base_construct_noise_stripes_func_ =
+ dsp->film_grain.construct_noise_stripes[std::get<0>(GetParam())];
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_construct_noise_stripes_func_ = nullptr;
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+ FilmGrainInit_NEON();
+#endif
+ }
+ construct_noise_stripes_func_ =
+ dsp->film_grain.construct_noise_stripes[std::get<0>(GetParam())];
+ }
+
+ ~ConstructStripesTest() override = default;
+
+ protected:
+ // |compare| determines whether to compare the output blocks from the SIMD
+ // implementation, if used, and the C implementation.
+ // |saturate| determines whether to set the inputs to maximum values. This is
+ // intended primarily as a way to simplify differences in output when
+ // debugging.
+ void TestConstructNoiseStripes(int overlap_flag, int subsampling_x,
+ int subsampling_y, int num_runs, bool saturate,
+ bool compare);
+ ConstructNoiseStripesFunc construct_noise_stripes_func_;
+ ConstructNoiseStripesFunc base_construct_noise_stripes_func_;
+ GrainType grain_buffer_[kLumaBlockSize];
+ Array2DView<GrainType> noise_stripes_;
+ // Owns the memory that noise_stripes_ points to.
+ std::unique_ptr<GrainType[]> stripe_buffer_;
+ Array2DView<GrainType> base_noise_stripes_;
+  // Owns the memory that base_noise_stripes_ points to.
+ std::unique_ptr<GrainType[]> base_stripe_buffer_;
+};
+
+template <int bitdepth>
+void ConstructStripesTest<bitdepth>::TestConstructNoiseStripes(
+ int overlap_flag, int subsampling_x, int subsampling_y, int num_runs,
+ bool saturate, bool compare) {
+ if (construct_noise_stripes_func_ == nullptr) return;
+  // Comparison is only meaningful for SIMD (NEON) tests, which are checked
+  // against the C output.
+ if (base_construct_noise_stripes_func_ == nullptr && compare) return;
+
+ const int stripe_width = ((kFrameWidth + subsampling_x) >> subsampling_x);
+ const int stripe_height = kNoiseStripeHeight;
+ const int stripe_size = stripe_height * stripe_width;
+ const int stripe_buffer_size = stripe_size * kNumTestStripes;
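+  // Each stripe occupies |stripe_size| contiguous values with a row stride of
+  // |stripe_width|; the single kNoiseStripePadding tail protects the end of
+  // the buffer from overreads.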
+ if (compare) {
+ base_stripe_buffer_.reset(new (
+ std::nothrow) GrainType[stripe_buffer_size + kNoiseStripePadding]());
+ ASSERT_NE(base_stripe_buffer_, nullptr);
+ base_noise_stripes_.Reset(kNumTestStripes, stripe_size,
+ base_stripe_buffer_.get());
+ }
+ stripe_buffer_.reset(
+ new (std::nothrow) GrainType[stripe_buffer_size + kNoiseStripePadding]());
+ ASSERT_NE(stripe_buffer_, nullptr);
+ noise_stripes_.Reset(kNumTestStripes, stripe_size, stripe_buffer_.get());
+
+ const int grain_max = GetGrainMax<bitdepth>();
+ const int grain_min = GetGrainMin<bitdepth>();
+ if (saturate) {
+ for (int y = 0; y < kLumaHeight; ++y) {
+ for (int x = 0; x < kLumaWidth; ++x) {
+ grain_buffer_[y * kLumaWidth + x] = grain_max;
+ }
+ }
+ } else {
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ // Allow any valid grain values.
+ const int random_range = grain_max - grain_min + 1;
+ for (int y = 0; y < kLumaHeight; ++y) {
+ for (int x = 0; x < kLumaWidth; ++x) {
+ grain_buffer_[y * kLumaWidth + x] = grain_min + rnd(random_range);
+ }
+ }
+ }
+
+ const absl::Time start = absl::Now();
+ for (int i = 0; i < num_runs; ++i) {
+    construct_noise_stripes_func_(grain_buffer_, /*grain_seed=*/68,
+                                  kFrameWidth, kFrameHeight, subsampling_x,
+                                  subsampling_y, &noise_stripes_);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ if (num_runs > 1) {
+ printf(
+ "ConstructNoiseStripes Speed Test for overlap=%d, sub_x=%d, "
+ "sub_y=%d: %d us\n",
+ overlap_flag, subsampling_x, subsampling_y,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ return;
+ }
+ if (compare) {
+    base_construct_noise_stripes_func_(grain_buffer_, /*grain_seed=*/68,
+                                       kFrameWidth, kFrameHeight,
+                                       subsampling_x, subsampling_y,
+                                       &base_noise_stripes_);
+
+ constexpr int kCompareWidth = 64;
+    for (int stripe = 0; stripe < kNumTestStripes; ++stripe) {
+ EXPECT_TRUE(test_utils::CompareBlocks(
+ noise_stripes_[stripe], base_noise_stripes_[stripe], kCompareWidth,
+ stripe_height, stripe_width, stripe_width, /*check_padding=*/false,
+ /*print_diff=*/false));
+ }
+ } else {
+ test_utils::CheckMd5Digest(
+ "FilmGrain",
+ absl::StrFormat("ConstructNoiseStripes overlap=%d, sub_x=%d, sub_y=%d",
+ overlap_flag, subsampling_x, subsampling_y)
+ .c_str(),
+ GetConstructStripesTestDigest(bitdepth, overlap_flag, subsampling_x,
+ subsampling_y),
+ noise_stripes_[0], stripe_buffer_size, elapsed_time);
+ }
+}
+
+using ConstructStripesTest8bpp = ConstructStripesTest<8>;
+
+TEST_P(ConstructStripesTest8bpp, RandomValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/false, /*compare=*/false);
+}
+
+TEST_P(ConstructStripesTest8bpp, SaturatedValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructStripesTest8bpp, DISABLED_Speed) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/500,
+ /*saturate=*/false, /*compare=*/false);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using ConstructStripesTest10bpp = ConstructStripesTest<10>;
+
+TEST_P(ConstructStripesTest10bpp, RandomValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/false, /*compare=*/false);
+}
+
+TEST_P(ConstructStripesTest10bpp, SaturatedValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructStripesTest10bpp, DISABLED_Speed) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/500,
+ /*saturate=*/false, /*compare=*/false);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ConstructStripesTest12bpp = ConstructStripesTest<12>;
+
+TEST_P(ConstructStripesTest12bpp, RandomValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/false, /*compare=*/false);
+}
+
+TEST_P(ConstructStripesTest12bpp, SaturatedValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructStripesTest12bpp, DISABLED_Speed) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/500,
+ /*saturate=*/false, /*compare=*/false);
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+INSTANTIATE_TEST_SUITE_P(C, ConstructStripesTest8bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, ConstructStripesTest10bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, ConstructStripesTest12bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+template <int bitdepth>
+class ConstructImageTest : public testing::TestWithParam<std::tuple<int, int>> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ using GrainType =
+ typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+ ConstructImageTest() {
+ FilmGrainInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ base_construct_noise_image_overlap_func_ =
+ dsp->film_grain.construct_noise_image_overlap;
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_construct_noise_image_overlap_func_ = nullptr;
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+ FilmGrainInit_NEON();
+#endif
+ }
+ construct_noise_image_overlap_func_ =
+ dsp->film_grain.construct_noise_image_overlap;
+ }
+
+ ~ConstructImageTest() override = default;
+
+ protected:
+ // |compare| determines whether to compare the output blocks from the SIMD
+ // implementation, if used, and the C implementation.
+ // |saturate| determines whether to set the inputs to maximum values. This is
+ // intended primarily as a way to simplify differences in output when
+ // debugging.
+ void TestConstructNoiseImage(int overlap_flag, int subsampling_x,
+ int subsampling_y, int num_runs, bool saturate,
+ bool compare);
+ ConstructNoiseImageOverlapFunc construct_noise_image_overlap_func_;
+ ConstructNoiseImageOverlapFunc base_construct_noise_image_overlap_func_;
+ Array2DView<GrainType> noise_stripes_;
+ // Owns the memory that noise_stripes_ points to.
+ std::unique_ptr<GrainType[]> stripe_buffer_;
+ Array2D<GrainType> noise_image_;
+ Array2D<GrainType> base_noise_image_;
+};
+
+template <int bitdepth>
+void ConstructImageTest<bitdepth>::TestConstructNoiseImage(
+ int overlap_flag, int subsampling_x, int subsampling_y, int num_runs,
+ bool saturate, bool compare) {
+ if (construct_noise_image_overlap_func_ == nullptr) return;
+  // Comparison is only meaningful for SIMD (NEON) tests, which are checked
+  // against the C output.
+ if (base_construct_noise_image_overlap_func_ == nullptr && compare) return;
+
+ const int image_width = ((kFrameWidth + subsampling_x) >> subsampling_x);
+ const int image_height = ((kFrameHeight + subsampling_y) >> subsampling_y);
+ const int stripe_height =
+ ((kNoiseStripeHeight + subsampling_y) >> subsampling_y);
+ const int image_stride = image_width + kNoiseImagePadding;
+ const int stripe_size = stripe_height * image_width;
+ if (compare) {
+ ASSERT_TRUE(base_noise_image_.Reset(image_height, image_stride,
+ /*zero_initialize=*/false));
+ }
+ ASSERT_TRUE(noise_image_.Reset(image_height, image_stride,
+ /*zero_initialize=*/false));
+ // Stride between stripe rows is |image_width|. Padding is only at the
+ // end of the final row of the final stripe to protect from overreads.
+ stripe_buffer_.reset(
+ new (std::nothrow)
+ GrainType[kNumTestStripes * stripe_size + kNoiseStripePadding]);
+ ASSERT_NE(stripe_buffer_, nullptr);
+ noise_stripes_.Reset(kNumTestStripes, stripe_size, stripe_buffer_.get());
+
+ const int grain_max = GetGrainMax<bitdepth>();
+ const int grain_min = GetGrainMin<bitdepth>();
+ if (saturate) {
+ for (int i = 0; i < stripe_size; ++i) {
+ noise_stripes_[0][i] = grain_max;
+ }
+ for (int stripe = 1; stripe < kNumTestStripes; ++stripe) {
+ memcpy(noise_stripes_[stripe], noise_stripes_[0],
+ stripe_size * sizeof(noise_stripes_[0][0]));
+ }
+ } else {
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ // Allow any valid grain values.
+ const int random_range = grain_max - grain_min + 1;
+ for (int stripe = 0; stripe < kNumTestStripes; ++stripe) {
+ // Assign all allocated memory for this stripe.
+ for (int i = 0; i < stripe_height; ++i) {
+ for (int x = 0; x < image_width; ++x) {
+ noise_stripes_[stripe][i * image_width + x] =
+ grain_min + rnd(random_range);
+ }
+ }
+ }
+ }
+
+ const absl::Time start = absl::Now();
+ for (int i = 0; i < num_runs; ++i) {
+ FilmGrain<bitdepth>::ConstructNoiseImage(
+ &noise_stripes_, kFrameWidth, kFrameHeight, subsampling_x,
+ subsampling_y, overlap_flag << (1 - subsampling_y), &noise_image_);
+ if (overlap_flag == 1) {
+ construct_noise_image_overlap_func_(&noise_stripes_, kFrameWidth,
+ kFrameHeight, subsampling_x,
+ subsampling_y, &noise_image_);
+ }
+ }
+
+ const absl::Duration elapsed_time = absl::Now() - start;
+ if (num_runs > 1) {
+ printf(
+ "ConstructNoiseImage Speed Test for overlap=%d, sub_x=%d, "
+ "sub_y=%d: %d us\n",
+ overlap_flag, subsampling_x, subsampling_y,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ return;
+ }
+ if (compare) {
+ FilmGrain<bitdepth>::ConstructNoiseImage(
+ &noise_stripes_, kFrameWidth, kFrameHeight, subsampling_x,
+ subsampling_y, overlap_flag << (1 - subsampling_y), &base_noise_image_);
+ if (overlap_flag == 1) {
+ base_construct_noise_image_overlap_func_(
+ &noise_stripes_, kFrameWidth, kFrameHeight, subsampling_x,
+ subsampling_y, &base_noise_image_);
+ }
+ constexpr int kCompareWidth = 72;
+ constexpr int kCompareHeight = 72;
+ EXPECT_TRUE(test_utils::CompareBlocks(
+ noise_image_[0], base_noise_image_[0], kCompareWidth, kCompareHeight,
+ image_stride, image_stride, /*check_padding=*/false,
+ /*print_diff=*/false));
+ } else {
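+    // Print the computed digest; handy when the reference digests need
+    // updating.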
+ printf("BD%d \"%s\",\n", bitdepth,
+ test_utils::GetMd5Sum(noise_image_[0], image_width, image_height,
+ image_stride)
+ .c_str());
+ test_utils::CheckMd5Digest(
+ "FilmGrain",
+ absl::StrFormat("ConstructNoiseImage overlap=%d, sub_x=%d, sub_y=%d",
+ overlap_flag, subsampling_x, subsampling_y)
+ .c_str(),
+ GetConstructImageTestDigest(bitdepth, overlap_flag, subsampling_x,
+ subsampling_y),
+ noise_image_[0], image_width, image_height, image_stride, elapsed_time);
+ }
+}
+
+using ConstructImageTest8bpp = ConstructImageTest<8>;
+
+TEST_P(ConstructImageTest8bpp, RandomValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/false, /*compare=*/false);
+}
+
+TEST_P(ConstructImageTest8bpp, SaturatedValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructImageTest8bpp, DISABLED_Speed) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/500,
+ /*saturate=*/false, /*compare=*/false);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using ConstructImageTest10bpp = ConstructImageTest<10>;
+
+TEST_P(ConstructImageTest10bpp, RandomValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/false, /*compare=*/false);
+}
+
+TEST_P(ConstructImageTest10bpp, SaturatedValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructImageTest10bpp, DISABLED_Speed) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/500,
+ /*saturate=*/false, /*compare=*/false);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ConstructImageTest12bpp = ConstructImageTest<12>;
+
+TEST_P(ConstructImageTest12bpp, RandomValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/false, /*compare=*/false);
+}
+
+TEST_P(ConstructImageTest12bpp, SaturatedValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructImageTest12bpp, DISABLED_Speed) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/500,
+ /*saturate=*/false, /*compare=*/false);
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+INSTANTIATE_TEST_SUITE_P(C, ConstructImageTest8bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ConstructImageTest8bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#endif // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, ConstructImageTest10bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, ConstructImageTest12bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+template <int bitdepth>
+class ScalingLookupTableTest : public testing::TestWithParam<int> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ ScalingLookupTableTest() {
+ test_utils::ResetDspTable(bitdepth);
+ FilmGrainInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+ FilmGrainInit_NEON();
+#endif
+ }
+ initialize_func_ = dsp->film_grain.initialize_scaling_lut;
+ }
+ ~ScalingLookupTableTest() override = default;
+
+ protected:
+ void TestSpeed(int num_runs);
+ void ZeroPoints();
+
+ private:
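+  // The scaling lookup table grows 4x for every 2 extra bits of depth (the
+  // << (bitdepth - 8)): 8bpp uses the base size, 10bpp 4x, and 12bpp 16x.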
+ static constexpr int kScalingLutBufferLength =
+ (kScalingLookupTableSize + kScalingLookupTablePadding) << (bitdepth - 8);
+ dsp::InitializeScalingLutFunc initialize_func_;
+ int16_t scaling_lut_[kScalingLutBufferLength];
+};
+
+template <int bitdepth>
+void ScalingLookupTableTest<bitdepth>::TestSpeed(int num_runs) {
+ if (initialize_func_ == nullptr) return;
+ const int param_index = GetParam();
+ const FilmGrainParams& params = kFilmGrainParams[param_index];
+  // Clear the buffer outside the timed region.
+  Memset(scaling_lut_, 0, kScalingLutBufferLength);
+  const absl::Time start = absl::Now();
+ for (int i = 0; i < num_runs; ++i) {
+ initialize_func_(params.num_y_points, params.point_y_value,
+ params.point_y_scaling, scaling_lut_,
+ kScalingLutBufferLength);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ if (num_runs > 1) {
+ printf("InitializeScalingLut: %d us\n",
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ return;
+ }
+ test_utils::CheckMd5Digest(
+ "FilmGrain",
+ absl::StrFormat("InitializeScalingLut for param set: %d", param_index)
+ .c_str(),
+ GetScalingInitTestDigest(param_index, bitdepth), scaling_lut_,
+ (sizeof(scaling_lut_[0]) * kScalingLookupTableSize) << (bitdepth - 8),
+ elapsed_time);
+}
+
+template <int bitdepth>
+void ScalingLookupTableTest<bitdepth>::ZeroPoints() {
+ if (initialize_func_ == nullptr) return;
+ const int param_index = GetParam();
+ const FilmGrainParams& params = kFilmGrainParams[param_index];
+ initialize_func_(0, params.point_y_value, params.point_y_scaling,
+ scaling_lut_, kScalingLookupTableSize);
+ for (int i = 0; i < kScalingLookupTableSize; ++i) {
+ ASSERT_EQ(scaling_lut_[i], 0);
+ }
+}
+
+using ScalingLookupTableTest8bpp = ScalingLookupTableTest<8>;
+
+TEST_P(ScalingLookupTableTest8bpp, ZeroPoints) { ZeroPoints(); }
+
+TEST_P(ScalingLookupTableTest8bpp, Correctness) { TestSpeed(/*num_runs=*/1); }
+
+TEST_P(ScalingLookupTableTest8bpp, DISABLED_Speed) {
+ TestSpeed(/*num_runs=*/1e5);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using ScalingLookupTableTest10bpp = ScalingLookupTableTest<10>;
+
+TEST_P(ScalingLookupTableTest10bpp, ZeroPoints) { ZeroPoints(); }
+
+TEST_P(ScalingLookupTableTest10bpp, Correctness) { TestSpeed(/*num_runs=*/1); }
+
+TEST_P(ScalingLookupTableTest10bpp, DISABLED_Speed) {
+ TestSpeed(/*num_runs=*/1e5);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ScalingLookupTableTest12bpp = ScalingLookupTableTest<12>;
+
+TEST_P(ScalingLookupTableTest12bpp, ZeroPoints) { ZeroPoints(); }
+
+TEST_P(ScalingLookupTableTest12bpp, Correctness) { TestSpeed(/*num_runs=*/1); }
+
+TEST_P(ScalingLookupTableTest12bpp, DISABLED_Speed) {
+ TestSpeed(/*num_runs=*/1e5);
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+INSTANTIATE_TEST_SUITE_P(C, ScalingLookupTableTest8bpp,
+ testing::Range(0, kNumFilmGrainTestParams));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ScalingLookupTableTest8bpp,
+ testing::Range(0, kNumFilmGrainTestParams));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, ScalingLookupTableTest10bpp,
+ testing::Range(0, kNumFilmGrainTestParams));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ScalingLookupTableTest10bpp,
+ testing::Range(0, kNumFilmGrainTestParams));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, ScalingLookupTableTest12bpp,
+ testing::Range(0, kNumFilmGrainTestParams));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+struct BlendNoiseTestParam {
+ explicit BlendNoiseTestParam(const std::tuple<int, int>& in)
+ : chroma_scaling_from_luma(std::get<0>(in)) {
+ switch (std::get<1>(in)) {
+ case 0:
+ subsampling_x = 0;
+ subsampling_y = 0;
+ break;
+ case 1:
+ subsampling_x = 1;
+ subsampling_y = 0;
+ break;
+ default:
+ assert(std::get<1>(in) == 2);
+ subsampling_x = 1;
+ subsampling_y = 1;
+ }
+ }
+ const int chroma_scaling_from_luma;
+ int subsampling_x;
+ int subsampling_y;
+};
+
+template <int bitdepth, typename Pixel>
+class BlendNoiseTest : public testing::TestWithParam<std::tuple<int, int>> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ using GrainType =
+ typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+ ~BlendNoiseTest() override = default;
+
+ protected:
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ FilmGrainInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+ FilmGrainInit_NEON();
+#endif
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ FilmGrainInit_SSE4_1();
+ }
+ const BlendNoiseTestParam test_param(GetParam());
+ chroma_scaling_from_luma_ = test_param.chroma_scaling_from_luma;
+ blend_luma_func_ = dsp->film_grain.blend_noise_luma;
+ blend_chroma_func_ =
+ dsp->film_grain.blend_noise_chroma[chroma_scaling_from_luma_];
+ subsampling_x_ = test_param.subsampling_x;
+ subsampling_y_ = test_param.subsampling_y;
+
+ uv_width_ = (width_ + subsampling_x_) >> subsampling_x_;
+ uv_height_ = (height_ + subsampling_y_) >> subsampling_y_;
+ uv_stride_ = uv_width_ * sizeof(Pixel);
+ y_stride_ = width_ * sizeof(Pixel);
+ const size_t buffer_size =
+ sizeof(Pixel) * (width_ * height_ + 2 * uv_width_ * uv_height_ +
+ 3 * kBorderPixelsFilmGrain);
+ source_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]);
+ memset(source_buffer_.get(), 0, sizeof(source_buffer_[0]) * buffer_size);
+ dest_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]);
+ memset(dest_buffer_.get(), 0, sizeof(dest_buffer_[0]) * buffer_size);
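+    // The Y, U, and V planes are laid out back to back, each followed by a
+    // kBorderPixelsFilmGrain-byte gap so optimized code may read slightly past
+    // the end of a plane without leaving the allocation.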
+ source_plane_y_ = source_buffer_.get();
+ source_plane_u_ =
+ source_plane_y_ + y_stride_ * height_ + kBorderPixelsFilmGrain;
+ source_plane_v_ =
+ source_plane_u_ + uv_stride_ * uv_height_ + kBorderPixelsFilmGrain;
+ dest_plane_y_ = dest_buffer_.get();
+ dest_plane_u_ =
+ dest_plane_y_ + y_stride_ * height_ + kBorderPixelsFilmGrain;
+ dest_plane_v_ =
+ dest_plane_u_ + uv_stride_ * uv_height_ + kBorderPixelsFilmGrain;
+ }
+
+ void TestSpeed(int num_runs);
+
+ private:
+ static constexpr int kScalingLutBufferLength =
+ (kScalingLookupTableSize + kScalingLookupTablePadding) << 2;
+
+ void ConvertScalingLut10bpp(int16_t* scaling_lut_10bpp,
+ const int16_t* src_scaling_lut);
+ dsp::BlendNoiseWithImageLumaFunc blend_luma_func_;
+ dsp::BlendNoiseWithImageChromaFunc blend_chroma_func_;
+
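+  // Odd frame dimensions exercise the rounding of the subsampled plane sizes.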
+ const int width_ = 1921;
+ const int height_ = 1081;
+ int chroma_scaling_from_luma_ = 0;
+ int subsampling_x_ = 0;
+ int subsampling_y_ = 0;
+ int uv_width_ = 0;
+ int uv_height_ = 0;
+ int uv_stride_ = 0;
+ int y_stride_ = 0;
+ // This holds the data that |source_plane_y_|, |source_plane_u_|, and
+ // |source_plane_v_| point to.
+ std::unique_ptr<uint8_t[]> source_buffer_;
+ // This holds the data that |dest_plane_y_|, |dest_plane_u_|, and
+ // |dest_plane_v_| point to.
+ std::unique_ptr<uint8_t[]> dest_buffer_;
+ uint8_t* source_plane_y_ = nullptr;
+ uint8_t* source_plane_u_ = nullptr;
+ uint8_t* source_plane_v_ = nullptr;
+ uint8_t* dest_plane_y_ = nullptr;
+ uint8_t* dest_plane_u_ = nullptr;
+ uint8_t* dest_plane_v_ = nullptr;
+ Array2D<GrainType> noise_image_[kMaxPlanes];
+ int16_t scaling_lut_10bpp_y_[kScalingLutBufferLength];
+ int16_t scaling_lut_10bpp_u_[kScalingLutBufferLength];
+ int16_t scaling_lut_10bpp_v_[kScalingLutBufferLength];
+};
+
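+// Expands an 8bpp scaling lookup table to 10bpp resolution: entry |i| maps to
+// the four output entries 4 * i .. 4 * i + 3, linearly interpolated toward
+// entry |i + 1|. For example, start = 100 and end = 104 produce 100, 101,
+// 102, 103.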
+template <int bitdepth, typename Pixel>
+void BlendNoiseTest<bitdepth, Pixel>::ConvertScalingLut10bpp(
+ int16_t* scaling_lut_10bpp, const int16_t* src_scaling_lut) {
+ for (int i = 0; i < kScalingLookupTableSize - 1; ++i) {
+ const int x_base = i << 2;
+ const int start = src_scaling_lut[i];
+ const int end_index = std::min(i + 1, kScalingLookupTableSize - 1);
+ const int end = src_scaling_lut[end_index];
+ const int delta = end - start;
+ scaling_lut_10bpp[x_base] = start;
+ scaling_lut_10bpp[x_base + 1] = start + RightShiftWithRounding(delta, 2);
+ scaling_lut_10bpp[x_base + 2] =
+ start + RightShiftWithRounding(2 * delta, 2);
+ scaling_lut_10bpp[x_base + 3] =
+ start + RightShiftWithRounding(3 * delta, 2);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void BlendNoiseTest<bitdepth, Pixel>::TestSpeed(const int num_runs) {
+ if (blend_chroma_func_ == nullptr || blend_luma_func_ == nullptr) return;
+ // Allow optimized code to read into the border without generating MSan
+ // warnings. This matches the behavior in FilmGrain::AllocateNoiseImage().
+ constexpr bool zero_initialize = LIBGAV1_MSAN == 1;
+ ASSERT_TRUE(noise_image_[kPlaneY].Reset(height_, width_ + kNoiseImagePadding,
+ zero_initialize));
+ ASSERT_TRUE(noise_image_[kPlaneU].Reset(
+ uv_height_, uv_width_ + kNoiseImagePadding, zero_initialize));
+ ASSERT_TRUE(noise_image_[kPlaneV].Reset(
+ uv_height_, uv_width_ + kNoiseImagePadding, zero_initialize));
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ // Allow any valid grain values.
+ const int grain_max = GetGrainMax<bitdepth>();
+ const int grain_min = GetGrainMin<bitdepth>();
+ const int random_range = grain_max - grain_min + 1;
+ auto* src_y = reinterpret_cast<Pixel*>(source_plane_y_);
+ auto* src_u = reinterpret_cast<Pixel*>(source_plane_u_);
+ auto* src_v = reinterpret_cast<Pixel*>(source_plane_v_);
+ for (int y = 0; y < height_; ++y) {
+ for (int x = 0; x < width_; ++x) {
+ const int random_source_y = rnd(random_range);
+      // Populating the luma source ensures the lookup table is tested. The
+      // chroma planes are given identical values, because different values
+      // would artificially differentiate the outputs. Any difference in the
+      // outputs should come from the different scaling lookup tables, not
+      // from different inputs.
+ const int uv_y_pos = y >> subsampling_y_;
+ const int uv_x_pos = x >> subsampling_x_;
+ src_y[y * width_ + x] = random_source_y;
+ src_u[uv_y_pos * uv_width_ + uv_x_pos] = random_source_y;
+ src_v[uv_y_pos * uv_width_ + uv_x_pos] = random_source_y;
+ const int random_y = rnd(random_range);
+ noise_image_[kPlaneY][y][x] = random_y + grain_min;
+ const int random_u = rnd(random_range);
+ noise_image_[kPlaneU][uv_y_pos][uv_x_pos] = random_u + grain_min;
+ const int random_v = rnd(random_range);
+ noise_image_[kPlaneV][uv_y_pos][uv_x_pos] = random_v + grain_min;
+ }
+ }
+ static constexpr int16_t kTestScalingLutY[kScalingLookupTableSize] = {
+ 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 73,
+ 75, 76, 77, 79, 80, 81, 83, 84, 86, 87, 88, 90, 91, 92, 92,
+ 93, 93, 94, 95, 95, 96, 97, 97, 98, 98, 99, 99, 99, 99, 98,
+ 98, 98, 98, 98, 98, 98, 97, 97, 97, 97, 97, 97, 97, 97, 97,
+ 97, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
+ 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+ 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
+ 101, 101, 101, 101, 101, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+ 102, 102,
+ };
+ static constexpr int16_t kTestScalingLutU[kScalingLookupTableSize] = {
+ 30, 42, 53, 65, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
+ 75, 76, 78, 79, 81, 82, 83, 85, 86, 88, 89, 91, 92, 93, 93,
+ 94, 94, 95, 95, 96, 96, 97, 97, 98, 98, 99, 99, 99, 99, 99,
+ 99, 99, 99, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120,
+ 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110,
+ 98, 98, 98, 98, 98, 98, 98, 97, 97, 97, 97, 97, 97, 97, 97,
+ 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97,
+ 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 95,
+ 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
+ 95, 95,
+ };
+ static constexpr int16_t kTestScalingLutV[kScalingLookupTableSize] = {
+ 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 74, 74, 74,
+ 75, 75, 78, 79, 81, 82, 83, 85, 86, 88, 89, 91, 92, 93, 93,
+ 94, 94, 95, 95, 96, 96, 97, 97, 98, 98, 99, 99, 99, 99, 98,
+ 98, 98, 98, 98, 98, 98, 97, 97, 97, 97, 97, 97, 97, 97, 97,
+ 97, 97, 97, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
+ 98, 98, 98, 98, 98, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+ 100, 100, 100, 100, 100, 100, 100, 100, 101, 101, 101, 101, 101, 101, 101,
+ 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
+ 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
+ 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
+ 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
+ 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
+ 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
+ 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150,
+ 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180,
+ 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200,
+ 255, 255,
+ };
+
+  if (bitdepth == 10) {
+    // Each call fills the whole 10bpp table, so convert each LUT exactly once.
+    ConvertScalingLut10bpp(scaling_lut_10bpp_y_, kTestScalingLutY);
+    ConvertScalingLut10bpp(scaling_lut_10bpp_u_, kTestScalingLutU);
+    ConvertScalingLut10bpp(scaling_lut_10bpp_v_, kTestScalingLutV);
+  }
+ const FilmGrainParams& params = kFilmGrainParams[0];
+ const int min_value = 16 << (bitdepth - 8);
+ const int max_value = 235 << (bitdepth - 8);
+ const absl::Time start = absl::Now();
+ for (int i = 0; i < num_runs; ++i) {
+ if (chroma_scaling_from_luma_) {
+ blend_chroma_func_(
+ kPlaneU, params, noise_image_, min_value, max_value, width_, height_,
+ /*start_height=*/0, subsampling_x_, subsampling_y_,
+ (bitdepth == 10) ? scaling_lut_10bpp_y_ : kTestScalingLutY,
+ source_plane_y_, y_stride_, source_plane_u_, uv_stride_,
+ dest_plane_u_, uv_stride_);
+ blend_chroma_func_(
+ kPlaneV, params, noise_image_, min_value, max_value, width_, height_,
+ /*start_height=*/0, subsampling_x_, subsampling_y_,
+ (bitdepth == 10) ? scaling_lut_10bpp_y_ : kTestScalingLutY,
+ source_plane_y_, y_stride_, source_plane_v_, uv_stride_,
+ dest_plane_v_, uv_stride_);
+ } else {
+ blend_chroma_func_(
+ kPlaneU, params, noise_image_, min_value, max_value, width_, height_,
+ /*start_height=*/0, subsampling_x_, subsampling_y_,
+ (bitdepth == 10) ? scaling_lut_10bpp_u_ : kTestScalingLutU,
+ source_plane_y_, y_stride_, source_plane_u_, uv_stride_,
+ dest_plane_u_, uv_stride_);
+ blend_chroma_func_(
+ kPlaneV, params, noise_image_, min_value, max_value, width_, height_,
+ /*start_height=*/0, subsampling_x_, subsampling_y_,
+ (bitdepth == 10) ? scaling_lut_10bpp_v_ : kTestScalingLutV,
+ source_plane_y_, y_stride_, source_plane_v_, uv_stride_,
+ dest_plane_v_, uv_stride_);
+ }
+ blend_luma_func_(noise_image_, min_value, max_value, params.chroma_scaling,
+ width_, height_, /*start_height=*/0,
+ (bitdepth == 10) ? scaling_lut_10bpp_y_ : kTestScalingLutY,
+ source_plane_y_, y_stride_, dest_plane_y_, y_stride_);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ const char* digest_luma = GetBlendLumaTestDigest(bitdepth);
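+  // Print the computed digests; handy when the reference digests need
+  // updating.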
+ printf("YBD%d \"%s\",\n", bitdepth,
+ test_utils::GetMd5Sum(dest_plane_y_, y_stride_ * height_).c_str());
+ printf("UBD%d \"%s\",\n", bitdepth,
+ test_utils::GetMd5Sum(dest_plane_u_, uv_stride_ * uv_height_).c_str());
+ printf("VBD%d \"%s\",\n", bitdepth,
+ test_utils::GetMd5Sum(dest_plane_v_, uv_stride_ * uv_height_).c_str());
+ test_utils::CheckMd5Digest(
+ "BlendNoiseWithImage",
+ absl::StrFormat("Luma cfl=%d, sub_x=%d, sub_y=%d",
+ chroma_scaling_from_luma_, subsampling_x_, subsampling_y_)
+ .c_str(),
+ digest_luma, dest_plane_y_, y_stride_ * height_, elapsed_time);
+ const char* digest_chroma_u = GetBlendChromaUTestDigest(
+ bitdepth, chroma_scaling_from_luma_, subsampling_x_, subsampling_y_);
+ test_utils::CheckMd5Digest(
+ "BlendNoiseWithImage",
+ absl::StrFormat("ChromaU cfl=%d, sub_x=%d, sub_y=%d",
+ chroma_scaling_from_luma_, subsampling_x_, subsampling_y_)
+ .c_str(),
+ digest_chroma_u, dest_plane_u_, uv_stride_ * uv_height_, elapsed_time);
+ const char* digest_chroma_v = GetBlendChromaVTestDigest(
+ bitdepth, chroma_scaling_from_luma_, subsampling_x_, subsampling_y_);
+ test_utils::CheckMd5Digest(
+ "BlendNoiseWithImage",
+ absl::StrFormat("ChromaV cfl=%d, sub_x=%d, sub_y=%d",
+ chroma_scaling_from_luma_, subsampling_x_, subsampling_y_)
+ .c_str(),
+ digest_chroma_v, dest_plane_v_, uv_stride_ * uv_height_, elapsed_time);
+}
+
+using BlendNoiseTest8bpp = BlendNoiseTest<8, uint8_t>;
+
+TEST_P(BlendNoiseTest8bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(BlendNoiseTest8bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, BlendNoiseTest8bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, BlendNoiseTest8bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, BlendNoiseTest8bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using BlendNoiseTest10bpp = BlendNoiseTest<10, uint16_t>;
+
+TEST_P(BlendNoiseTest10bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(BlendNoiseTest10bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, BlendNoiseTest10bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, BlendNoiseTest10bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, BlendNoiseTest10bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using BlendNoiseTest12bpp = BlendNoiseTest<12, uint16_t>;
+
+TEST_P(BlendNoiseTest12bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(BlendNoiseTest12bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, BlendNoiseTest12bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+template <int bitdepth, typename Pixel>
+class FilmGrainSpeedTest : public testing::TestWithParam<int> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ ~FilmGrainSpeedTest() override = default;
+
+ protected:
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ FilmGrainInit_C();
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+ FilmGrainInit_NEON();
+#endif
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ FilmGrainInit_SSE4_1();
+ }
+ uv_width_ = (width_ + subsampling_x_) >> subsampling_x_;
+ uv_height_ = (height_ + subsampling_y_) >> subsampling_y_;
+ uv_stride_ = uv_width_ * sizeof(Pixel);
+ y_stride_ = width_ * sizeof(Pixel);
+ const size_t buffer_size =
+ sizeof(Pixel) * (width_ * height_ + 2 * uv_width_ * uv_height_);
+ source_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]);
+ memset(source_buffer_.get(), 0, sizeof(source_buffer_[0]) * buffer_size);
+ dest_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]);
+ memset(dest_buffer_.get(), 0, sizeof(dest_buffer_[0]) * buffer_size);
+ source_plane_y_ = source_buffer_.get();
+ source_plane_u_ = source_plane_y_ + y_stride_ * height_;
+ source_plane_v_ = source_plane_u_ + uv_stride_ * uv_height_;
+ dest_plane_y_ = dest_buffer_.get();
+ dest_plane_u_ = dest_plane_y_ + y_stride_ * height_;
+ dest_plane_v_ = dest_plane_u_ + uv_stride_ * uv_height_;
+ const int num_threads = GetParam();
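+    // ThreadPool::Create() is expected to return nullptr when |num_threads|
+    // is 0, which exercises the single-threaded path.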
+ thread_pool_ = ThreadPool::Create(num_threads);
+ }
+
+ void TestSpeed(int num_runs);
+
+ private:
+ const int width_ = 1920;
+ const int height_ = 1080;
+ const int subsampling_x_ = 1;
+ const int subsampling_y_ = 1;
+ int uv_width_ = 0;
+ int uv_height_ = 0;
+ int uv_stride_ = 0;
+ int y_stride_ = 0;
+ std::unique_ptr<uint8_t[]> source_buffer_;
+ std::unique_ptr<uint8_t[]> dest_buffer_;
+ const uint8_t* source_plane_y_ = nullptr;
+ const uint8_t* source_plane_u_ = nullptr;
+ const uint8_t* source_plane_v_ = nullptr;
+ uint8_t* dest_plane_y_ = nullptr;
+ uint8_t* dest_plane_u_ = nullptr;
+ uint8_t* dest_plane_v_ = nullptr;
+ std::unique_ptr<ThreadPool> thread_pool_;
+};
+
+// Each run of the speed test adds film grain noise to 10 dummy frames. The
+// film grain parameters for the 10 frames were generated with aomenc.
+template <int bitdepth, typename Pixel>
+void FilmGrainSpeedTest<bitdepth, Pixel>::TestSpeed(const int num_runs) {
+ const dsp::Dsp* dsp = GetDspTable(bitdepth);
+ if (dsp->film_grain.blend_noise_chroma[0] == nullptr ||
+ dsp->film_grain.blend_noise_luma == nullptr) {
+ return;
+ }
+ for (int k = 0; k < kNumFilmGrainTestParams; ++k) {
+ const FilmGrainParams& params = kFilmGrainParams[k];
+ const absl::Time start = absl::Now();
+ for (int i = 0; i < num_runs; ++i) {
+ FilmGrain<bitdepth> film_grain(params, /*is_monochrome=*/false,
+ /*color_matrix_is_identity=*/false,
+ subsampling_x_, subsampling_y_, width_,
+ height_, thread_pool_.get());
+ EXPECT_TRUE(film_grain.AddNoise(
+ source_plane_y_, y_stride_, source_plane_u_, source_plane_v_,
+ uv_stride_, dest_plane_y_, y_stride_, dest_plane_u_, dest_plane_v_,
+ uv_stride_));
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ const char* digest_luma = GetTestDigestLuma(bitdepth, k);
+ test_utils::CheckMd5Digest(
+ "FilmGrainSynthesisLuma",
+ absl::StrFormat("kFilmGrainParams[%d]", k).c_str(), digest_luma,
+ dest_plane_y_, y_stride_ * height_, elapsed_time);
+ const char* digest_chroma_u = GetTestDigestChromaU(bitdepth, k);
+ test_utils::CheckMd5Digest(
+ "FilmGrainSynthesisChromaU",
+ absl::StrFormat("kFilmGrainParams[%d]", k).c_str(), digest_chroma_u,
+ dest_plane_u_, uv_stride_ * uv_height_, elapsed_time);
+ const char* digest_chroma_v = GetTestDigestChromaV(bitdepth, k);
+ test_utils::CheckMd5Digest(
+ "FilmGrainSynthesisChromaV",
+ absl::StrFormat("kFilmGrainParams[%d]", k).c_str(), digest_chroma_v,
+ dest_plane_v_, uv_stride_ * uv_height_, elapsed_time);
+ }
+}
+
+using FilmGrainSpeedTest8bpp = FilmGrainSpeedTest<8, uint8_t>;
+
+TEST_P(FilmGrainSpeedTest8bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(FilmGrainSpeedTest8bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, FilmGrainSpeedTest8bpp, testing::Values(0, 3, 8));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, FilmGrainSpeedTest8bpp,
+ testing::Values(0, 3, 8));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, FilmGrainSpeedTest8bpp,
+ testing::Values(0, 3, 8));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using FilmGrainSpeedTest10bpp = FilmGrainSpeedTest<10, uint16_t>;
+
+TEST_P(FilmGrainSpeedTest10bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(FilmGrainSpeedTest10bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, FilmGrainSpeedTest10bpp, testing::Values(0, 3, 8));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, FilmGrainSpeedTest10bpp,
+ testing::Values(0, 3, 8));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, FilmGrainSpeedTest10bpp,
+ testing::Values(0, 3, 8));
+#endif
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using FilmGrainSpeedTest12bpp = FilmGrainSpeedTest<12, uint16_t>;
+
+TEST_P(FilmGrainSpeedTest12bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(FilmGrainSpeedTest12bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, FilmGrainSpeedTest12bpp, testing::Values(0, 3, 8));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+} // namespace film_grain
+} // namespace dsp
+} // namespace libgav1
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/frame_buffer.h"
+
+#include <cstdint>
+
+#include "src/frame_buffer_utils.h"
+#include "src/utils/common.h"
+
+extern "C" {
+
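+// Worked example (illustrative values, not mandated by the API): for 8bpp, a
+// 1920x1080 frame, 64-pixel borders on all sides, and stride_alignment 16:
+//   y_stride = Align(1920 + 64 + 64, 16) = 2048 bytes
+//   y_buffer_size = (1080 + 64 + 64) * 2048 + (16 - 1) = 2473999 bytes
+// The UV planes follow the same arithmetic after subsampling.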
+Libgav1StatusCode Libgav1ComputeFrameBufferInfo(
+ int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment, Libgav1FrameBufferInfo* info) {
+ switch (bitdepth) {
+ case 8:
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+#endif
+ break;
+ default:
+ return kLibgav1StatusInvalidArgument;
+ }
+ switch (image_format) {
+ case kLibgav1ImageFormatYuv420:
+ case kLibgav1ImageFormatYuv422:
+ case kLibgav1ImageFormatYuv444:
+ case kLibgav1ImageFormatMonochrome400:
+ break;
+ default:
+ return kLibgav1StatusInvalidArgument;
+ }
+  // All int arguments must be nonnegative. Each border must be a multiple of
+  // 2. |stride_alignment| must be a power of 2.
+ if ((width | height | left_border | right_border | top_border |
+ bottom_border | stride_alignment) < 0 ||
+ ((left_border | right_border | top_border | bottom_border) & 1) != 0 ||
+ (stride_alignment & (stride_alignment - 1)) != 0 || info == nullptr) {
+ return kLibgav1StatusInvalidArgument;
+ }
+
+ bool is_monochrome;
+ int8_t subsampling_x;
+ int8_t subsampling_y;
+ libgav1::DecomposeImageFormat(image_format, &is_monochrome, &subsampling_x,
+ &subsampling_y);
+
+ // Calculate y_stride (in bytes). It is padded to a multiple of
+ // |stride_alignment| bytes.
+ int y_stride = width + left_border + right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) y_stride *= sizeof(uint16_t);
+#endif
+ y_stride = libgav1::Align(y_stride, stride_alignment);
+ // Size of the Y buffer in bytes.
+ const uint64_t y_buffer_size =
+ (height + top_border + bottom_border) * static_cast<uint64_t>(y_stride) +
+ (stride_alignment - 1);
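+  // The extra (stride_alignment - 1) bytes give Libgav1SetFrameBuffer room to
+  // align the plane pointer with AlignAddr without overrunning the allocation.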
+
+ const int uv_width =
+ is_monochrome ? 0 : libgav1::SubsampledValue(width, subsampling_x);
+ const int uv_height =
+ is_monochrome ? 0 : libgav1::SubsampledValue(height, subsampling_y);
+ const int uv_left_border = is_monochrome ? 0 : left_border >> subsampling_x;
+ const int uv_right_border = is_monochrome ? 0 : right_border >> subsampling_x;
+ const int uv_top_border = is_monochrome ? 0 : top_border >> subsampling_y;
+ const int uv_bottom_border =
+ is_monochrome ? 0 : bottom_border >> subsampling_y;
+
+ // Calculate uv_stride (in bytes). It is padded to a multiple of
+ // |stride_alignment| bytes.
+ int uv_stride = uv_width + uv_left_border + uv_right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) uv_stride *= sizeof(uint16_t);
+#endif
+ uv_stride = libgav1::Align(uv_stride, stride_alignment);
+ // Size of the U or V buffer in bytes.
+ const uint64_t uv_buffer_size =
+ is_monochrome ? 0
+ : (uv_height + uv_top_border + uv_bottom_border) *
+ static_cast<uint64_t>(uv_stride) +
+ (stride_alignment - 1);
+
+ // Check if it is safe to cast y_buffer_size and uv_buffer_size to size_t.
+ if (y_buffer_size > SIZE_MAX || uv_buffer_size > SIZE_MAX) {
+ return kLibgav1StatusInvalidArgument;
+ }
+
+ int left_border_bytes = left_border;
+ int uv_left_border_bytes = uv_left_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) {
+ left_border_bytes *= sizeof(uint16_t);
+ uv_left_border_bytes *= sizeof(uint16_t);
+ }
+#endif
+
+ info->y_stride = y_stride;
+ info->uv_stride = uv_stride;
+ info->y_buffer_size = static_cast<size_t>(y_buffer_size);
+ info->uv_buffer_size = static_cast<size_t>(uv_buffer_size);
+ info->y_plane_offset = top_border * y_stride + left_border_bytes;
+ info->uv_plane_offset = uv_top_border * uv_stride + uv_left_border_bytes;
+ info->stride_alignment = stride_alignment;
+ return kLibgav1StatusOk;
+}
+
+Libgav1StatusCode Libgav1SetFrameBuffer(const Libgav1FrameBufferInfo* info,
+ uint8_t* y_buffer, uint8_t* u_buffer,
+ uint8_t* v_buffer,
+ void* buffer_private_data,
+ Libgav1FrameBuffer* frame_buffer) {
+ if (info == nullptr ||
+ (info->uv_buffer_size == 0 &&
+ (u_buffer != nullptr || v_buffer != nullptr)) ||
+ frame_buffer == nullptr) {
+ return kLibgav1StatusInvalidArgument;
+ }
+ if (y_buffer == nullptr || (info->uv_buffer_size != 0 &&
+ (u_buffer == nullptr || v_buffer == nullptr))) {
+ return kLibgav1StatusOutOfMemory;
+ }
+ frame_buffer->plane[0] = libgav1::AlignAddr(y_buffer + info->y_plane_offset,
+ info->stride_alignment);
+ frame_buffer->plane[1] = libgav1::AlignAddr(u_buffer + info->uv_plane_offset,
+ info->stride_alignment);
+ frame_buffer->plane[2] = libgav1::AlignAddr(v_buffer + info->uv_plane_offset,
+ info->stride_alignment);
+ frame_buffer->stride[0] = info->y_stride;
+ frame_buffer->stride[1] = frame_buffer->stride[2] = info->uv_stride;
+ frame_buffer->private_data = buffer_private_data;
+ return kLibgav1StatusOk;
+}
+
+} // extern "C"
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
+#define LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/gav1/decoder_buffer.h"
+
+namespace libgav1 {
+
+// The following table is from Section 6.4.2 of the spec.
+//
+// subsampling_x subsampling_y mono_chrome Description
+// -----------------------------------------------------------
+// 0 0 0 YUV 4:4:4
+// 1 0 0 YUV 4:2:2
+// 1 1 0 YUV 4:2:0
+// 1 1 1 Monochrome 4:0:0
+
+inline Libgav1ImageFormat ComposeImageFormat(bool is_monochrome,
+ int8_t subsampling_x,
+ int8_t subsampling_y) {
+ Libgav1ImageFormat image_format;
+ if (subsampling_x == 0) {
+ assert(subsampling_y == 0 && !is_monochrome);
+ image_format = kLibgav1ImageFormatYuv444;
+ } else if (subsampling_y == 0) {
+ assert(!is_monochrome);
+ image_format = kLibgav1ImageFormatYuv422;
+ } else if (!is_monochrome) {
+ image_format = kLibgav1ImageFormatYuv420;
+ } else {
+ image_format = kLibgav1ImageFormatMonochrome400;
+ }
+ return image_format;
+}
+
+inline void DecomposeImageFormat(Libgav1ImageFormat image_format,
+ bool* is_monochrome, int8_t* subsampling_x,
+ int8_t* subsampling_y) {
+ *is_monochrome = false;
+ *subsampling_x = 1;
+ *subsampling_y = 1;
+ switch (image_format) {
+ case kLibgav1ImageFormatYuv420:
+ break;
+ case kLibgav1ImageFormatYuv422:
+ *subsampling_y = 0;
+ break;
+ case kLibgav1ImageFormatYuv444:
+ *subsampling_x = *subsampling_y = 0;
+ break;
+ default:
+ assert(image_format == kLibgav1ImageFormatMonochrome400);
+ *is_monochrome = true;
+ break;
+ }
+}
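+
+// For example, DecomposeImageFormat(kLibgav1ImageFormatYuv422, ...) yields
+// is_monochrome = false, subsampling_x = 1, subsampling_y = 0 (the 4:2:2 row
+// of the table above), and ComposeImageFormat(false, 1, 0) maps back to
+// kLibgav1ImageFormatYuv422.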
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
+#define LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
+
+#include <array>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <cstdint>
+#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
+#include <new>
+#include <utility>
+
+#include "src/loop_restoration_info.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/threading_strategy.h"
+#include "src/tile_scratch_buffer.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/memory.h"
+#include "src/utils/stack.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// Buffer used to store the unfiltered pixels that are necessary for decoding
+// the next superblock row (for the intra prediction process).
+using IntraPredictionBuffer =
+ std::array<AlignedDynamicBuffer<uint8_t, kMaxAlignment>, kMaxPlanes>;
+
+// Buffer to facilitate decoding a frame. This struct is used only within
+// DecoderImpl::DecodeTiles().
+// The alignment requirement is due to the SymbolDecoderContext member
+// symbol_decoder_context and the TileScratchBufferPool member
+// tile_scratch_buffer_pool.
+struct FrameScratchBuffer : public MaxAlignedAllocable {
+ LoopRestorationInfo loop_restoration_info;
+ Array2D<int8_t> cdef_index;
+ // Encodes the block skip information as a bitmask for the entire frame which
+ // will be used by the cdef process.
+ //
+  // * The size of this array is rows4x4 / 2 * columns4x4 / 16.
+ // * Each row of the bitmasks array (cdef_skip) stores the bitmask for 2 rows
+ // of 4x4 blocks.
+ // * Each entry in the row will store the skip information for 16 4x4 blocks
+ // (8 bits).
+ // * If any of the four 4x4 blocks in the 8x8 block is not a skip block, then
+ // the corresponding bit (as described below) will be set to 1.
+ // * For the 4x4 block at column4x4 the bit index is (column4x4 >> 1).
+ Array2D<uint8_t> cdef_skip;
+ Array2D<TransformSize> inter_transform_sizes;
+ BlockParametersHolder block_parameters_holder;
+ TemporalMotionField motion_field;
+ SymbolDecoderContext symbol_decoder_context;
+ std::unique_ptr<ResidualBufferPool> residual_buffer_pool;
+ // Buffer used to store the cdef borders. This buffer will store 4 rows for
+ // every 64x64 block (4 rows for every 32x32 for chroma with subsampling). The
+ // indices of the rows that are stored are specified in |kCdefBorderRows|.
+ YuvBuffer cdef_border;
+ AlignedDynamicBuffer<uint8_t, 16> superres_coefficients[kNumPlaneTypes];
+ // Buffer used to temporarily store the input row for applying SuperRes.
+ YuvBuffer superres_line_buffer;
+ // Buffer used to store the loop restoration borders. This buffer will store 4
+ // rows for every 64x64 block (4 rows for every 32x32 for chroma with
+ // subsampling). The indices of the rows that are stored are specified in
+ // |kLoopRestorationBorderRows|.
+ YuvBuffer loop_restoration_border;
+ // The size of this dynamic buffer is |tile_rows|.
+ DynamicBuffer<IntraPredictionBuffer> intra_prediction_buffers;
+ TileScratchBufferPool tile_scratch_buffer_pool;
+ ThreadingStrategy threading_strategy;
+ std::mutex superblock_row_mutex;
+ // The size of this buffer is the number of superblock rows.
+ // |superblock_row_progress[i]| is incremented whenever a tile finishes
+ // decoding superblock row at index i. If the count reaches tile_columns, then
+ // |superblock_row_progress_condvar[i]| is notified.
+ DynamicBuffer<int> superblock_row_progress
+ LIBGAV1_GUARDED_BY(superblock_row_mutex);
+ // The size of this buffer is the number of superblock rows. Used to wait for
+ // |superblock_row_progress[i]| to reach tile_columns.
+ DynamicBuffer<std::condition_variable> superblock_row_progress_condvar;
+ // Used to signal tile decoding failure in the combined multithreading mode.
+ bool tile_decoding_failed LIBGAV1_GUARDED_BY(superblock_row_mutex);
+};
+
+class FrameScratchBufferPool {
+ public:
+ std::unique_ptr<FrameScratchBuffer> Get() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (!buffers_.Empty()) {
+ return buffers_.Pop();
+ }
+ lock.unlock();
+ std::unique_ptr<FrameScratchBuffer> scratch_buffer(new (std::nothrow)
+ FrameScratchBuffer);
+ return scratch_buffer;
+ }
+
+ void Release(std::unique_ptr<FrameScratchBuffer> scratch_buffer) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ buffers_.Push(std::move(scratch_buffer));
+ }
+
+ private:
+ std::mutex mutex_;
+ Stack<std::unique_ptr<FrameScratchBuffer>, kMaxThreads> buffers_
+ LIBGAV1_GUARDED_BY(mutex_);
+};
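+
+// A sketch of the intended get/release pattern (illustrative only):
+//
+//   FrameScratchBufferPool pool;
+//   std::unique_ptr<FrameScratchBuffer> scratch = pool.Get();
+//   if (scratch == nullptr) { /* allocation failed */ }
+//   // ... decode one frame using |scratch| ...
+//   pool.Release(std::move(scratch));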
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_H_
+#define LIBGAV1_SRC_GAV1_DECODER_H_
+
+#if defined(__cplusplus)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#else
+#include <stddef.h>
+#include <stdint.h>
+#endif // defined(__cplusplus)
+
+// IWYU pragma: begin_exports
+#include "gav1/decoder_buffer.h"
+#include "gav1/decoder_settings.h"
+#include "gav1/frame_buffer.h"
+#include "gav1/status_code.h"
+#include "gav1/symbol_visibility.h"
+#include "gav1/version.h"
+// IWYU pragma: end_exports
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct Libgav1Decoder;
+typedef struct Libgav1Decoder Libgav1Decoder;
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderCreate(
+ const Libgav1DecoderSettings* settings, Libgav1Decoder** decoder_out);
+
+LIBGAV1_PUBLIC void Libgav1DecoderDestroy(Libgav1Decoder* decoder);
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderEnqueueFrame(
+ Libgav1Decoder* decoder, const uint8_t* data, size_t size,
+ int64_t user_private_data, void* buffer_private_data);
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderDequeueFrame(
+ Libgav1Decoder* decoder, const Libgav1DecoderBuffer** out_ptr);
+
+LIBGAV1_PUBLIC Libgav1StatusCode
+Libgav1DecoderSignalEOS(Libgav1Decoder* decoder);
+
+LIBGAV1_PUBLIC int Libgav1DecoderGetMaxBitdepth(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+
+namespace libgav1 {
+
+// Forward declaration.
+class DecoderImpl;
+
+class LIBGAV1_PUBLIC Decoder {
+ public:
+ Decoder();
+ ~Decoder();
+
+ // Init must be called exactly once per instance. Subsequent calls will do
+ // nothing. If |settings| is nullptr, the decoder will be initialized with
+ // default settings. Returns kStatusOk on success, an error status otherwise.
+ StatusCode Init(const DecoderSettings* settings);
+
+ // Enqueues a compressed frame to be decoded.
+ //
+ // This function returns:
+ // * kStatusOk on success
+ // * kStatusTryAgain if the decoder queue is full
+ // * an error status otherwise.
+ //
+ // |user_private_data| may be used to associate application specific private
+ // data with the compressed frame. It will be copied to the user_private_data
+ // field of the DecoderBuffer returned by the corresponding |DequeueFrame()|
+ // call.
+ //
+ // NOTE: |EnqueueFrame()| does not copy the data. Therefore, after a
+ // successful |EnqueueFrame()| call, the caller must keep the |data| buffer
+ // alive until:
+  // 1) If |settings_.release_input_buffer| is not nullptr, the |data| buffer
+  //    must be kept alive until release_input_buffer is called with the
+  //    |buffer_private_data| passed into this EnqueueFrame call.
+  // 2) If |settings_.release_input_buffer| is nullptr, the |data| buffer must
+  //    be kept alive until the corresponding DequeueFrame() call completes.
+ //
+ // If the call to |EnqueueFrame()| is not successful, then libgav1 will not
+ // hold any references to the |data| buffer. |settings_.release_input_buffer|
+ // callback will not be called in that case.
+ StatusCode EnqueueFrame(const uint8_t* data, size_t size,
+ int64_t user_private_data, void* buffer_private_data);
+
+ // Dequeues a decompressed frame. If there are enqueued compressed frames,
+ // decodes one and sets |*out_ptr| to the last displayable frame in the
+ // compressed frame. If there are no displayable frames available, sets
+ // |*out_ptr| to nullptr.
+ //
+ // Returns kStatusOk on success. Returns kStatusNothingToDequeue if there are
+ // no enqueued frames (in this case out_ptr will always be set to nullptr).
+ // Returns one of the other error statuses if there is an error.
+ //
+ // If |settings_.blocking_dequeue| is false and the decoder is operating in
+ // frame parallel mode (|settings_.frame_parallel| is true and the video
+ // stream passes the decoder's heuristics for enabling frame parallel mode),
+ // then this call will return kStatusTryAgain if an enqueued frame is not yet
+  // decoded (it is a non-blocking call in this case). In all other cases, this
+ // call will block until an enqueued frame has been decoded.
+ StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
+
+ // Signals the end of stream.
+ //
+ // In non-frame-parallel mode, this function will release all the frames held
+ // by the decoder. If the frame buffers were allocated by libgav1, then the
+ // pointer obtained by the prior DequeueFrame call will no longer be valid. If
+ // the frame buffers were allocated by the application, then any references
+ // that libgav1 is holding on to will be released.
+ //
+ // Once this function returns successfully, the decoder state will be reset
+ // and the decoder is ready to start decoding a new coded video sequence.
+ StatusCode SignalEOS();
+
+ // Returns the maximum bitdepth that is supported by this decoder.
+ static int GetMaxBitdepth();
+
+ private:
+ DecoderSettings settings_;
+ // The object is initialized if and only if impl_ != nullptr.
+ std::unique_ptr<DecoderImpl> impl_;
+};
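+
+// A sketch of a minimal, non-frame-parallel decode loop. It is illustrative
+// only: GetNextCompressedFrame() and ConsumeFrame() stand in for
+// application-specific demuxing and rendering code and are not part of
+// libgav1.
+//
+//   libgav1::Decoder decoder;
+//   libgav1::DecoderSettings settings;
+//   if (decoder.Init(&settings) != libgav1::kStatusOk) return;
+//   const uint8_t* data;
+//   size_t size;
+//   while (GetNextCompressedFrame(&data, &size)) {
+//     if (decoder.EnqueueFrame(data, size, /*user_private_data=*/0,
+//                              /*buffer_private_data=*/nullptr) !=
+//         libgav1::kStatusOk) {
+//       break;
+//     }
+//     const libgav1::DecoderBuffer* buffer;
+//     if (decoder.DequeueFrame(&buffer) != libgav1::kStatusOk) break;
+//     // |data| must stay alive until this point (release_input_buffer is
+//     // nullptr here).
+//     if (buffer != nullptr) ConsumeFrame(buffer);
+//   }
+//   decoder.SignalEOS();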
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_DECODER_H_
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
+#define LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
+
+#if defined(__cplusplus)
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif // defined(__cplusplus)
+
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI.
+
+// The documentation for the enum values in this file can be found in Section
+// 6.4.2 of the AV1 spec.
+
+typedef enum Libgav1ChromaSamplePosition {
+ kLibgav1ChromaSamplePositionUnknown,
+ kLibgav1ChromaSamplePositionVertical,
+ kLibgav1ChromaSamplePositionColocated,
+ kLibgav1ChromaSamplePositionReserved
+} Libgav1ChromaSamplePosition;
+
+typedef enum Libgav1ImageFormat {
+ kLibgav1ImageFormatYuv420,
+ kLibgav1ImageFormatYuv422,
+ kLibgav1ImageFormatYuv444,
+ kLibgav1ImageFormatMonochrome400
+} Libgav1ImageFormat;
+
+typedef enum Libgav1ColorPrimary {
+ // 0 is reserved.
+ kLibgav1ColorPrimaryBt709 = 1,
+ kLibgav1ColorPrimaryUnspecified,
+ // 3 is reserved.
+ kLibgav1ColorPrimaryBt470M = 4,
+ kLibgav1ColorPrimaryBt470Bg,
+ kLibgav1ColorPrimaryBt601,
+ kLibgav1ColorPrimarySmpte240,
+ kLibgav1ColorPrimaryGenericFilm,
+ kLibgav1ColorPrimaryBt2020,
+ kLibgav1ColorPrimaryXyz,
+ kLibgav1ColorPrimarySmpte431,
+ kLibgav1ColorPrimarySmpte432,
+ // 13-21 are reserved.
+ kLibgav1ColorPrimaryEbu3213 = 22,
+ // 23-254 are reserved.
+ kLibgav1MaxColorPrimaries = 255
+} Libgav1ColorPrimary;
+
+typedef enum Libgav1TransferCharacteristics {
+ // 0 is reserved.
+ kLibgav1TransferCharacteristicsBt709 = 1,
+ kLibgav1TransferCharacteristicsUnspecified,
+ // 3 is reserved.
+ kLibgav1TransferCharacteristicsBt470M = 4,
+ kLibgav1TransferCharacteristicsBt470Bg,
+ kLibgav1TransferCharacteristicsBt601,
+ kLibgav1TransferCharacteristicsSmpte240,
+ kLibgav1TransferCharacteristicsLinear,
+ kLibgav1TransferCharacteristicsLog100,
+ kLibgav1TransferCharacteristicsLog100Sqrt10,
+ kLibgav1TransferCharacteristicsIec61966,
+ kLibgav1TransferCharacteristicsBt1361,
+ kLibgav1TransferCharacteristicsSrgb,
+ kLibgav1TransferCharacteristicsBt2020TenBit,
+ kLibgav1TransferCharacteristicsBt2020TwelveBit,
+ kLibgav1TransferCharacteristicsSmpte2084,
+ kLibgav1TransferCharacteristicsSmpte428,
+ kLibgav1TransferCharacteristicsHlg,
+ // 19-254 are reserved.
+ kLibgav1MaxTransferCharacteristics = 255
+} Libgav1TransferCharacteristics;
+
+typedef enum Libgav1MatrixCoefficients {
+ kLibgav1MatrixCoefficientsIdentity,
+ kLibgav1MatrixCoefficientsBt709,
+ kLibgav1MatrixCoefficientsUnspecified,
+ // 3 is reserved.
+ kLibgav1MatrixCoefficientsFcc = 4,
+ kLibgav1MatrixCoefficientsBt470BG,
+ kLibgav1MatrixCoefficientsBt601,
+ kLibgav1MatrixCoefficientsSmpte240,
+ kLibgav1MatrixCoefficientsSmpteYcgco,
+ kLibgav1MatrixCoefficientsBt2020Ncl,
+ kLibgav1MatrixCoefficientsBt2020Cl,
+ kLibgav1MatrixCoefficientsSmpte2085,
+ kLibgav1MatrixCoefficientsChromatNcl,
+ kLibgav1MatrixCoefficientsChromatCl,
+ kLibgav1MatrixCoefficientsIctcp,
+ // 15-254 are reserved.
+ kLibgav1MaxMatrixCoefficients = 255
+} Libgav1MatrixCoefficients;
+
+typedef enum Libgav1ColorRange {
+  // The color ranges are scaled by value << (bitdepth - 8) for 10-bit and
+  // 12-bit streams.
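+  // For example, a 10-bit studio-range stream spans Y [64..940] and
+  // UV [64..960], since 16 << 2 = 64, 235 << 2 = 940, and 240 << 2 = 960.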
+ kLibgav1ColorRangeStudio, // Y [16..235], UV [16..240]
+ kLibgav1ColorRangeFull // YUV/RGB [0..255]
+} Libgav1ColorRange;
+
+// Section 6.7.3.
+typedef struct Libgav1ObuMetadataHdrCll { // NOLINT
+ uint16_t max_cll; // Maximum content light level.
+ uint16_t max_fall; // Maximum frame-average light level.
+} Libgav1ObuMetadataHdrCll;
+
+// Section 6.7.4.
+typedef struct Libgav1ObuMetadataHdrMdcv { // NOLINT
+ // 0.16 fixed-point X/Y chromaticity coordinate as defined by CIE 1931 in
+ // R/G/B order.
+ uint16_t primary_chromaticity_x[3];
+ uint16_t primary_chromaticity_y[3];
+ // 0.16 fixed-point X/Y chromaticity coordinate as defined by CIE 1931.
+ uint16_t white_point_chromaticity_x;
+ uint16_t white_point_chromaticity_y;
+ // 24.8 fixed-point maximum luminance, represented in candelas per square
+ // meter.
+ uint32_t luminance_max;
+ // 18.14 fixed-point minimum luminance, represented in candelas per square
+ // meter.
+ uint32_t luminance_min;
+} Libgav1ObuMetadataHdrMdcv;
+
+// Section 6.7.2.
+typedef struct Libgav1ObuMetadataItutT35 { // NOLINT
+ uint8_t country_code;
+ uint8_t country_code_extension_byte; // Valid if country_code is 0xFF.
+ uint8_t* payload_bytes;
+ int payload_size;
+} Libgav1ObuMetadataItutT35;
+
+typedef struct Libgav1DecoderBuffer {
+#if defined(__cplusplus)
+ LIBGAV1_PUBLIC int NumPlanes() const {
+ return (image_format == kLibgav1ImageFormatMonochrome400) ? 1 : 3;
+ }
+#endif // defined(__cplusplus)
+
+ Libgav1ChromaSamplePosition chroma_sample_position;
+ Libgav1ImageFormat image_format;
+ Libgav1ColorRange color_range;
+ Libgav1ColorPrimary color_primary;
+ Libgav1TransferCharacteristics transfer_characteristics;
+ Libgav1MatrixCoefficients matrix_coefficients;
+
+ int bitdepth; // Stored image bitdepth.
+
+ // Image display dimensions in Y/U/V order.
+ int displayed_width[3]; // Displayed image width.
+ int displayed_height[3]; // Displayed image height.
+
+ // Values are given in Y/U/V order.
+ int stride[3]; // The width in bytes of one row of the |plane| buffer.
+ // This may include padding bytes for alignment or
+ // internal use by the decoder.
+ uint8_t* plane[3]; // The reconstructed image plane(s).
+
+ // Spatial id of this frame.
+ int spatial_id;
+ // Temporal id of this frame.
+ int temporal_id;
+
+ Libgav1ObuMetadataHdrCll hdr_cll;
+ int has_hdr_cll; // 1 if the values in hdr_cll are valid for this frame. 0
+ // otherwise.
+
+ Libgav1ObuMetadataHdrMdcv hdr_mdcv;
+ int has_hdr_mdcv; // 1 if the values in hdr_mdcv are valid for this frame. 0
+ // otherwise.
+
+ Libgav1ObuMetadataItutT35 itut_t35;
+ int has_itut_t35; // 1 if the values in itut_t35 are valid for this frame. 0
+ // otherwise.
+
+ // The |user_private_data| argument passed to Decoder::EnqueueFrame().
+ int64_t user_private_data;
+ // The |private_data| field of FrameBuffer. Set by the get frame buffer
+ // callback when it allocates a frame buffer.
+ void* buffer_private_data;
+} Libgav1DecoderBuffer;
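+
+// For example, row |r| of the Y plane of a dequeued buffer starts at
+// buffer->plane[0] + r * buffer->stride[0]. Strides are in bytes, so for
+// bitdepth > 8 a row should be reinterpreted as uint16_t pixels.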
+
+#if defined(__cplusplus)
+namespace libgav1 {
+
+using ChromaSamplePosition = Libgav1ChromaSamplePosition;
+constexpr ChromaSamplePosition kChromaSamplePositionUnknown =
+ kLibgav1ChromaSamplePositionUnknown;
+constexpr ChromaSamplePosition kChromaSamplePositionVertical =
+ kLibgav1ChromaSamplePositionVertical;
+constexpr ChromaSamplePosition kChromaSamplePositionColocated =
+ kLibgav1ChromaSamplePositionColocated;
+constexpr ChromaSamplePosition kChromaSamplePositionReserved =
+ kLibgav1ChromaSamplePositionReserved;
+
+using ImageFormat = Libgav1ImageFormat;
+constexpr ImageFormat kImageFormatYuv420 = kLibgav1ImageFormatYuv420;
+constexpr ImageFormat kImageFormatYuv422 = kLibgav1ImageFormatYuv422;
+constexpr ImageFormat kImageFormatYuv444 = kLibgav1ImageFormatYuv444;
+constexpr ImageFormat kImageFormatMonochrome400 =
+ kLibgav1ImageFormatMonochrome400;
+
+using ColorPrimary = Libgav1ColorPrimary;
+constexpr ColorPrimary kColorPrimaryBt709 = kLibgav1ColorPrimaryBt709;
+constexpr ColorPrimary kColorPrimaryUnspecified =
+ kLibgav1ColorPrimaryUnspecified;
+constexpr ColorPrimary kColorPrimaryBt470M = kLibgav1ColorPrimaryBt470M;
+constexpr ColorPrimary kColorPrimaryBt470Bg = kLibgav1ColorPrimaryBt470Bg;
+constexpr ColorPrimary kColorPrimaryBt601 = kLibgav1ColorPrimaryBt601;
+constexpr ColorPrimary kColorPrimarySmpte240 = kLibgav1ColorPrimarySmpte240;
+constexpr ColorPrimary kColorPrimaryGenericFilm =
+ kLibgav1ColorPrimaryGenericFilm;
+constexpr ColorPrimary kColorPrimaryBt2020 = kLibgav1ColorPrimaryBt2020;
+constexpr ColorPrimary kColorPrimaryXyz = kLibgav1ColorPrimaryXyz;
+constexpr ColorPrimary kColorPrimarySmpte431 = kLibgav1ColorPrimarySmpte431;
+constexpr ColorPrimary kColorPrimarySmpte432 = kLibgav1ColorPrimarySmpte432;
+constexpr ColorPrimary kColorPrimaryEbu3213 = kLibgav1ColorPrimaryEbu3213;
+constexpr ColorPrimary kMaxColorPrimaries = kLibgav1MaxColorPrimaries;
+
+using TransferCharacteristics = Libgav1TransferCharacteristics;
+constexpr TransferCharacteristics kTransferCharacteristicsBt709 =
+ kLibgav1TransferCharacteristicsBt709;
+constexpr TransferCharacteristics kTransferCharacteristicsUnspecified =
+ kLibgav1TransferCharacteristicsUnspecified;
+constexpr TransferCharacteristics kTransferCharacteristicsBt470M =
+ kLibgav1TransferCharacteristicsBt470M;
+constexpr TransferCharacteristics kTransferCharacteristicsBt470Bg =
+ kLibgav1TransferCharacteristicsBt470Bg;
+constexpr TransferCharacteristics kTransferCharacteristicsBt601 =
+ kLibgav1TransferCharacteristicsBt601;
+constexpr TransferCharacteristics kTransferCharacteristicsSmpte240 =
+ kLibgav1TransferCharacteristicsSmpte240;
+constexpr TransferCharacteristics kTransferCharacteristicsLinear =
+ kLibgav1TransferCharacteristicsLinear;
+constexpr TransferCharacteristics kTransferCharacteristicsLog100 =
+ kLibgav1TransferCharacteristicsLog100;
+constexpr TransferCharacteristics kTransferCharacteristicsLog100Sqrt10 =
+ kLibgav1TransferCharacteristicsLog100Sqrt10;
+constexpr TransferCharacteristics kTransferCharacteristicsIec61966 =
+ kLibgav1TransferCharacteristicsIec61966;
+constexpr TransferCharacteristics kTransferCharacteristicsBt1361 =
+ kLibgav1TransferCharacteristicsBt1361;
+constexpr TransferCharacteristics kTransferCharacteristicsSrgb =
+ kLibgav1TransferCharacteristicsSrgb;
+constexpr TransferCharacteristics kTransferCharacteristicsBt2020TenBit =
+ kLibgav1TransferCharacteristicsBt2020TenBit;
+constexpr TransferCharacteristics kTransferCharacteristicsBt2020TwelveBit =
+ kLibgav1TransferCharacteristicsBt2020TwelveBit;
+constexpr TransferCharacteristics kTransferCharacteristicsSmpte2084 =
+ kLibgav1TransferCharacteristicsSmpte2084;
+constexpr TransferCharacteristics kTransferCharacteristicsSmpte428 =
+ kLibgav1TransferCharacteristicsSmpte428;
+constexpr TransferCharacteristics kTransferCharacteristicsHlg =
+ kLibgav1TransferCharacteristicsHlg;
+constexpr TransferCharacteristics kMaxTransferCharacteristics =
+ kLibgav1MaxTransferCharacteristics;
+
+using MatrixCoefficients = Libgav1MatrixCoefficients;
+constexpr MatrixCoefficients kMatrixCoefficientsIdentity =
+ kLibgav1MatrixCoefficientsIdentity;
+constexpr MatrixCoefficients kMatrixCoefficientsBt709 =
+ kLibgav1MatrixCoefficientsBt709;
+constexpr MatrixCoefficients kMatrixCoefficientsUnspecified =
+ kLibgav1MatrixCoefficientsUnspecified;
+constexpr MatrixCoefficients kMatrixCoefficientsFcc =
+ kLibgav1MatrixCoefficientsFcc;
+constexpr MatrixCoefficients kMatrixCoefficientsBt470BG =
+ kLibgav1MatrixCoefficientsBt470BG;
+constexpr MatrixCoefficients kMatrixCoefficientsBt601 =
+ kLibgav1MatrixCoefficientsBt601;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpte240 =
+ kLibgav1MatrixCoefficientsSmpte240;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpteYcgco =
+ kLibgav1MatrixCoefficientsSmpteYcgco;
+constexpr MatrixCoefficients kMatrixCoefficientsBt2020Ncl =
+ kLibgav1MatrixCoefficientsBt2020Ncl;
+constexpr MatrixCoefficients kMatrixCoefficientsBt2020Cl =
+ kLibgav1MatrixCoefficientsBt2020Cl;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpte2085 =
+ kLibgav1MatrixCoefficientsSmpte2085;
+constexpr MatrixCoefficients kMatrixCoefficientsChromatNcl =
+ kLibgav1MatrixCoefficientsChromatNcl;
+constexpr MatrixCoefficients kMatrixCoefficientsChromatCl =
+ kLibgav1MatrixCoefficientsChromatCl;
+constexpr MatrixCoefficients kMatrixCoefficientsIctcp =
+ kLibgav1MatrixCoefficientsIctcp;
+constexpr MatrixCoefficients kMaxMatrixCoefficients =
+ kLibgav1MaxMatrixCoefficients;
+
+using ColorRange = Libgav1ColorRange;
+constexpr ColorRange kColorRangeStudio = kLibgav1ColorRangeStudio;
+constexpr ColorRange kColorRangeFull = kLibgav1ColorRangeFull;
+
+using ObuMetadataHdrCll = Libgav1ObuMetadataHdrCll;
+using ObuMetadataHdrMdcv = Libgav1ObuMetadataHdrMdcv;
+using ObuMetadataItutT35 = Libgav1ObuMetadataItutT35;
+
+using DecoderBuffer = Libgav1DecoderBuffer;
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
+#define LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
+
+#if defined(__cplusplus)
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif // defined(__cplusplus)
+
+#include "gav1/frame_buffer.h"
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI.
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// This callback is invoked by the decoder when it is done using an input frame
+// buffer. When frame_parallel is set to true, this callback must not be
+// nullptr. Otherwise, this callback is optional.
+//
+// |buffer_private_data| is the value passed in the EnqueueFrame() call.
+typedef void (*Libgav1ReleaseInputBufferCallback)(void* callback_private_data,
+ void* buffer_private_data);
+
+typedef struct Libgav1DecoderSettings {
+ // Number of threads to use when decoding. Must be greater than 0. The library
+ // will create at most |threads| new threads. Defaults to 1 (no new threads
+ // will be created).
+ int threads;
+  // A boolean. Indicates to the decoder that frame parallel decoding is
+  // allowed.
+ // Note that this is just a request and the decoder will decide the number of
+ // frames to be decoded in parallel based on the video stream being decoded.
+ int frame_parallel;
+  // A boolean. In frame parallel mode, whether Libgav1DecoderDequeueFrame
+  // should wait until an enqueued frame is available for dequeueing.
+ //
+ // If frame_parallel is 0, this setting is ignored.
+ int blocking_dequeue;
+ // Called when the first sequence header or a sequence header with a
+ // different frame size (which includes bitdepth, monochrome, subsampling_x,
+ // subsampling_y, maximum frame width, or maximum frame height) is received.
+ Libgav1FrameBufferSizeChangedCallback on_frame_buffer_size_changed;
+ // Get frame buffer callback.
+ Libgav1GetFrameBufferCallback get_frame_buffer;
+ // Release frame buffer callback.
+ Libgav1ReleaseFrameBufferCallback release_frame_buffer;
+ // Release input frame buffer callback. This callback must be set when
+ // |frame_parallel| is true.
+ Libgav1ReleaseInputBufferCallback release_input_buffer;
+ // Passed as the private_data argument to the callbacks.
+ void* callback_private_data;
+ // A boolean. If set to 1, the decoder will output all the spatial and
+ // temporal layers.
+ int output_all_layers;
+ // Index of the operating point to decode.
+ int operating_point;
+ // Mask indicating the post processing filters that need to be applied to the
+ // reconstructed frame. Note this is an advanced setting and does not
+ // typically need to be changed.
+ // From LSB:
+ // Bit 0: Loop filter (deblocking filter).
+ // Bit 1: Cdef.
+ // Bit 2: SuperRes.
+ // Bit 3: Loop restoration.
+ // Bit 4: Film grain synthesis.
+ // All the bits other than the last 5 are ignored.
+ uint8_t post_filter_mask;
+} Libgav1DecoderSettings;
+
+LIBGAV1_PUBLIC void Libgav1DecoderSettingsInitDefault(
+ Libgav1DecoderSettings* settings);
+
+#if defined(__cplusplus)
+} // extern "C"
+
+namespace libgav1 {
+
+using ReleaseInputBufferCallback = Libgav1ReleaseInputBufferCallback;
+
+// Applications must populate this structure before creating a decoder instance.
+struct DecoderSettings {
+ // Number of threads to use when decoding. Must be greater than 0. The library
+ // will create at most |threads| new threads. Defaults to 1 (no new threads
+ // will be created).
+ int threads = 1;
+ // Indicate to the decoder that frame parallel decoding is allowed. Note that
+ // this is just a request and the decoder will decide the number of frames to
+ // be decoded in parallel based on the video stream being decoded.
+ bool frame_parallel = false;
+  // In frame parallel mode, whether DequeueFrame should wait until an
+  // enqueued frame is available for dequeueing.
+ //
+ // If frame_parallel is false, this setting is ignored.
+ bool blocking_dequeue = false;
+ // Called when the first sequence header or a sequence header with a
+ // different frame size (which includes bitdepth, monochrome, subsampling_x,
+ // subsampling_y, maximum frame width, or maximum frame height) is received.
+ FrameBufferSizeChangedCallback on_frame_buffer_size_changed = nullptr;
+ // Get frame buffer callback.
+ GetFrameBufferCallback get_frame_buffer = nullptr;
+ // Release frame buffer callback.
+ ReleaseFrameBufferCallback release_frame_buffer = nullptr;
+ // Release input frame buffer callback. This callback must be set when
+ // |frame_parallel| is true.
+ ReleaseInputBufferCallback release_input_buffer = nullptr;
+ // Passed as the private_data argument to the callbacks.
+ void* callback_private_data = nullptr;
+ // If set to true, the decoder will output all the spatial and temporal
+ // layers.
+ bool output_all_layers = false;
+ // Index of the operating point to decode.
+ int operating_point = 0;
+ // Mask indicating the post processing filters that need to be applied to the
+ // reconstructed frame. Note this is an advanced setting and does not
+ // typically need to be changed.
+ // From LSB:
+ // Bit 0: Loop filter (deblocking filter).
+ // Bit 1: Cdef.
+ // Bit 2: SuperRes.
+ // Bit 3: Loop restoration.
+ // Bit 4: Film grain synthesis.
+ // All the bits other than the last 5 are ignored.
+ uint8_t post_filter_mask = 0x1f;
+};
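+
+// For example, a hypothetical application that wants to skip film grain
+// synthesis (bit 4) while keeping the other post filters could use:
+//
+//   libgav1::DecoderSettings settings;
+//   settings.post_filter_mask = 0x0f;  // Default 0x1f with bit 4 cleared.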
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+#endif // LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
+#define LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
+
+// All the declarations in this file are part of the public ABI. This file may
+// be included by both C and C++ files.
+
+#if defined(__cplusplus)
+#include <cstddef>
+#include <cstdint>
+#else
+#include <stddef.h>
+#include <stdint.h>
+#endif // defined(__cplusplus)
+
+#include "gav1/decoder_buffer.h"
+#include "gav1/status_code.h"
+#include "gav1/symbol_visibility.h"
+
+// The callback functions use the C linkage conventions.
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// This structure represents an allocated frame buffer.
+typedef struct Libgav1FrameBuffer {
+ // In the |plane| and |stride| arrays, the elements at indexes 0, 1, and 2
+ // are for the Y, U, and V planes, respectively.
+ uint8_t* plane[3]; // Pointers to the frame (excluding the borders) in the
+ // data buffers.
+ int stride[3]; // Row strides in bytes.
+ void* private_data; // Frame buffer's private data. Available for use by the
+ // release frame buffer callback. Also copied to the
+ // |buffer_private_data| field of DecoderBuffer for use
+ // by the consumer of a DecoderBuffer.
+} Libgav1FrameBuffer;
+
+// This callback is invoked by the decoder to provide information on the
+// subsequent frames in the video, until the next invocation of this callback
+// or the end of the video.
+//
+// |width| and |height| are the maximum frame width and height in pixels.
+// |left_border|, |right_border|, |top_border|, and |bottom_border| are the
+// maximum left, right, top, and bottom border sizes in pixels.
+// |stride_alignment| specifies the alignment of the row stride in bytes.
+//
+// Returns kLibgav1StatusOk on success, an error status on failure.
+//
+// NOTE: This callback may be omitted if the information is not useful to the
+// application.
+typedef Libgav1StatusCode (*Libgav1FrameBufferSizeChangedCallback)(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment);
+
+// This callback is invoked by the decoder to allocate a frame buffer, which
+// consists of three data buffers, for the Y, U, and V planes, respectively.
+//
+// The callback must set |frame_buffer->plane[i]| to point to the data buffers
+// of the planes, and set |frame_buffer->stride[i]| to the row strides of the
+// planes. If |image_format| is kLibgav1ImageFormatMonochrome400, the callback
+// should set |frame_buffer->plane[1]| and |frame_buffer->plane[2]| to a null
+// pointer and set |frame_buffer->stride[1]| and |frame_buffer->stride[2]| to
+// 0. The callback may set |frame_buffer->private_data| to a value that will
+// be useful to the release frame buffer callback and the consumer of a
+// DecoderBuffer.
+//
+// Returns kLibgav1StatusOk on success, an error status on failure.
+//
+// |width| and |height| are the frame width and height in pixels.
+// |left_border|, |right_border|, |top_border|, and |bottom_border| are the
+// left, right, top, and bottom border sizes in pixels. |stride_alignment|
+// specifies the alignment of the row stride in bytes.
+typedef Libgav1StatusCode (*Libgav1GetFrameBufferCallback)(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer);
+
+// After a frame buffer is allocated, the decoder starts to write decoded video
+// to the frame buffer. When the frame buffer is ready for consumption, it is
+// made available to the application in a Decoder::DequeueFrame() call.
+// Afterwards, the decoder may continue to use the frame buffer in read-only
+// mode. When the decoder is finished using the frame buffer, it notifies the
+// application by calling the Libgav1ReleaseFrameBufferCallback.
+
+// This callback is invoked by the decoder to release a frame buffer.
+typedef void (*Libgav1ReleaseFrameBufferCallback)(void* callback_private_data,
+ void* buffer_private_data);
+
+// Libgav1ComputeFrameBufferInfo() and Libgav1SetFrameBuffer() are intended to
+// help clients implement frame buffer callbacks using memory buffers. First,
+// call Libgav1ComputeFrameBufferInfo(). If it succeeds, allocate y_buffer of
+// size info.y_buffer_size and allocate u_buffer and v_buffer, both of size
+// info.uv_buffer_size. Finally, pass y_buffer, u_buffer, v_buffer, and
+// buffer_private_data to Libgav1SetFrameBuffer().
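+//
+// A sketch of that sequence with plain malloc() (illustrative only; a real
+// implementation must also free the buffers from its release frame buffer
+// callback, typically by tracking them through |buffer_private_data|):
+//
+//   Libgav1FrameBufferInfo info;
+//   Libgav1StatusCode status = Libgav1ComputeFrameBufferInfo(
+//       bitdepth, image_format, width, height, left_border, right_border,
+//       top_border, bottom_border, stride_alignment, &info);
+//   if (status != kLibgav1StatusOk) return status;
+//   uint8_t* const y_buffer = (uint8_t*)malloc(info.y_buffer_size);
+//   uint8_t* u_buffer = NULL;
+//   uint8_t* v_buffer = NULL;
+//   if (info.uv_buffer_size != 0) {
+//     u_buffer = (uint8_t*)malloc(info.uv_buffer_size);
+//     v_buffer = (uint8_t*)malloc(info.uv_buffer_size);
+//   }
+//   // Libgav1SetFrameBuffer() reports kLibgav1StatusOutOfMemory if any of
+//   // the required allocations above failed.
+//   return Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer,
+//                                buffer_private_data, frame_buffer);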
+
+// This structure contains information useful for allocating memory for a frame
+// buffer.
+typedef struct Libgav1FrameBufferInfo {
+ size_t y_buffer_size; // Size in bytes of the Y buffer.
+ size_t uv_buffer_size; // Size in bytes of the U or V buffer.
+
+ // The following fields are consumed by Libgav1SetFrameBuffer(). Do not use
+ // them directly.
+ int y_stride; // Row stride in bytes of the Y buffer.
+ int uv_stride; // Row stride in bytes of the U or V buffer.
+ size_t y_plane_offset; // Offset in bytes of the frame (excluding the
+ // borders) in the Y buffer.
+ size_t uv_plane_offset; // Offset in bytes of the frame (excluding the
+ // borders) in the U or V buffer.
+ int stride_alignment; // The stride_alignment argument passed to
+ // Libgav1ComputeFrameBufferInfo().
+} Libgav1FrameBufferInfo;
+
+// Computes the information useful for allocating memory for a frame buffer.
+// On success, stores the output in |info|.
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1ComputeFrameBufferInfo(
+ int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment, Libgav1FrameBufferInfo* info);
+
+// Sets the |frame_buffer| struct.
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1SetFrameBuffer(
+ const Libgav1FrameBufferInfo* info, uint8_t* y_buffer, uint8_t* u_buffer,
+ uint8_t* v_buffer, void* buffer_private_data,
+ Libgav1FrameBuffer* frame_buffer);
+
+#if defined(__cplusplus)
+} // extern "C"
+
+// Declare type aliases for C++.
+namespace libgav1 {
+
+using FrameBuffer = Libgav1FrameBuffer;
+using FrameBufferSizeChangedCallback = Libgav1FrameBufferSizeChangedCallback;
+using GetFrameBufferCallback = Libgav1GetFrameBufferCallback;
+using ReleaseFrameBufferCallback = Libgav1ReleaseFrameBufferCallback;
+using FrameBufferInfo = Libgav1FrameBufferInfo;
+
+inline StatusCode ComputeFrameBufferInfo(int bitdepth, ImageFormat image_format,
+ int width, int height, int left_border,
+ int right_border, int top_border,
+ int bottom_border,
+ int stride_alignment,
+ FrameBufferInfo* info) {
+ return Libgav1ComputeFrameBufferInfo(bitdepth, image_format, width, height,
+ left_border, right_border, top_border,
+ bottom_border, stride_alignment, info);
+}
+
+inline StatusCode SetFrameBuffer(const FrameBufferInfo* info, uint8_t* y_buffer,
+ uint8_t* u_buffer, uint8_t* v_buffer,
+ void* buffer_private_data,
+ FrameBuffer* frame_buffer) {
+ return Libgav1SetFrameBuffer(info, y_buffer, u_buffer, v_buffer,
+ buffer_private_data, frame_buffer);
+}
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_STATUS_CODE_H_
+#define LIBGAV1_SRC_GAV1_STATUS_CODE_H_
+
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI. This file may
+// be included by both C and C++ files.
+
+// The Libgav1StatusCode enum type: A libgav1 function may return
+// Libgav1StatusCode to indicate success or the reason for failure.
+typedef enum {
+ // Success.
+ kLibgav1StatusOk = 0,
+
+ // An unknown error. Used as the default error status if error detail is not
+ // available.
+ kLibgav1StatusUnknownError = -1,
+
+ // An invalid function argument.
+ kLibgav1StatusInvalidArgument = -2,
+
+ // Memory allocation failure.
+ kLibgav1StatusOutOfMemory = -3,
+
+ // Ran out of a resource (other than memory).
+ kLibgav1StatusResourceExhausted = -4,
+
+ // The object is not initialized.
+ kLibgav1StatusNotInitialized = -5,
+
+ // An operation that can only be performed once has already been performed.
+ kLibgav1StatusAlready = -6,
+
+ // Not implemented, or not supported.
+ kLibgav1StatusUnimplemented = -7,
+
+ // An internal error in libgav1. Usually this indicates a programming error.
+ kLibgav1StatusInternalError = -8,
+
+ // The bitstream is not encoded correctly or violates a bitstream conformance
+ // requirement.
+ kLibgav1StatusBitstreamError = -9,
+
+ // The operation is not allowed at the moment. This is not a fatal error. Try
+ // again later.
+ kLibgav1StatusTryAgain = -10,
+
+ // Used only by DequeueFrame(). There are no enqueued frames, so there is
+ // nothing to dequeue. This is not a fatal error. Try enqueuing a frame before
+ // trying to dequeue again.
+ kLibgav1StatusNothingToDequeue = -11,
+
+ // An extra enumerator to prevent people from writing code that fails to
+ // compile when a new status code is added.
+ //
+ // Do not reference this enumerator. In particular, if you write code that
+ // switches on Libgav1StatusCode, add a default: case instead of a case that
+ // mentions this enumerator.
+ //
+ // Do not depend on the value (currently -1000) listed here. It may change in
+ // the future.
+ kLibgav1StatusReservedForFutureExpansionUseDefaultInSwitchInstead_ = -1000
+} Libgav1StatusCode;
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Returns a human readable error string in en-US for the status code |status|.
+// Always returns a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetErrorString(Libgav1StatusCode status);
+
+#if defined(__cplusplus)
+} // extern "C"
+
+namespace libgav1 {
+
+// Declare type aliases for C++.
+using StatusCode = Libgav1StatusCode;
+constexpr StatusCode kStatusOk = kLibgav1StatusOk;
+constexpr StatusCode kStatusUnknownError = kLibgav1StatusUnknownError;
+constexpr StatusCode kStatusInvalidArgument = kLibgav1StatusInvalidArgument;
+constexpr StatusCode kStatusOutOfMemory = kLibgav1StatusOutOfMemory;
+constexpr StatusCode kStatusResourceExhausted = kLibgav1StatusResourceExhausted;
+constexpr StatusCode kStatusNotInitialized = kLibgav1StatusNotInitialized;
+constexpr StatusCode kStatusAlready = kLibgav1StatusAlready;
+constexpr StatusCode kStatusUnimplemented = kLibgav1StatusUnimplemented;
+constexpr StatusCode kStatusInternalError = kLibgav1StatusInternalError;
+constexpr StatusCode kStatusBitstreamError = kLibgav1StatusBitstreamError;
+constexpr StatusCode kStatusTryAgain = kLibgav1StatusTryAgain;
+constexpr StatusCode kStatusNothingToDequeue = kLibgav1StatusNothingToDequeue;
+
+// Returns a human readable error string in en-US for the status code |status|.
+// Always returns a valid (non-NULL) string.
+inline const char* GetErrorString(StatusCode status) {
+ return Libgav1GetErrorString(status);
+}
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_STATUS_CODE_H_
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
+#define LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
+
+// This module defines the LIBGAV1_PUBLIC macro. LIBGAV1_PUBLIC, when combined
+// with the flags -fvisibility=hidden and -fvisibility-inlines-hidden, restricts
+// symbol availability when users use the shared object form of libgav1. The
+// intent is to prevent exposure of libgav1 internals to users of the library,
+// and to avoid ABI compatibility problems that changes to libgav1 internals
+// would cause for users of the libgav1 shared object.
+//
+// Examples:
+//
+// This form makes a class and all of its members part of the public API:
+//
+// class LIBGAV1_PUBLIC A {
+// public:
+// A();
+// ~A();
+// void Foo();
+// int Bar();
+// };
+//
+// A::A(), A::~A(), A::Foo(), and A::Bar() are all available to code linking to
+// the shared object when this form is used.
+//
+// This form exposes a single class method as part of the public API:
+//
+// class B {
+// public:
+// B();
+// ~B();
+// LIBGAV1_PUBLIC int Foo();
+// };
+//
+// In this example only B::Foo() is available to the user of the shared object.
+//
+// Non-class member functions can also be exposed individually:
+//
+// LIBGAV1_PUBLIC void Bar();
+//
+// In this example Bar() would be available to users of the shared object.
+//
+// Much of the above information and more can be found at
+// https://gcc.gnu.org/wiki/Visibility
+//
+// NOTE: A third-party build system for libgav1 can add -DLIBGAV1_PUBLIC= to the
+// compiler command line to override the definition of LIBGAV1_PUBLIC in this
+// header. This can be used to create a libgav1 static library that will not
+// export any symbols when it is linked into a shared library.
+
+#if !defined(LIBGAV1_PUBLIC)
+#if defined(_WIN32)
+#if defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
+#if defined(__GNUC__)
+#define LIBGAV1_PUBLIC __attribute__((dllexport))
+#else
+#define LIBGAV1_PUBLIC __declspec(dllexport)
+#endif // defined(__GNUC__)
+#elif defined(LIBGAV1_BUILDING_DLL)
+#ifdef __GNUC__
+#define LIBGAV1_PUBLIC __attribute__((dllimport))
+#else
+#define LIBGAV1_PUBLIC __declspec(dllimport)
+#endif // defined(__GNUC__)
+#else
+#define LIBGAV1_PUBLIC
+#endif // defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
+#else // !defined(_WIN32)
+#if defined(__GNUC__) && __GNUC__ >= 4
+#define LIBGAV1_PUBLIC __attribute__((visibility("default")))
+#else
+#define LIBGAV1_PUBLIC
+#endif
+#endif // defined(_WIN32)
+#endif // defined(LIBGAV1_PUBLIC)
+
+#endif // LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_VERSION_H_
+#define LIBGAV1_SRC_GAV1_VERSION_H_
+
+#include "gav1/symbol_visibility.h"
+
+// This library follows the principles described by Semantic Versioning
+// (https://semver.org).
+
+#define LIBGAV1_MAJOR_VERSION 0
+#define LIBGAV1_MINOR_VERSION 19
+#define LIBGAV1_PATCH_VERSION 0
+
+#define LIBGAV1_VERSION \
+ ((LIBGAV1_MAJOR_VERSION << 16) | (LIBGAV1_MINOR_VERSION << 8) | \
+ LIBGAV1_PATCH_VERSION)
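+
+// For example, with the values above (0.19.0), LIBGAV1_VERSION evaluates to
+// (0 << 16) | (19 << 8) | 0 = 0x001300.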
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Returns the library's version number, packed in an int using 8 bits for
+// each of major/minor/patch. For example, 1.2.3 is 0x010203.
+LIBGAV1_PUBLIC int Libgav1GetVersion(void);
+
+// Returns the library's version number as a string in the format
+// 'MAJOR.MINOR.PATCH'. Always returns a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetVersionString(void);
+
+// Returns the build configuration used to produce the library. Always returns
+// a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetBuildConfiguration(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+
+namespace libgav1 {
+
+// Returns the library's version number, packed in an int using 8 bits for
+// each of major/minor/patch. For example, 1.2.3 is 0x010203.
+inline int GetVersion() { return Libgav1GetVersion(); }
+
+// Returns the library's version number as a string in the format
+// 'MAJOR.MINOR.PATCH'. Always returns a valid (non-NULL) string.
+inline const char* GetVersionString() { return Libgav1GetVersionString(); }
+
+// Returns the build configuration used to produce the library. Always returns
+// a valid (non-NULL) string.
+inline const char* GetBuildConfiguration() {
+ return Libgav1GetBuildConfiguration();
+}
+
+} // namespace libgav1
+#endif // defined(__cplusplus)
+
+#endif // LIBGAV1_SRC_GAV1_VERSION_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is just a convenience to separate out all the inter-intra masks
+// from the code where they are used.
+
+// The tables in this file are computed based on section 7.11.3.13 in the spec.
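+//
+// Each entry is a blending weight out of 64. The DC mask below is uniformly
+// 32, i.e. an equal blend of the two predictors. The directional masks
+// broadcast a single decaying weight vector (60, 45, 34, 26, 19, ...) across
+// the block, so the blend shifts gradually with distance from the edge the
+// intra prediction is propagated from.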
+
+constexpr uint8_t kInterIntraMaskDc[] = {
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
+
+constexpr uint8_t kInterIntraMaskVertical4x4[] = {
+ 60, 60, 60, 60, 19, 19, 19, 19, 6, 6, 6, 6, 2, 2, 2, 2};
+constexpr uint8_t kInterIntraMaskVertical4x8[] = {
+ 60, 60, 60, 60, 34, 34, 34, 34, 19, 19, 19, 19, 11, 11, 11, 11,
+ 6, 6, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical8x4[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, 34,
+ 19, 19, 19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 11};
+constexpr uint8_t kInterIntraMaskVertical8x8[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, 34,
+ 19, 19, 19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 11,
+ 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4,
+ 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical8x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34,
+ 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19,
+ 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 11, 11, 11, 11, 11, 11, 11, 11, 8,
+ 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5,
+ 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3,
+ 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical16x8[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+constexpr uint8_t kInterIntraMaskVertical16x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical16x32[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskVertical32x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
+constexpr uint8_t kInterIntraMaskVertical32x32[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+constexpr uint8_t kInterIntraMaskHorizontal4x4[] = {60, 19, 6, 2, 60, 19, 6, 2,
+ 60, 19, 6, 2, 60, 19, 6, 2};
+constexpr uint8_t kInterIntraMaskHorizontal4x8[] = {
+ 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11,
+ 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11};
+constexpr uint8_t kInterIntraMaskHorizontal8x4[] = {
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1};
+constexpr uint8_t kInterIntraMaskHorizontal8x8[] = {
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1,
+ 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1};
+constexpr uint8_t kInterIntraMaskHorizontal8x16[] = {
+ 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34,
+ 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15,
+ 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60,
+ 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26,
+ 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11,
+ 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45,
+ 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8};
+constexpr uint8_t kInterIntraMaskHorizontal16x8[] = {
+ 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34,
+ 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15,
+ 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6,
+ 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3,
+ 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1,
+ 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45,
+ 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1};
+constexpr uint8_t kInterIntraMaskHorizontal16x16[] = {
+ 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34,
+ 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15,
+ 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6,
+ 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3,
+ 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1,
+ 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45,
+ 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19,
+ 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8,
+ 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4,
+ 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2,
+ 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60,
+ 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26,
+ 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11,
+ 8, 6, 5, 4, 3, 2, 2, 1, 1};
+constexpr uint8_t kInterIntraMaskHorizontal16x32[] = {
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60,
+ 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26,
+ 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+ 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7};
+constexpr uint8_t kInterIntraMaskHorizontal32x16[] = {
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5,
+ 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2,
+ 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1,
+ 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6,
+ 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2,
+ 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1,
+ 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6,
+ 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3,
+ 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3,
+ 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1,
+ 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4,
+ 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1};
+constexpr uint8_t kInterIntraMaskHorizontal32x32[] = {
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5,
+ 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2,
+ 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1,
+ 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6,
+ 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2,
+ 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1,
+ 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6,
+ 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3,
+ 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3,
+ 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1,
+ 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4,
+ 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60,
+ 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4,
+ 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26,
+ 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2,
+ 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+ 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5,
+ 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2,
+ 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1,
+ 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6,
+ 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2,
+ 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1,
+ 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6,
+ 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3,
+ 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3,
+ 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1,
+ 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1};
+
+constexpr uint8_t kInterIntraMaskSmooth4x4[] = {60, 60, 60, 60, 60, 19, 19, 19,
+ 60, 19, 6, 6, 60, 19, 6, 2};
+constexpr uint8_t kInterIntraMaskSmooth4x8[] = {
+ 60, 60, 60, 60, 60, 34, 34, 34, 60, 34, 19, 19, 60, 34, 19, 11,
+ 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11};
+constexpr uint8_t kInterIntraMaskSmooth8x4[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34,
+ 60, 34, 19, 19, 19, 19, 19, 19, 60, 34, 19, 11, 11, 11, 11, 11};
+constexpr uint8_t kInterIntraMaskSmooth8x8[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34,
+ 60, 34, 19, 19, 19, 19, 19, 19, 60, 34, 19, 11, 11, 11, 11, 11,
+ 60, 34, 19, 11, 6, 6, 6, 6, 60, 34, 19, 11, 6, 4, 4, 4,
+ 60, 34, 19, 11, 6, 4, 2, 2, 60, 34, 19, 11, 6, 4, 2, 1};
+constexpr uint8_t kInterIntraMaskSmooth8x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34,
+ 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19,
+ 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 60, 45, 34, 26, 19, 15, 11, 11, 60,
+ 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26,
+ 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11,
+ 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45,
+ 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8};
+constexpr uint8_t kInterIntraMaskSmooth16x8[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 60, 45, 34, 26, 19, 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 45,
+ 34, 26, 19, 15, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+constexpr uint8_t kInterIntraMaskSmooth16x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 60, 45, 34, 26, 19, 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 45,
+ 34, 26, 19, 15, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 60, 45, 34, 26, 19,
+ 15, 11, 8, 6, 6, 6, 6, 6, 6, 6, 6, 60, 45, 34, 26, 19, 15, 11, 8,
+ 6, 5, 5, 5, 5, 5, 5, 5, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4,
+ 4, 4, 4, 4, 4, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 3, 3,
+ 3, 3, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 2, 2, 60,
+ 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 2, 2, 60, 45, 34, 26,
+ 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11,
+ 8, 6, 5, 4, 3, 2, 2, 1, 1};
+constexpr uint8_t kInterIntraMaskSmooth16x32[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 52, 45, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52,
+ 45, 39, 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 19, 19, 19, 19, 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 13, 13,
+ 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11, 11, 60,
+ 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 8, 60, 52, 45, 39, 34, 30, 26,
+ 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+ 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7};
+constexpr uint8_t kInterIntraMaskSmooth32x16[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 60, 52, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52, 45, 39,
+ 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34, 30, 26, 22, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
+constexpr uint8_t kInterIntraMaskSmooth32x32[] = {
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
+ 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
+ 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 60, 52, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52, 45, 39,
+ 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34, 30, 26, 22, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 52,
+ 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 60, 52, 45, 39, 34, 30, 26, 22,
+ 19, 17, 15, 13, 11, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10,
+ 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 60,
+ 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 60, 52, 45, 39, 34, 30, 26,
+ 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11,
+ 10, 8, 7, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 60, 52, 45, 39, 34, 30,
+ 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13,
+ 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6,
+ 5, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 60, 52, 45, 39, 34,
+ 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15,
+ 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6,
+ 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39,
+ 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17,
+ 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1,
+ 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45,
+ 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3,
+ 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19,
+ 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1,
+ 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8,
+ 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1};
+
+// For each 2D array within this array, the indices are mapped as follows: 0,
+// 1, 2 and 3 in each dimension map to prediction dimensions 4, 8, 16 and 32
+// respectively. For example, the entry at [1][2] corresponds to a prediction
+// size of 8x16 (width == 8 and height == 16).
+const uint8_t* kInterIntraMasks[kNumInterIntraModes][4][4] = {
+    // kInterIntraModeDc. This is a special case: every non-nullptr entry
+    // points to kInterIntraMaskDc, whose entries are all 32. Because the mask
+    // is constant, the same array can be read with whatever width the
+    // prediction size requires.
+ {{kInterIntraMaskDc, kInterIntraMaskDc, nullptr, nullptr},
+ {kInterIntraMaskDc, kInterIntraMaskDc, kInterIntraMaskDc, nullptr},
+ {nullptr, kInterIntraMaskDc, kInterIntraMaskDc, kInterIntraMaskDc},
+ {nullptr, nullptr, kInterIntraMaskDc, kInterIntraMaskDc}},
+ // kInterIntraModeVertical
+ {{kInterIntraMaskVertical4x4, kInterIntraMaskVertical4x8, nullptr, nullptr},
+ {kInterIntraMaskVertical8x4, kInterIntraMaskVertical8x8,
+ kInterIntraMaskVertical8x16, nullptr},
+ {nullptr, kInterIntraMaskVertical16x8, kInterIntraMaskVertical16x16,
+ kInterIntraMaskVertical16x32},
+ {nullptr, nullptr, kInterIntraMaskVertical32x16,
+ kInterIntraMaskVertical32x32}},
+ // kInterIntraModeHorizontal
+ {{kInterIntraMaskHorizontal4x4, kInterIntraMaskHorizontal4x8, nullptr,
+ nullptr},
+ {kInterIntraMaskHorizontal8x4, kInterIntraMaskHorizontal8x8,
+ kInterIntraMaskHorizontal8x16, nullptr},
+ {nullptr, kInterIntraMaskHorizontal16x8, kInterIntraMaskHorizontal16x16,
+ kInterIntraMaskHorizontal16x32},
+ {nullptr, nullptr, kInterIntraMaskHorizontal32x16,
+ kInterIntraMaskHorizontal32x32}},
+ // kInterIntraModeSmooth
+ {{kInterIntraMaskSmooth4x4, kInterIntraMaskSmooth4x8, nullptr, nullptr},
+ {kInterIntraMaskSmooth8x4, kInterIntraMaskSmooth8x8,
+ kInterIntraMaskSmooth8x16, nullptr},
+ {nullptr, kInterIntraMaskSmooth16x8, kInterIntraMaskSmooth16x16,
+ kInterIntraMaskSmooth16x32},
+ {nullptr, nullptr, kInterIntraMaskSmooth32x16,
+ kInterIntraMaskSmooth32x32}}};
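+
+// Illustrative only (a hypothetical helper, not part of this table): with a
+// FloorLog2() utility such as the one in src/utils/common.h, a mask lookup
+// for a width x height prediction reduces to
+//
+//   const uint8_t* GetInterIntraMask(int mode, int width, int height) {
+//     return kInterIntraMasks[mode][FloorLog2(width) - 2]
+//                            [FloorLog2(height) - 2];
+//   }
+//
+// For example, an 8x16 prediction reads kInterIntraMasks[mode][1][2].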
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/internal_frame_buffer_list.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+extern "C" {
+
+Libgav1StatusCode OnInternalFrameBufferSizeChanged(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment) {
+ auto* buffer_list =
+ static_cast<InternalFrameBufferList*>(callback_private_data);
+ return buffer_list->OnFrameBufferSizeChanged(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment);
+}
+
+Libgav1StatusCode GetInternalFrameBuffer(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+ auto* buffer_list =
+ static_cast<InternalFrameBufferList*>(callback_private_data);
+ return buffer_list->GetFrameBuffer(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment, frame_buffer);
+}
+
+void ReleaseInternalFrameBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* buffer_list =
+ static_cast<InternalFrameBufferList*>(callback_private_data);
+ buffer_list->ReleaseFrameBuffer(buffer_private_data);
+}
+
+} // extern "C"
+
+StatusCode InternalFrameBufferList::OnFrameBufferSizeChanged(
+ int /*bitdepth*/, Libgav1ImageFormat /*image_format*/, int /*width*/,
+ int /*height*/, int /*left_border*/, int /*right_border*/,
+ int /*top_border*/, int /*bottom_border*/, int /*stride_alignment*/) {
+ return kStatusOk;
+}
+
+StatusCode InternalFrameBufferList::GetFrameBuffer(
+ int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+ FrameBufferInfo info;
+ StatusCode status = ComputeFrameBufferInfo(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment, &info);
+ if (status != kStatusOk) return status;
+
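+  // Guard the sum below: y_buffer_size + 2 * uv_buffer_size must not overflow
+  // size_t.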
+ if (info.uv_buffer_size > SIZE_MAX / 2 ||
+ info.y_buffer_size > SIZE_MAX - 2 * info.uv_buffer_size) {
+ return kStatusInvalidArgument;
+ }
+ const size_t min_size = info.y_buffer_size + 2 * info.uv_buffer_size;
+
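+  // Reuse a buffer that is not currently in use; otherwise grow the list by
+  // one entry.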
+ Buffer* buffer = nullptr;
+ for (auto& buffer_ptr : buffers_) {
+ if (!buffer_ptr->in_use) {
+ buffer = buffer_ptr.get();
+ break;
+ }
+ }
+ if (buffer == nullptr) {
+ std::unique_ptr<Buffer> new_buffer(new (std::nothrow) Buffer);
+ if (new_buffer == nullptr || !buffers_.push_back(std::move(new_buffer))) {
+ return kStatusOutOfMemory;
+ }
+ buffer = buffers_.back().get();
+ }
+
+ if (buffer->size < min_size) {
+ std::unique_ptr<uint8_t[], MallocDeleter> new_data(
+ static_cast<uint8_t*>(malloc(min_size)));
+ if (new_data == nullptr) return kStatusOutOfMemory;
+ buffer->data = std::move(new_data);
+ buffer->size = min_size;
+ }
+
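+  // Carve the single allocation into contiguous Y, U and V planes. U and V
+  // are absent (uv_buffer_size == 0) for monochrome.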
+ uint8_t* const y_buffer = buffer->data.get();
+ uint8_t* const u_buffer =
+ (info.uv_buffer_size == 0) ? nullptr : y_buffer + info.y_buffer_size;
+ uint8_t* const v_buffer =
+ (info.uv_buffer_size == 0) ? nullptr : u_buffer + info.uv_buffer_size;
+ status = Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer, buffer,
+ frame_buffer);
+ if (status != kStatusOk) return status;
+ buffer->in_use = true;
+ return kStatusOk;
+}
+
+void InternalFrameBufferList::ReleaseFrameBuffer(void* buffer_private_data) {
+ auto* const buffer = static_cast<Buffer*>(buffer_private_data);
+ buffer->in_use = false;
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
+#define LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include "src/gav1/frame_buffer.h"
+#include "src/utils/memory.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+extern "C" Libgav1StatusCode OnInternalFrameBufferSizeChanged(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment);
+
+extern "C" Libgav1StatusCode GetInternalFrameBuffer(
+ void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+ int width, int height, int left_border, int right_border, int top_border,
+ int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer);
+
+extern "C" void ReleaseInternalFrameBuffer(void* callback_private_data,
+ void* buffer_private_data);
+
+class InternalFrameBufferList : public Allocable {
+ public:
+ InternalFrameBufferList() = default;
+
+ // Not copyable or movable.
+ InternalFrameBufferList(const InternalFrameBufferList&) = delete;
+ InternalFrameBufferList& operator=(const InternalFrameBufferList&) = delete;
+
+ ~InternalFrameBufferList() = default;
+
+ Libgav1StatusCode OnFrameBufferSizeChanged(int bitdepth,
+ Libgav1ImageFormat image_format,
+ int width, int height,
+ int left_border, int right_border,
+ int top_border, int bottom_border,
+ int stride_alignment);
+
+ Libgav1StatusCode GetFrameBuffer(int bitdepth,
+ Libgav1ImageFormat image_format, int width,
+ int height, int left_border,
+ int right_border, int top_border,
+ int bottom_border, int stride_alignment,
+ Libgav1FrameBuffer* frame_buffer);
+
+ void ReleaseFrameBuffer(void* buffer_private_data);
+
+ private:
+ struct Buffer : public Allocable {
+ std::unique_ptr<uint8_t[], MallocDeleter> data;
+ size_t size = 0;
+ bool in_use = false;
+ };
+
+ Vector<std::unique_ptr<Buffer>> buffers_;
+};
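+
+// Illustrative wiring sketch (assuming decoder settings that expose these
+// callback fields, as the C++ DecoderSettings wrapper does):
+//
+//   InternalFrameBufferList buffer_list;
+//   DecoderSettings settings;
+//   settings.on_frame_buffer_size_changed = OnInternalFrameBufferSizeChanged;
+//   settings.get_frame_buffer = GetInternalFrameBuffer;
+//   settings.release_frame_buffer = ReleaseInternalFrameBuffer;
+//   settings.callback_private_data = &buffer_list;
+//
+// buffer_list must outlive any decoder configured with these callbacks.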
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/internal_frame_buffer_list.h"
+
+#include <cstdint>
+
+#include "gtest/gtest.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/frame_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+class InternalFrameBufferListTest : public testing::Test {
+ protected:
+ static constexpr int kBufferListSize = 10;
+
+ InternalFrameBufferListTest() {
+ on_frame_buffer_size_changed_ = OnInternalFrameBufferSizeChanged;
+ get_frame_buffer_ = GetInternalFrameBuffer;
+ release_frame_buffer_ = ReleaseInternalFrameBuffer;
+ callback_private_data_ = &buffer_list_;
+ }
+
+ // Frame buffer callbacks.
+ FrameBufferSizeChangedCallback on_frame_buffer_size_changed_;
+ GetFrameBufferCallback get_frame_buffer_;
+ ReleaseFrameBufferCallback release_frame_buffer_;
+ // Private data associated with the frame buffer callbacks.
+ void* callback_private_data_;
+
+ private:
+ InternalFrameBufferList buffer_list_;
+};
+
+TEST_F(InternalFrameBufferListTest, ReleaseInRandomOrder) {
+ const int bitdepth = 8;
+ const Libgav1ImageFormat image_format = kLibgav1ImageFormatYuv420;
+ const int width = 100;
+ const int height = 50;
+ const int left_border = 0;
+ const int right_border = 0;
+ const int top_border = 0;
+ const int bottom_border = 0;
+ const int stride_alignment = 16;
+
+ EXPECT_EQ(on_frame_buffer_size_changed_(callback_private_data_, bitdepth,
+ image_format, width, height,
+ left_border, right_border, top_border,
+ bottom_border, stride_alignment),
+ 0);
+
+ FrameBuffer frame_buffers[kBufferListSize];
+ for (auto& frame_buffer : frame_buffers) {
+ EXPECT_EQ(
+ get_frame_buffer_(callback_private_data_, bitdepth, image_format, width,
+ height, left_border, right_border, top_border,
+ bottom_border, stride_alignment, &frame_buffer),
+ 0);
+ EXPECT_NE(frame_buffer.plane[0], nullptr);
+ EXPECT_GE(frame_buffer.stride[0], 112);
+ EXPECT_NE(frame_buffer.plane[1], nullptr);
+ EXPECT_GE(frame_buffer.stride[1], 64);
+ EXPECT_NE(frame_buffer.plane[2], nullptr);
+ EXPECT_GE(frame_buffer.stride[2], 64);
+ }
+
+ // Release and get a few buffers at indexes <= 5 in random order.
+ static_assert(5 < kBufferListSize, "");
+ static constexpr int indexes[] = {1, 4, 5, 5, 4, 3, 2, 3, 5, 0};
+ for (int index : indexes) {
+ release_frame_buffer_(callback_private_data_,
+ frame_buffers[index].private_data);
+
+ EXPECT_EQ(get_frame_buffer_(callback_private_data_, bitdepth, image_format,
+ width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment,
+ &frame_buffers[index]),
+ 0);
+ EXPECT_NE(frame_buffers[index].plane[0], nullptr);
+ EXPECT_GE(frame_buffers[index].stride[0], 112);
+ EXPECT_NE(frame_buffers[index].plane[1], nullptr);
+ EXPECT_GE(frame_buffers[index].stride[1], 64);
+ EXPECT_NE(frame_buffers[index].plane[2], nullptr);
+ EXPECT_GE(frame_buffers[index].stride[2], 64);
+ }
+
+ for (auto& frame_buffer : frame_buffers) {
+ release_frame_buffer_(callback_private_data_, frame_buffer.private_data);
+ }
+}
+
+TEST_F(InternalFrameBufferListTest, VaryingBufferSizes) {
+ const int bitdepth = 8;
+ const Libgav1ImageFormat image_format = kLibgav1ImageFormatYuv420;
+ const int width = 64;
+ const int height = 48;
+ const int left_border = 16;
+ const int right_border = 16;
+ const int top_border = 16;
+ const int bottom_border = 16;
+ const int stride_alignment = 16;
+
+ EXPECT_EQ(on_frame_buffer_size_changed_(callback_private_data_, bitdepth,
+ image_format, 16 * width, 16 * height,
+ left_border, right_border, top_border,
+ bottom_border, stride_alignment),
+ 0);
+
+ FrameBuffer frame_buffer;
+ for (int i = 1; i <= 16; ++i) {
+ EXPECT_EQ(get_frame_buffer_(callback_private_data_, bitdepth, image_format,
+ i * width, i * height, left_border,
+ right_border, top_border, bottom_border,
+ stride_alignment, &frame_buffer),
+ 0);
+ EXPECT_NE(frame_buffer.plane[0], nullptr);
+ EXPECT_GE(frame_buffer.stride[0], i * width + left_border + right_border);
+ EXPECT_NE(frame_buffer.plane[1], nullptr);
+ EXPECT_GE(frame_buffer.stride[1],
+ (i * width + left_border + right_border) >> 1);
+ EXPECT_NE(frame_buffer.plane[2], nullptr);
+ EXPECT_GE(frame_buffer.stride[2],
+ (i * width + left_border + right_border) >> 1);
+ release_frame_buffer_(callback_private_data_, frame_buffer.private_data);
+ }
+ for (int i = 16; i >= 1; --i) {
+ EXPECT_EQ(get_frame_buffer_(callback_private_data_, bitdepth, image_format,
+ i * width, i * height, left_border,
+ right_border, top_border, bottom_border,
+ stride_alignment, &frame_buffer),
+ 0);
+ EXPECT_NE(frame_buffer.plane[0], nullptr);
+ EXPECT_GE(frame_buffer.stride[0], i * width + left_border + right_border);
+ EXPECT_NE(frame_buffer.plane[1], nullptr);
+ EXPECT_GE(frame_buffer.stride[1],
+ (i * width + left_border + right_border) >> 1);
+ EXPECT_NE(frame_buffer.plane[2], nullptr);
+ EXPECT_GE(frame_buffer.stride[2],
+ (i * width + left_border + right_border) >> 1);
+ release_frame_buffer_(callback_private_data_, frame_buffer.private_data);
+ }
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_)
+ return()
+endif() # LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_
+set(LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_ 1)
+
+list(APPEND libgav1_decoder_sources
+ "${libgav1_source}/buffer_pool.cc"
+ "${libgav1_source}/buffer_pool.h"
+ "${libgav1_source}/decoder_impl.cc"
+ "${libgav1_source}/decoder_impl.h"
+ "${libgav1_source}/decoder_state.h"
+ "${libgav1_source}/tile_scratch_buffer.cc"
+ "${libgav1_source}/tile_scratch_buffer.h"
+ "${libgav1_source}/film_grain.cc"
+ "${libgav1_source}/film_grain.h"
+ "${libgav1_source}/frame_buffer.cc"
+ "${libgav1_source}/frame_buffer_utils.h"
+ "${libgav1_source}/frame_scratch_buffer.h"
+ "${libgav1_source}/inter_intra_masks.inc"
+ "${libgav1_source}/internal_frame_buffer_list.cc"
+ "${libgav1_source}/internal_frame_buffer_list.h"
+ "${libgav1_source}/loop_restoration_info.cc"
+ "${libgav1_source}/loop_restoration_info.h"
+ "${libgav1_source}/motion_vector.cc"
+ "${libgav1_source}/motion_vector.h"
+ "${libgav1_source}/obu_parser.cc"
+ "${libgav1_source}/obu_parser.h"
+ "${libgav1_source}/post_filter/cdef.cc"
+ "${libgav1_source}/post_filter/deblock.cc"
+ "${libgav1_source}/post_filter/deblock_thresholds.inc"
+ "${libgav1_source}/post_filter/loop_restoration.cc"
+ "${libgav1_source}/post_filter/post_filter.cc"
+ "${libgav1_source}/post_filter/super_res.cc"
+ "${libgav1_source}/post_filter.h"
+ "${libgav1_source}/prediction_mask.cc"
+ "${libgav1_source}/prediction_mask.h"
+ "${libgav1_source}/quantizer.cc"
+ "${libgav1_source}/quantizer.h"
+ "${libgav1_source}/quantizer_tables.inc"
+ "${libgav1_source}/reconstruction.cc"
+ "${libgav1_source}/reconstruction.h"
+ "${libgav1_source}/residual_buffer_pool.cc"
+ "${libgav1_source}/residual_buffer_pool.h"
+ "${libgav1_source}/scan_tables.inc"
+ "${libgav1_source}/symbol_decoder_context.cc"
+ "${libgav1_source}/symbol_decoder_context.h"
+ "${libgav1_source}/symbol_decoder_context_cdfs.inc"
+ "${libgav1_source}/threading_strategy.cc"
+ "${libgav1_source}/threading_strategy.h"
+ "${libgav1_source}/tile.h"
+ "${libgav1_source}/tile/bitstream/mode_info.cc"
+ "${libgav1_source}/tile/bitstream/palette.cc"
+ "${libgav1_source}/tile/bitstream/partition.cc"
+ "${libgav1_source}/tile/bitstream/transform_size.cc"
+ "${libgav1_source}/tile/prediction.cc"
+ "${libgav1_source}/tile/tile.cc"
+ "${libgav1_source}/warp_prediction.cc"
+ "${libgav1_source}/warp_prediction.h"
+ "${libgav1_source}/yuv_buffer.cc"
+ "${libgav1_source}/yuv_buffer.h")
+
+list(APPEND libgav1_api_includes "${libgav1_source}/gav1/decoder.h"
+ "${libgav1_source}/gav1/decoder_buffer.h"
+ "${libgav1_source}/gav1/decoder_settings.h"
+ "${libgav1_source}/gav1/frame_buffer.h"
+ "${libgav1_source}/gav1/status_code.h"
+ "${libgav1_source}/gav1/symbol_visibility.h"
+ "${libgav1_source}/gav1/version.h")
+
+list(APPEND libgav1_api_sources "${libgav1_source}/decoder.cc"
+ "${libgav1_source}/decoder_settings.cc"
+ "${libgav1_source}/status_code.cc"
+ "${libgav1_source}/version.cc"
+ ${libgav1_api_includes})
+
+macro(libgav1_add_decoder_targets)
+ if(BUILD_SHARED_LIBS)
+ if(MSVC OR WIN32)
+      # To produce a DLL and an import library, the Windows tools require that
+      # the exported symbols are part of the DLL target. The unfortunate side
+      # effect is that a single configuration cannot output both the static
+      # library and the DLL: Windows users of the libgav1 build can have a DLL
+      # and an import library, or a static library, but not both from a single
+      # configuration of the build.
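+      #
+      # For example (illustrative invocations):
+      #   cmake -DBUILD_SHARED_LIBS=1 <path>  # DLL and import library
+      #   cmake -DBUILD_SHARED_LIBS=0 <path>  # static library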
+ list(APPEND libgav1_shared_lib_sources ${libgav1_api_sources})
+ list(APPEND libgav1_static_lib_sources ${libgav1_api_includes})
+ else()
+ list(APPEND libgav1_shared_lib_sources ${libgav1_api_includes})
+ list(APPEND libgav1_static_lib_sources ${libgav1_api_sources})
+ endif()
+ else()
+ list(APPEND libgav1_static_lib_sources ${libgav1_api_sources})
+ endif()
+
+ if(use_absl_threading)
+ list(APPEND libgav1_absl_deps absl::base absl::synchronization)
+ endif()
+
+ libgav1_add_library(NAME libgav1_decoder TYPE OBJECT SOURCES
+ ${libgav1_decoder_sources} DEFINES ${libgav1_defines}
+ INCLUDES ${libgav1_include_paths})
+
+ libgav1_add_library(NAME
+ libgav1_static
+ OUTPUT_NAME
+ libgav1
+ TYPE
+ STATIC
+ SOURCES
+ ${libgav1_static_lib_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ LIB_DEPS
+ ${libgav1_absl_deps}
+ OBJLIB_DEPS
+ libgav1_dsp
+ libgav1_decoder
+ libgav1_utils
+ PUBLIC_INCLUDES
+ ${libgav1_source})
+
+ if(BUILD_SHARED_LIBS)
+ libgav1_add_library(NAME
+ libgav1_shared
+ OUTPUT_NAME
+ libgav1
+ TYPE
+ SHARED
+ SOURCES
+ ${libgav1_shared_lib_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ LIB_DEPS
+ libgav1_static
+ PUBLIC_INCLUDES
+ ${libgav1_source})
+ endif()
+endmacro()
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/loop_restoration_info.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// Controls how self-guided restoration deltas are read.
+constexpr int kSgrProjReadControl = 4;
+// Maps the restoration type encoded in the bitstream's compressed headers (the
+// restoration_type element in the spec) to LoopRestorationType. This map is
+// used only when the restoration type in the frame header is
+// kLoopRestorationTypeSwitchable.
+constexpr LoopRestorationType kBitstreamRestorationTypeMap[] = {
+ kLoopRestorationTypeNone, kLoopRestorationTypeWiener,
+ kLoopRestorationTypeSgrProj};
+
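+// Returns the number of consecutive zero taps at the start of a Wiener filter
+// half (at most 3; the center tap is never examined).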
+inline int CountLeadingZeroCoefficients(const int16_t* const filter) {
+ int number_zero_coefficients = 0;
+ if (filter[0] == 0) {
+ number_zero_coefficients++;
+ if (filter[1] == 0) {
+ number_zero_coefficients++;
+ if (filter[2] == 0) {
+ number_zero_coefficients++;
+ }
+ }
+ }
+ return number_zero_coefficients;
+}
+
+} // namespace
+
+bool LoopRestorationInfo::Reset(const LoopRestoration* const loop_restoration,
+ uint32_t width, uint32_t height,
+ int8_t subsampling_x, int8_t subsampling_y,
+ bool is_monochrome) {
+ loop_restoration_ = loop_restoration;
+ subsampling_x_ = subsampling_x;
+ subsampling_y_ = subsampling_y;
+
+ const int num_planes = is_monochrome ? kMaxPlanesMonochrome : kMaxPlanes;
+ int total_num_units = 0;
+ for (int plane = kPlaneY; plane < num_planes; ++plane) {
+ if (loop_restoration_->type[plane] == kLoopRestorationTypeNone) {
+ plane_needs_filtering_[plane] = false;
+ continue;
+ }
+ plane_needs_filtering_[plane] = true;
+ const int plane_width =
+ (plane == kPlaneY) ? width : SubsampledValue(width, subsampling_x_);
+ const int plane_height =
+ (plane == kPlaneY) ? height : SubsampledValue(height, subsampling_y_);
+ num_horizontal_units_[plane] =
+ std::max(1, RightShiftWithRounding(
+ plane_width, loop_restoration_->unit_size_log2[plane]));
+ num_vertical_units_[plane] = std::max(
+ 1, RightShiftWithRounding(plane_height,
+ loop_restoration_->unit_size_log2[plane]));
+ num_units_[plane] =
+ num_horizontal_units_[plane] * num_vertical_units_[plane];
+ total_num_units += num_units_[plane];
+ }
+ // Allocate the RestorationUnitInfo arrays for all planes in a single heap
+ // allocation and divide up the buffer into arrays of the right sizes.
+ if (!loop_restoration_info_buffer_.Resize(total_num_units)) {
+ return false;
+ }
+ RestorationUnitInfo* loop_restoration_info =
+ loop_restoration_info_buffer_.get();
+ for (int plane = kPlaneY; plane < num_planes; ++plane) {
+ if (loop_restoration_->type[plane] == kLoopRestorationTypeNone) {
+ continue;
+ }
+ loop_restoration_info_[plane] = loop_restoration_info;
+ loop_restoration_info += num_units_[plane];
+ }
+ return true;
+}
+
+bool LoopRestorationInfo::PopulateUnitInfoForSuperBlock(
+ Plane plane, BlockSize block_size, bool is_superres_scaled,
+ uint8_t superres_scale_denominator, int row4x4, int column4x4,
+ LoopRestorationUnitInfo* const unit_info) const {
+ assert(unit_info != nullptr);
+ if (!plane_needs_filtering_[plane]) return false;
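+  // With superres, horizontal positions are in downscaled-frame coordinates;
+  // scaling by superres_scale_denominator / 8 (the extra 3 in the column
+  // shift) converts them to upscaled-frame coordinates before mapping to
+  // restoration units.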
+ const int numerator_column =
+ is_superres_scaled ? superres_scale_denominator : 1;
+ const int pixel_column_start =
+ RowOrColumn4x4ToPixel(column4x4, plane, subsampling_x_);
+ const int pixel_column_end = RowOrColumn4x4ToPixel(
+ column4x4 + kNum4x4BlocksWide[block_size], plane, subsampling_x_);
+ const int unit_row_log2 = loop_restoration_->unit_size_log2[plane];
+ const int denominator_column_log2 =
+ unit_row_log2 + (is_superres_scaled ? 3 : 0);
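+  // With superres, a pixel column c in the downscaled frame corresponds to
+  // column c * superres_scale_denominator / 8 in the upscaled frame, so the
+  // restoration unit index is
+  // c * superres_scale_denominator / (8 << unit_row_log2), hence the extra 3
+  // bits in the denominator.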
+ const int pixel_row_start =
+ RowOrColumn4x4ToPixel(row4x4, plane, subsampling_y_);
+ const int pixel_row_end = RowOrColumn4x4ToPixel(
+ row4x4 + kNum4x4BlocksHigh[block_size], plane, subsampling_y_);
+ unit_info->column_start = RightShiftWithCeiling(
+ pixel_column_start * numerator_column, denominator_column_log2);
+ unit_info->column_end = RightShiftWithCeiling(
+ pixel_column_end * numerator_column, denominator_column_log2);
+ unit_info->row_start = RightShiftWithCeiling(pixel_row_start, unit_row_log2);
+ unit_info->row_end = RightShiftWithCeiling(pixel_row_end, unit_row_log2);
+ unit_info->column_end =
+ std::min(unit_info->column_end, num_horizontal_units_[plane]);
+ unit_info->row_end = std::min(unit_info->row_end, num_vertical_units_[plane]);
+ return true;
+}
+
+void LoopRestorationInfo::ReadUnitCoefficients(
+ EntropyDecoder* const reader,
+ SymbolDecoderContext* const symbol_decoder_context, Plane plane,
+ int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+ LoopRestorationType unit_restoration_type = kLoopRestorationTypeNone;
+ if (loop_restoration_->type[plane] == kLoopRestorationTypeSwitchable) {
+ unit_restoration_type = kBitstreamRestorationTypeMap
+ [reader->ReadSymbol<kRestorationTypeSymbolCount>(
+ symbol_decoder_context->restoration_type_cdf)];
+ } else if (loop_restoration_->type[plane] == kLoopRestorationTypeWiener) {
+ const bool use_wiener =
+ reader->ReadSymbol(symbol_decoder_context->use_wiener_cdf);
+ if (use_wiener) unit_restoration_type = kLoopRestorationTypeWiener;
+ } else if (loop_restoration_->type[plane] == kLoopRestorationTypeSgrProj) {
+ const bool use_sgrproj =
+ reader->ReadSymbol(symbol_decoder_context->use_sgrproj_cdf);
+ if (use_sgrproj) unit_restoration_type = kLoopRestorationTypeSgrProj;
+ }
+ loop_restoration_info_[plane][unit_id].type = unit_restoration_type;
+
+ if (unit_restoration_type == kLoopRestorationTypeWiener) {
+ ReadWienerInfo(reader, plane, unit_id, reference_unit_info);
+ } else if (unit_restoration_type == kLoopRestorationTypeSgrProj) {
+ ReadSgrProjInfo(reader, plane, unit_id, reference_unit_info);
+ }
+}
+
+void LoopRestorationInfo::ReadWienerInfo(
+ EntropyDecoder* const reader, Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+ for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+ if (plane != kPlaneY) {
+ loop_restoration_info_[plane][unit_id].wiener_info.filter[i][0] = 0;
+ }
+ int sum = 0;
+ for (int j = static_cast<int>(plane != kPlaneY); j < kNumWienerCoefficients;
+ ++j) {
+ const int8_t wiener_min = kWienerTapsMin[j];
+ const int8_t wiener_max = kWienerTapsMax[j];
+ const int control = j + 1;
+ int value;
+ if (!reader->DecodeSignedSubexpWithReference(
+ wiener_min, wiener_max + 1,
+ (*reference_unit_info)[plane].wiener_info.filter[i][j], control,
+ &value)) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Error decoding Wiener filter coefficients: plane %d, unit_id %d",
+ static_cast<int>(plane), unit_id);
+ return;
+ }
+ loop_restoration_info_[plane][unit_id].wiener_info.filter[i][j] = value;
+ (*reference_unit_info)[plane].wiener_info.filter[i][j] = value;
+ sum += value;
+ }
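+    // The 7-tap Wiener filter is symmetric: taps 0-2 are coded and mirrored
+    // into taps 4-6, and the center tap 3 is derived so that the whole filter
+    // sums to 128.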
+ loop_restoration_info_[plane][unit_id].wiener_info.filter[i][3] =
+ 128 - 2 * sum;
+ loop_restoration_info_[plane][unit_id]
+ .wiener_info.number_leading_zero_coefficients[i] =
+ CountLeadingZeroCoefficients(
+ loop_restoration_info_[plane][unit_id].wiener_info.filter[i]);
+ }
+}
+
+void LoopRestorationInfo::ReadSgrProjInfo(
+ EntropyDecoder* const reader, Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+ const int sgr_proj_index =
+ static_cast<int>(reader->ReadLiteral(kSgrProjParamsBits));
+ loop_restoration_info_[plane][unit_id].sgr_proj_info.index = sgr_proj_index;
+ for (int i = 0; i < 2; ++i) {
+ const uint8_t radius = kSgrProjParams[sgr_proj_index][i * 2];
+ const int8_t multiplier_min = kSgrProjMultiplierMin[i];
+ const int8_t multiplier_max = kSgrProjMultiplierMax[i];
+ int multiplier;
+ if (radius != 0) {
+ if (!reader->DecodeSignedSubexpWithReference(
+ multiplier_min, multiplier_max + 1,
+ (*reference_unit_info)[plane].sgr_proj_info.multiplier[i],
+ kSgrProjReadControl, &multiplier)) {
+ LIBGAV1_DLOG(ERROR,
+ "Error decoding Self-guided filter coefficients: plane "
+ "%d, unit_id %d",
+ static_cast<int>(plane), unit_id);
+ return;
+ }
+ } else {
+      // The range of (*reference_unit_info)[plane].sgr_proj_info.multiplier[0]
+      // from DecodeSignedSubexpWithReference() is [-96, 31] (the default is
+      // -32), so 128 - multiplier[0] is at least 97 and
+      // Clip3(128 - multiplier[0], -32, 95) always evaluates to 95, making the
+      // Clip3() call unnecessary.
+ static constexpr int kMultiplier[2] = {0, 95};
+ multiplier = kMultiplier[i];
+ assert(
+ i == 0 ||
+ Clip3((1 << kSgrProjPrecisionBits) -
+ (*reference_unit_info)[plane].sgr_proj_info.multiplier[0],
+ multiplier_min, multiplier_max) == kMultiplier[1]);
+ }
+ loop_restoration_info_[plane][unit_id].sgr_proj_info.multiplier[i] =
+ multiplier;
+ (*reference_unit_info)[plane].sgr_proj_info.multiplier[i] = multiplier;
+ }
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
+#define LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/dsp/common.h"
+#include "src/symbol_decoder_context.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+struct LoopRestorationUnitInfo {
+ int row_start;
+ int row_end;
+ int column_start;
+ int column_end;
+};
+
+class LoopRestorationInfo {
+ public:
+ LoopRestorationInfo() = default;
+
+  // Not copyable or movable.
+ LoopRestorationInfo(const LoopRestorationInfo&) = delete;
+ LoopRestorationInfo& operator=(const LoopRestorationInfo&) = delete;
+ LoopRestorationInfo(LoopRestorationInfo&&) = delete;
+ LoopRestorationInfo& operator=(LoopRestorationInfo&&) = delete;
+
+ bool Reset(const LoopRestoration* loop_restoration, uint32_t width,
+ uint32_t height, int8_t subsampling_x, int8_t subsampling_y,
+ bool is_monochrome);
+ // Populates the |unit_info| for the super block at |row4x4|, |column4x4|.
+ // Returns true on success, false otherwise.
+ bool PopulateUnitInfoForSuperBlock(Plane plane, BlockSize block_size,
+ bool is_superres_scaled,
+ uint8_t superres_scale_denominator,
+ int row4x4, int column4x4,
+ LoopRestorationUnitInfo* unit_info) const;
+ void ReadUnitCoefficients(EntropyDecoder* reader,
+ SymbolDecoderContext* symbol_decoder_context,
+ Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>*
+ reference_unit_info); // 5.11.58.
+ void ReadWienerInfo(
+ EntropyDecoder* reader, Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* reference_unit_info);
+ void ReadSgrProjInfo(
+ EntropyDecoder* reader, Plane plane, int unit_id,
+ std::array<RestorationUnitInfo, kMaxPlanes>* reference_unit_info);
+
+ // Getters.
+ const RestorationUnitInfo* loop_restoration_info(Plane plane,
+ int unit_id) const {
+ return &loop_restoration_info_[plane][unit_id];
+ }
+
+ int num_horizontal_units(Plane plane) const {
+ return num_horizontal_units_[plane];
+ }
+ int num_vertical_units(Plane plane) const {
+ return num_vertical_units_[plane];
+ }
+ int num_units(Plane plane) const { return num_units_[plane]; }
+
+ private:
+ // If plane_needs_filtering_[plane] is true, loop_restoration_info_[plane]
+ // points to an array of num_units_[plane] elements.
+ RestorationUnitInfo* loop_restoration_info_[kMaxPlanes];
+ // Owns the memory that loop_restoration_info_[plane] points to.
+ DynamicBuffer<RestorationUnitInfo> loop_restoration_info_buffer_;
+ bool plane_needs_filtering_[kMaxPlanes];
+ const LoopRestoration* loop_restoration_;
+ int8_t subsampling_x_;
+ int8_t subsampling_y_;
+ int num_horizontal_units_[kMaxPlanes];
+ int num_vertical_units_[kMaxPlanes];
+ int num_units_[kMaxPlanes];
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/motion_vector.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// Entry at index i is computed as:
+// Clip3(std::max(kBlockWidthPixels[i], kBlockHeightPixels[i]), 16, 112).
+constexpr int kWarpValidThreshold[kMaxBlockSizes] = {
+ 16, 16, 16, 16, 16, 16, 32, 16, 16, 16, 32,
+ 64, 32, 32, 32, 64, 64, 64, 64, 112, 112, 112};
+
+// 7.10.2.10.
+void LowerMvPrecision(const ObuFrameHeader& frame_header,
+ MotionVector* const mvs) {
+ if (frame_header.allow_high_precision_mv) return;
+ if (frame_header.force_integer_mv != 0) {
+ for (auto& mv : mvs->mv) {
+ // The next line is equivalent to:
+ // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+ // const int sign = mv >> 15;
+ // mv = ApplySign(value, sign);
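+      // For example, mv == -5: (-5 + 3 - (-1)) & ~7 == -8, which matches
+      // ApplySign((5 + 3) & ~7, -1) == -8.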
+ mv = (mv + 3 - (mv >> 15)) & ~7;
+ }
+ } else {
+ for (auto& mv : mvs->mv) {
+ // The next line is equivalent to:
+ // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
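+      // For example, mv == -5 becomes -4 and mv == 5 becomes 4.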
+ mv = (mv - (mv >> 15)) & ~1;
+ }
+ }
+}
+
+// 7.10.2.1.
+void SetupGlobalMv(const Tile::Block& block, int index,
+ MotionVector* const mv) {
+ const BlockParameters& bp = *block.bp;
+ const ObuFrameHeader& frame_header = block.tile.frame_header();
+ ReferenceFrameType reference_type = bp.reference_frame[index];
+ const auto& gm = frame_header.global_motion[reference_type];
+ if (reference_type == kReferenceFrameIntra ||
+ gm.type == kGlobalMotionTransformationTypeIdentity) {
+ mv->mv32 = 0;
+ return;
+ }
+ if (gm.type == kGlobalMotionTransformationTypeTranslation) {
+ for (int i = 0; i < 2; ++i) {
+ mv->mv[i] = gm.params[i] >> (kWarpedModelPrecisionBits - 3);
+ }
+ LowerMvPrecision(frame_header, mv);
+ return;
+ }
+ const int x = MultiplyBy4(block.column4x4) + DivideBy2(block.width) - 1;
+ const int y = MultiplyBy4(block.row4x4) + DivideBy2(block.height) - 1;
+ const int xc = (gm.params[2] - (1 << kWarpedModelPrecisionBits)) * x +
+ gm.params[3] * y + gm.params[0];
+ const int yc = gm.params[4] * x +
+ (gm.params[5] - (1 << kWarpedModelPrecisionBits)) * y +
+ gm.params[1];
+ if (frame_header.allow_high_precision_mv) {
+ mv->mv[0] = RightShiftWithRoundingSigned(yc, kWarpedModelPrecisionBits - 3);
+ mv->mv[1] = RightShiftWithRoundingSigned(xc, kWarpedModelPrecisionBits - 3);
+ } else {
+ mv->mv[0] = MultiplyBy2(
+ RightShiftWithRoundingSigned(yc, kWarpedModelPrecisionBits - 2));
+ mv->mv[1] = MultiplyBy2(
+ RightShiftWithRoundingSigned(xc, kWarpedModelPrecisionBits - 2));
+ LowerMvPrecision(frame_header, mv);
+ }
+}
+
+constexpr BitMaskSet kPredictionModeNewMvMask(kPredictionModeNewMv,
+ kPredictionModeNewNewMv,
+ kPredictionModeNearNewMv,
+ kPredictionModeNewNearMv,
+ kPredictionModeNearestNewMv,
+ kPredictionModeNewNearestMv);
+
+// 7.10.2.8.
+void SearchStack(const Tile::Block& block, const BlockParameters& mv_bp,
+ int index, int weight, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const BlockParameters& bp = *block.bp;
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motion =
+ block.tile.frame_header().global_motion;
+ PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+ MotionVector candidate_mv;
+ // LowerMvPrecision() is not necessary, since the values in
+ // |prediction_parameters.global_mv| and |mv_bp.mv| were generated by it.
+ const auto global_motion_type = global_motion[bp.reference_frame[0]].type;
+ if (IsGlobalMvBlock(mv_bp, global_motion_type)) {
+ candidate_mv = prediction_parameters.global_mv[0];
+ } else {
+ candidate_mv = mv_bp.mv.mv[index];
+ }
+ *found_new_mv |= kPredictionModeNewMvMask.Contains(mv_bp.y_mode);
+ *found_match = true;
+ MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+ const int num_found = *num_mv_found;
+ const auto result = std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+ [&candidate_mv](const MotionVector& ref_mv) {
+ return ref_mv.mv32 == candidate_mv.mv32;
+ });
+ if (result != ref_mv_stack + num_found) {
+ prediction_parameters.IncreaseWeight(std::distance(ref_mv_stack, result),
+ weight);
+ return;
+ }
+ if (num_found >= kMaxRefMvStackSize) return;
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters.SetWeightIndexStackEntry(num_found, weight);
+ ++*num_mv_found;
+}
+
+// 7.10.2.9.
+void CompoundSearchStack(const Tile::Block& block, const BlockParameters& mv_bp,
+ int weight, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const BlockParameters& bp = *block.bp;
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motion =
+ block.tile.frame_header().global_motion;
+ PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+ // LowerMvPrecision() is not necessary, since the values in
+ // |prediction_parameters.global_mv| and |mv_bp.mv| were generated by it.
+ CompoundMotionVector candidate_mv = mv_bp.mv;
+ for (int i = 0; i < 2; ++i) {
+ const auto global_motion_type = global_motion[bp.reference_frame[i]].type;
+ if (IsGlobalMvBlock(mv_bp, global_motion_type)) {
+ candidate_mv.mv[i] = prediction_parameters.global_mv[i];
+ }
+ }
+ *found_new_mv |= kPredictionModeNewMvMask.Contains(mv_bp.y_mode);
+ *found_match = true;
+ CompoundMotionVector* const compound_ref_mv_stack =
+ prediction_parameters.compound_ref_mv_stack;
+ const int num_found = *num_mv_found;
+ const auto result =
+ std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found,
+ [&candidate_mv](const CompoundMotionVector& ref_mv) {
+ return ref_mv.mv64 == candidate_mv.mv64;
+ });
+ if (result != compound_ref_mv_stack + num_found) {
+ prediction_parameters.IncreaseWeight(
+ std::distance(compound_ref_mv_stack, result), weight);
+ return;
+ }
+ if (num_found >= kMaxRefMvStackSize) return;
+ compound_ref_mv_stack[num_found].mv64 = candidate_mv.mv64;
+ prediction_parameters.SetWeightIndexStackEntry(num_found, weight);
+ ++*num_mv_found;
+}
+
+// 7.10.2.7.
+void AddReferenceMvCandidate(const Tile::Block& block,
+ const BlockParameters& mv_bp, bool is_compound,
+ int weight, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ if (!mv_bp.is_inter) return;
+ const BlockParameters& bp = *block.bp;
+ if (is_compound) {
+ if (mv_bp.reference_frame[0] == bp.reference_frame[0] &&
+ mv_bp.reference_frame[1] == bp.reference_frame[1]) {
+ CompoundSearchStack(block, mv_bp, weight, found_new_mv, found_match,
+ num_mv_found);
+ }
+ return;
+ }
+ for (int i = 0; i < 2; ++i) {
+ if (mv_bp.reference_frame[i] == bp.reference_frame[0]) {
+ SearchStack(block, mv_bp, i, weight, found_new_mv, found_match,
+ num_mv_found);
+ }
+ }
+}
+
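+// Returns the minimum step size, in 4x4 block units, used by ScanRow() and
+// ScanColumn() when stepping through neighboring blocks.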
+int GetMinimumStep(int block_width_or_height4x4, int delta_row_or_column) {
+ assert(delta_row_or_column < 0);
+ if (block_width_or_height4x4 >= 16) return 4;
+ if (delta_row_or_column < -1) return 2;
+ return 0;
+}
+
+// 7.10.2.2.
+void ScanRow(const Tile::Block& block, int mv_column, int delta_row,
+ bool is_compound, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const int mv_row = block.row4x4 + delta_row;
+ const Tile& tile = block.tile;
+ if (!tile.IsTopInside(mv_row + 1)) return;
+ const int width4x4 = block.width4x4;
+ const int min_step = GetMinimumStep(width4x4, delta_row);
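+  // Each candidate found below is weighted by twice its step: the number of
+  // 4x4 columns it spans, clamped between |min_step| and the block's width in
+  // 4x4 units.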
+ BlockParameters** bps = tile.BlockParametersAddress(mv_row, mv_column);
+ BlockParameters** const end_bps =
+ bps + std::min({static_cast<int>(width4x4),
+ tile.frame_header().columns4x4 - block.column4x4, 16});
+ do {
+ const BlockParameters& mv_bp = **bps;
+ const int step = std::max(
+ std::min(width4x4, static_cast<int>(kNum4x4BlocksWide[mv_bp.size])),
+ min_step);
+ AddReferenceMvCandidate(block, mv_bp, is_compound, MultiplyBy2(step),
+ found_new_mv, found_match, num_mv_found);
+ bps += step;
+ } while (bps < end_bps);
+}
+
+// 7.10.2.3.
+void ScanColumn(const Tile::Block& block, int mv_row, int delta_column,
+ bool is_compound, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const int mv_column = block.column4x4 + delta_column;
+ const Tile& tile = block.tile;
+ if (!tile.IsLeftInside(mv_column + 1)) return;
+ const int height4x4 = block.height4x4;
+ const int min_step = GetMinimumStep(height4x4, delta_column);
+ const ptrdiff_t stride = tile.BlockParametersStride();
+ BlockParameters** bps = tile.BlockParametersAddress(mv_row, mv_column);
+ BlockParameters** const end_bps =
+ bps + stride * std::min({static_cast<int>(height4x4),
+ tile.frame_header().rows4x4 - block.row4x4, 16});
+ do {
+ const BlockParameters& mv_bp = **bps;
+ const int step = std::max(
+ std::min(height4x4, static_cast<int>(kNum4x4BlocksHigh[mv_bp.size])),
+ min_step);
+ AddReferenceMvCandidate(block, mv_bp, is_compound, MultiplyBy2(step),
+ found_new_mv, found_match, num_mv_found);
+ bps += step * stride;
+ } while (bps < end_bps);
+}
+
+// 7.10.2.4.
+void ScanPoint(const Tile::Block& block, int delta_row, int delta_column,
+ bool is_compound, bool* const found_new_mv,
+ bool* const found_match, int* const num_mv_found) {
+ const int mv_row = block.row4x4 + delta_row;
+ const int mv_column = block.column4x4 + delta_column;
+ const Tile& tile = block.tile;
+ if (!tile.IsInside(mv_row, mv_column) ||
+ !tile.HasParameters(mv_row, mv_column)) {
+ return;
+ }
+ const BlockParameters& mv_bp = tile.Parameters(mv_row, mv_column);
+ if (mv_bp.reference_frame[0] == kReferenceFrameNone) return;
+ AddReferenceMvCandidate(block, mv_bp, is_compound, 4, found_new_mv,
+ found_match, num_mv_found);
+}
+
+// 7.10.2.6.
+void AddTemporalReferenceMvCandidate(
+ const ObuFrameHeader& frame_header, const int reference_offsets[2],
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets, int count, bool is_compound,
+ int* const zero_mv_context, int* const num_mv_found,
+ PredictionParameters* const prediction_parameters) {
+ const int mv_projection_function_index =
+ frame_header.allow_high_precision_mv ? 2 : frame_header.force_integer_mv;
+ const MotionVector* const global_mv = prediction_parameters->global_mv;
+ if (is_compound) {
+ alignas(kMaxAlignment)
+ CompoundMotionVector candidate_mvs[kMaxTemporalMvCandidatesWithPadding];
+ const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+ dsp.mv_projection_compound[mv_projection_function_index](
+ temporal_mvs, temporal_reference_offsets, reference_offsets, count,
+ candidate_mvs);
+ if (*zero_mv_context == -1) {
+ int max_difference =
+ std::max(std::abs(candidate_mvs[0].mv[0].mv[0] - global_mv[0].mv[0]),
+ std::abs(candidate_mvs[0].mv[0].mv[1] - global_mv[0].mv[1]));
+ max_difference =
+ std::max(max_difference,
+ std::abs(candidate_mvs[0].mv[1].mv[0] - global_mv[1].mv[0]));
+ max_difference =
+ std::max(max_difference,
+ std::abs(candidate_mvs[0].mv[1].mv[1] - global_mv[1].mv[1]));
+ *zero_mv_context = static_cast<int>(max_difference >= 16);
+ }
+ CompoundMotionVector* const compound_ref_mv_stack =
+ prediction_parameters->compound_ref_mv_stack;
+ int num_found = *num_mv_found;
+ int index = 0;
+ do {
+ const CompoundMotionVector& candidate_mv = candidate_mvs[index];
+ const auto result =
+ std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found,
+ [&candidate_mv](const CompoundMotionVector& ref_mv) {
+ return ref_mv.mv64 == candidate_mv.mv64;
+ });
+ if (result != compound_ref_mv_stack + num_found) {
+ prediction_parameters->IncreaseWeight(
+ std::distance(compound_ref_mv_stack, result), 2);
+ continue;
+ }
+ if (num_found >= kMaxRefMvStackSize) continue;
+ compound_ref_mv_stack[num_found].mv64 = candidate_mv.mv64;
+ prediction_parameters->SetWeightIndexStackEntry(num_found, 2);
+ ++num_found;
+ } while (++index < count);
+ *num_mv_found = num_found;
+ return;
+ }
+ MotionVector* const ref_mv_stack = prediction_parameters->ref_mv_stack;
+ if (reference_offsets[0] == 0) {
+ if (*zero_mv_context == -1) {
+ const int max_difference =
+ std::max(std::abs(global_mv[0].mv[0]), std::abs(global_mv[0].mv[1]));
+ *zero_mv_context = static_cast<int>(max_difference >= 16);
+ }
+ const MotionVector candidate_mv = {};
+ const int num_found = *num_mv_found;
+ const auto result =
+ std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+ [&candidate_mv](const MotionVector& ref_mv) {
+ return ref_mv.mv32 == candidate_mv.mv32;
+ });
+ if (result != ref_mv_stack + num_found) {
+ prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result),
+ 2 * count);
+ return;
+ }
+ if (num_found >= kMaxRefMvStackSize) return;
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters->SetWeightIndexStackEntry(num_found, 2 * count);
+ ++*num_mv_found;
+ return;
+ }
+ alignas(kMaxAlignment)
+ MotionVector candidate_mvs[kMaxTemporalMvCandidatesWithPadding];
+ const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+ dsp.mv_projection_single[mv_projection_function_index](
+ temporal_mvs, temporal_reference_offsets, reference_offsets[0], count,
+ candidate_mvs);
+ if (*zero_mv_context == -1) {
+ const int max_difference =
+ std::max(std::abs(candidate_mvs[0].mv[0] - global_mv[0].mv[0]),
+ std::abs(candidate_mvs[0].mv[1] - global_mv[0].mv[1]));
+ *zero_mv_context = static_cast<int>(max_difference >= 16);
+ }
+ int num_found = *num_mv_found;
+ int index = 0;
+ do {
+ const MotionVector& candidate_mv = candidate_mvs[index];
+ const auto result =
+ std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+ [&candidate_mv](const MotionVector& ref_mv) {
+ return ref_mv.mv32 == candidate_mv.mv32;
+ });
+ if (result != ref_mv_stack + num_found) {
+ prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result),
+ 2);
+ continue;
+ }
+ if (num_found >= kMaxRefMvStackSize) continue;
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters->SetWeightIndexStackEntry(num_found, 2);
+ ++num_found;
+ } while (++index < count);
+ *num_mv_found = num_found;
+}
+
+// Part of 7.10.2.5.
+bool IsWithinTheSame64x64Block(const Tile::Block& block, int delta_row,
+ int delta_column) {
+ const int row = (block.row4x4 & 15) + delta_row;
+ const int column = (block.column4x4 & 15) + delta_column;
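+  // (row4x4 & 15, column4x4 & 15) is the block's position within its 64x64
+  // superblock, which spans 16 units of 4x4.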
+  // |block.height4x4| is at least 2 for all elements in |kTemporalScanMask|,
+  // so |row| is always non-negative.
+ assert(row >= 0);
+ return row < 16 && column >= 0 && column < 16;
+}
+
+constexpr BitMaskSet kTemporalScanMask(kBlock8x8, kBlock8x16, kBlock8x32,
+ kBlock16x8, kBlock16x16, kBlock16x32,
+ kBlock32x8, kBlock32x16, kBlock32x32);
+
+// 7.10.2.5.
+void TemporalScan(const Tile::Block& block, bool is_compound,
+ int* const zero_mv_context, int* const num_mv_found) {
+ const int step_w = (block.width4x4 >= 16) ? 4 : 2;
+ const int step_h = (block.height4x4 >= 16) ? 4 : 2;
+ const int row_start = block.row4x4 | 1;
+ const int column_start = block.column4x4 | 1;
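+  // The temporal motion field is stored on an 8x8 grid, so starting at odd
+  // 4x4 coordinates means (mv_row >> 1, mv_column >> 1) indexes the 8x8 cell
+  // covering the block.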
+ const int row_end =
+ row_start + std::min(static_cast<int>(block.height4x4), 16);
+ const int column_end =
+ column_start + std::min(static_cast<int>(block.width4x4), 16);
+ const Tile& tile = block.tile;
+ const TemporalMotionField& motion_field = tile.motion_field();
+ const int stride = motion_field.mv.columns();
+ const MotionVector* motion_field_mv = motion_field.mv[0];
+ const int8_t* motion_field_reference_offset =
+ motion_field.reference_offset[0];
+ alignas(kMaxAlignment)
+ MotionVector temporal_mvs[kMaxTemporalMvCandidatesWithPadding];
+ int8_t temporal_reference_offsets[kMaxTemporalMvCandidatesWithPadding];
+ int count = 0;
+ int offset = stride * (row_start >> 1);
+ int mv_row = row_start;
+ do {
+ int mv_column = column_start;
+ do {
+ // Both horizontal and vertical offsets are positive. Only bottom and
+ // right boundaries need to be checked.
+ if (tile.IsBottomRightInside(mv_row, mv_column)) {
+ const int x8 = mv_column >> 1;
+ const MotionVector temporal_mv = motion_field_mv[offset + x8];
+ if (temporal_mv.mv[0] == kInvalidMvValue) {
+ if (mv_row == row_start && mv_column == column_start) {
+ *zero_mv_context = 1;
+ }
+ } else {
+ temporal_mvs[count] = temporal_mv;
+ temporal_reference_offsets[count++] =
+ motion_field_reference_offset[offset + x8];
+ }
+ }
+ mv_column += step_w;
+ } while (mv_column < column_end);
+ offset += stride * step_h >> 1;
+ mv_row += step_h;
+ } while (mv_row < row_end);
+ if (kTemporalScanMask.Contains(block.size)) {
+ const int temporal_sample_positions[3][2] = {
+ {block.height4x4, -2},
+ {block.height4x4, block.width4x4},
+ {block.height4x4 - 2, block.width4x4}};
+ // Getting the address of an element in Array2D is slow. Precalculate the
+ // offsets.
+ int temporal_sample_offsets[3];
+ temporal_sample_offsets[0] = stride * ((row_start + block.height4x4) >> 1) +
+ ((column_start - 2) >> 1);
+ temporal_sample_offsets[1] =
+ temporal_sample_offsets[0] + ((block.width4x4 + 2) >> 1);
+ temporal_sample_offsets[2] = temporal_sample_offsets[1] - stride;
+ for (int i = 0; i < 3; i++) {
+ const int row = temporal_sample_positions[i][0];
+ const int column = temporal_sample_positions[i][1];
+ if (!IsWithinTheSame64x64Block(block, row, column)) continue;
+ const int mv_row = row_start + row;
+ const int mv_column = column_start + column;
+ // IsWithinTheSame64x64Block() guarantees the reference block is inside
+ // the top and left boundary.
+ if (!tile.IsBottomRightInside(mv_row, mv_column)) continue;
+ const MotionVector temporal_mv =
+ motion_field_mv[temporal_sample_offsets[i]];
+ if (temporal_mv.mv[0] != kInvalidMvValue) {
+ temporal_mvs[count] = temporal_mv;
+ temporal_reference_offsets[count++] =
+ motion_field_reference_offset[temporal_sample_offsets[i]];
+ }
+ }
+ }
+ if (count != 0) {
+ BlockParameters* const bp = block.bp;
+ int reference_offsets[2];
+ const int offset_0 = tile.current_frame()
+ .reference_info()
+ ->relative_distance_to[bp->reference_frame[0]];
+ reference_offsets[0] =
+ Clip3(offset_0, -kMaxFrameDistance, kMaxFrameDistance);
+ if (is_compound) {
+ const int offset_1 = tile.current_frame()
+ .reference_info()
+ ->relative_distance_to[bp->reference_frame[1]];
+ reference_offsets[1] =
+ Clip3(offset_1, -kMaxFrameDistance, kMaxFrameDistance);
+ // Pad so that SIMD implementations won't read uninitialized memory.
+ if ((count & 1) != 0) {
+ temporal_mvs[count].mv32 = 0;
+ temporal_reference_offsets[count] = 0;
+ }
+ } else {
+ // Pad so that SIMD implementations won't read uninitialized memory.
+ for (int i = count; i < ((count + 3) & ~3); ++i) {
+ temporal_mvs[i].mv32 = 0;
+ temporal_reference_offsets[i] = 0;
+ }
+ }
+ AddTemporalReferenceMvCandidate(
+ tile.frame_header(), reference_offsets, temporal_mvs,
+ temporal_reference_offsets, count, is_compound, zero_mv_context,
+ num_mv_found, &(*bp->prediction_parameters));
+ }
+}
+
+// Part of 7.10.2.13.
+void AddExtraCompoundMvCandidate(const Tile::Block& block, int mv_row,
+ int mv_column, int* const ref_id_count,
+ MotionVector ref_id[2][2],
+ int* const ref_diff_count,
+ MotionVector ref_diff[2][2]) {
+ const auto& bp = block.tile.Parameters(mv_row, mv_column);
+ const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias =
+ block.tile.reference_frame_sign_bias();
+ for (int i = 0; i < 2; ++i) {
+ const ReferenceFrameType candidate_reference_frame = bp.reference_frame[i];
+ if (candidate_reference_frame <= kReferenceFrameIntra) continue;
+ for (int j = 0; j < 2; ++j) {
+ MotionVector candidate_mv = bp.mv.mv[i];
+ const ReferenceFrameType block_reference_frame =
+ block.bp->reference_frame[j];
+ if (candidate_reference_frame == block_reference_frame &&
+ ref_id_count[j] < 2) {
+ ref_id[j][ref_id_count[j]] = candidate_mv;
+ ++ref_id_count[j];
+ } else if (ref_diff_count[j] < 2) {
+ if (reference_frame_sign_bias[candidate_reference_frame] !=
+ reference_frame_sign_bias[block_reference_frame]) {
+ candidate_mv.mv[0] *= -1;
+ candidate_mv.mv[1] *= -1;
+ }
+ ref_diff[j][ref_diff_count[j]] = candidate_mv;
+ ++ref_diff_count[j];
+ }
+ }
+ }
+}
+
+// Part of 7.10.2.13.
+void AddExtraSingleMvCandidate(const Tile::Block& block, int mv_row,
+ int mv_column, int* const num_mv_found) {
+ const auto& bp = block.tile.Parameters(mv_row, mv_column);
+ const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias =
+ block.tile.reference_frame_sign_bias();
+ const ReferenceFrameType block_reference_frame = block.bp->reference_frame[0];
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+ int num_found = *num_mv_found;
+ for (int i = 0; i < 2; ++i) {
+ const ReferenceFrameType candidate_reference_frame = bp.reference_frame[i];
+ if (candidate_reference_frame <= kReferenceFrameIntra) continue;
+ MotionVector candidate_mv = bp.mv.mv[i];
+ if (reference_frame_sign_bias[candidate_reference_frame] !=
+ reference_frame_sign_bias[block_reference_frame]) {
+ candidate_mv.mv[0] *= -1;
+ candidate_mv.mv[1] *= -1;
+ }
+ assert(num_found <= 2);
+ if ((num_found != 0 && ref_mv_stack[0].mv32 == candidate_mv.mv32) ||
+ (num_found == 2 && ref_mv_stack[1].mv32 == candidate_mv.mv32)) {
+ continue;
+ }
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters.SetWeightIndexStackEntry(num_found, 0);
+ ++num_found;
+ }
+ *num_mv_found = num_found;
+}
+
+// 7.10.2.12.
+void ExtraSearch(const Tile::Block& block, bool is_compound,
+ int* const num_mv_found) {
+ const Tile& tile = block.tile;
+ const int num4x4 = std::min({static_cast<int>(block.width4x4),
+ tile.frame_header().columns4x4 - block.column4x4,
+ static_cast<int>(block.height4x4),
+ tile.frame_header().rows4x4 - block.row4x4, 16});
+ int ref_id_count[2] = {};
+ MotionVector ref_id[2][2] = {};
+ int ref_diff_count[2] = {};
+ MotionVector ref_diff[2][2] = {};
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ for (int pass = 0; pass < 2 && *num_mv_found < 2; ++pass) {
+ for (int i = 0; i < num4x4;) {
+ const int mv_row = block.row4x4 + ((pass == 0) ? -1 : i);
+ const int mv_column = block.column4x4 + ((pass == 0) ? i : -1);
+ if (!tile.IsTopLeftInside(mv_row + 1, mv_column + 1)) break;
+ if (is_compound) {
+ AddExtraCompoundMvCandidate(block, mv_row, mv_column, ref_id_count,
+ ref_id, ref_diff_count, ref_diff);
+ } else {
+ AddExtraSingleMvCandidate(block, mv_row, mv_column, num_mv_found);
+ if (*num_mv_found >= 2) break;
+ }
+ const auto& bp = tile.Parameters(mv_row, mv_column);
+ i +=
+ (pass == 0) ? kNum4x4BlocksWide[bp.size] : kNum4x4BlocksHigh[bp.size];
+ }
+ }
+ if (is_compound) {
+ // Merge compound mode extra search into mv stack.
+ CompoundMotionVector* const compound_ref_mv_stack =
+ prediction_parameters.compound_ref_mv_stack;
+ CompoundMotionVector combined_mvs[2] = {};
+ for (int i = 0; i < 2; ++i) {
+ int count = 0;
+ assert(ref_id_count[i] <= 2);
+ for (int j = 0; j < ref_id_count[i]; ++j, ++count) {
+ combined_mvs[count].mv[i] = ref_id[i][j];
+ }
+ for (int j = 0; j < ref_diff_count[i] && count < 2; ++j, ++count) {
+ combined_mvs[count].mv[i] = ref_diff[i][j];
+ }
+ for (; count < 2; ++count) {
+ combined_mvs[count].mv[i] = prediction_parameters.global_mv[i];
+ }
+ }
+ if (*num_mv_found == 1) {
+ if (combined_mvs[0].mv64 == compound_ref_mv_stack[0].mv64) {
+ compound_ref_mv_stack[1].mv64 = combined_mvs[1].mv64;
+ } else {
+ compound_ref_mv_stack[1].mv64 = combined_mvs[0].mv64;
+ }
+ prediction_parameters.SetWeightIndexStackEntry(1, 0);
+ } else {
+ assert(*num_mv_found == 0);
+ for (int i = 0; i < 2; ++i) {
+ compound_ref_mv_stack[i].mv64 = combined_mvs[i].mv64;
+ prediction_parameters.SetWeightIndexStackEntry(i, 0);
+ }
+ }
+ *num_mv_found = 2;
+ } else {
+    // Single prediction mode.
+ MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+ for (int i = *num_mv_found; i < 2; ++i) {
+ ref_mv_stack[i] = prediction_parameters.global_mv[0];
+ prediction_parameters.SetWeightIndexStackEntry(i, 0);
+ }
+ }
+}
+
+void DescendingOrderTwo(int* const a, int* const b) {
+ if (*a < *b) {
+ std::swap(*a, *b);
+ }
+}
+
+// Comparator used for sorting candidate motion vectors (via their
+// |weight_index_stack| entries) in descending order of weight (as specified
+// in 7.10.2.11).
+bool CompareCandidateMotionVectors(const int16_t& lhs, const int16_t& rhs) {
+ return lhs > rhs;
+}
+
+void SortWeightIndexStack(const int size, const int sort_to_n,
+ int16_t* const weight_index_stack) {
+ if (size <= 1) return;
+ if (size <= 3) {
+    // Specialize small sizes to speed up the sort.
+ int weight_index_0 = weight_index_stack[0];
+ int weight_index_1 = weight_index_stack[1];
+ DescendingOrderTwo(&weight_index_0, &weight_index_1);
+ if (size == 3) {
+ int weight_index_2 = weight_index_stack[2];
+ DescendingOrderTwo(&weight_index_1, &weight_index_2);
+ DescendingOrderTwo(&weight_index_0, &weight_index_1);
+ weight_index_stack[2] = weight_index_2;
+ }
+ weight_index_stack[0] = weight_index_0;
+ weight_index_stack[1] = weight_index_1;
+ return;
+ }
+ if (sort_to_n == 1) {
+ // std::max_element() is not efficient. Find the max element in a loop.
+ int16_t max_element = weight_index_stack[0];
+ int i = 1;
+ do {
+ max_element = std::max(max_element, weight_index_stack[i]);
+ } while (++i < size);
+ weight_index_stack[0] = max_element;
+ return;
+ }
+ std::partial_sort(&weight_index_stack[0], &weight_index_stack[sort_to_n],
+ &weight_index_stack[size], CompareCandidateMotionVectors);
+}
+
+// 7.10.2.14 (part 2).
+void ComputeContexts(bool found_new_mv, int nearest_matches, int total_matches,
+ int* new_mv_context, int* reference_mv_context) {
+ switch (nearest_matches) {
+ case 0:
+ *new_mv_context = std::min(total_matches, 1);
+ *reference_mv_context = total_matches;
+ break;
+ case 1:
+ *new_mv_context = 3 - static_cast<int>(found_new_mv);
+ *reference_mv_context = 2 + total_matches;
+ break;
+ default:
+ *new_mv_context = 5 - static_cast<int>(found_new_mv);
+ *reference_mv_context = 5;
+ break;
+ }
+}
+
+// 7.10.4.2.
+void AddSample(const Tile::Block& block, int delta_row, int delta_column,
+ int* const num_warp_samples, int* const num_samples_scanned,
+ int candidates[kMaxLeastSquaresSamples][4]) {
+ if (*num_samples_scanned >= kMaxLeastSquaresSamples) return;
+ const int mv_row = block.row4x4 + delta_row;
+ const int mv_column = block.column4x4 + delta_column;
+ const Tile& tile = block.tile;
+ if (!tile.IsInside(mv_row, mv_column) ||
+ !tile.HasParameters(mv_row, mv_column)) {
+ return;
+ }
+ const BlockParameters& bp = *block.bp;
+ const BlockParameters& mv_bp = tile.Parameters(mv_row, mv_column);
+ if (mv_bp.reference_frame[0] != bp.reference_frame[0] ||
+ mv_bp.reference_frame[1] != kReferenceFrameNone) {
+ return;
+ }
+ ++*num_samples_scanned;
+ const int candidate_height4x4 = kNum4x4BlocksHigh[mv_bp.size];
+ const int candidate_row = mv_row & ~(candidate_height4x4 - 1);
+ const int candidate_width4x4 = kNum4x4BlocksWide[mv_bp.size];
+ const int candidate_column = mv_column & ~(candidate_width4x4 - 1);
+ const BlockParameters& candidate_bp =
+ tile.Parameters(candidate_row, candidate_column);
+ const int mv_diff_row =
+ std::abs(candidate_bp.mv.mv[0].mv[0] - bp.mv.mv[0].mv[0]);
+ const int mv_diff_column =
+ std::abs(candidate_bp.mv.mv[0].mv[1] - bp.mv.mv[0].mv[1]);
+ const bool is_valid =
+ mv_diff_row + mv_diff_column <= kWarpValidThreshold[block.size];
+ if (!is_valid && *num_samples_scanned > 1) {
+ return;
+ }
+ const int mid_y =
+ MultiplyBy4(candidate_row) + MultiplyBy2(candidate_height4x4) - 1;
+ const int mid_x =
+ MultiplyBy4(candidate_column) + MultiplyBy2(candidate_width4x4) - 1;
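+  // (mid_x, mid_y) is the center pixel of the candidate block. Samples store
+  // the source position and the motion-compensated position in 1/8 pixel
+  // units, matching the motion vector precision.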
+ candidates[*num_warp_samples][0] = MultiplyBy8(mid_y);
+ candidates[*num_warp_samples][1] = MultiplyBy8(mid_x);
+ candidates[*num_warp_samples][2] =
+ MultiplyBy8(mid_y) + candidate_bp.mv.mv[0].mv[0];
+ candidates[*num_warp_samples][3] =
+ MultiplyBy8(mid_x) + candidate_bp.mv.mv[0].mv[1];
+ if (is_valid) ++*num_warp_samples;
+}
+
+// 7.9.2.
+// In the spec, |dst_sign| is either 1 or -1. Here we set |dst_sign| to either 0
+// or -1 so that it can be XORed and subtracted directly in ApplySign() and
+// corresponding SIMD implementations.
+bool MotionFieldProjection(
+ const ObuFrameHeader& frame_header,
+ const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+ reference_frames,
+ ReferenceFrameType source, int reference_to_current_with_sign, int dst_sign,
+ int y8_start, int y8_end, int x8_start, int x8_end,
+ TemporalMotionField* const motion_field) {
+ const int source_index =
+ frame_header.reference_frame_index[source - kReferenceFrameLast];
+ auto* const source_frame = reference_frames[source_index].get();
+ assert(source_frame != nullptr);
+ assert(dst_sign == 0 || dst_sign == -1);
+ if (source_frame->rows4x4() != frame_header.rows4x4 ||
+ source_frame->columns4x4() != frame_header.columns4x4 ||
+ IsIntraFrame(source_frame->frame_type())) {
+ return false;
+ }
+ assert(reference_to_current_with_sign >= -kMaxFrameDistance);
+ if (reference_to_current_with_sign > kMaxFrameDistance) return true;
+ const ReferenceInfo& reference_info = *source_frame->reference_info();
+ const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+ dsp.motion_field_projection_kernel(
+ reference_info, reference_to_current_with_sign, dst_sign, y8_start,
+ y8_end, x8_start, x8_end, motion_field);
+ return true;
+}
+
+} // namespace
+
+void FindMvStack(const Tile::Block& block, bool is_compound,
+ MvContexts* const contexts) {
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ SetupGlobalMv(block, 0, &prediction_parameters.global_mv[0]);
+ if (is_compound) SetupGlobalMv(block, 1, &prediction_parameters.global_mv[1]);
+ bool found_new_mv = false;
+ bool found_row_match = false;
+ int num_mv_found = 0;
+ ScanRow(block, block.column4x4, -1, is_compound, &found_new_mv,
+ &found_row_match, &num_mv_found);
+ bool found_column_match = false;
+ ScanColumn(block, block.row4x4, -1, is_compound, &found_new_mv,
+ &found_column_match, &num_mv_found);
+ if (std::max(block.width4x4, block.height4x4) <= 16) {
+ ScanPoint(block, -1, block.width4x4, is_compound, &found_new_mv,
+ &found_row_match, &num_mv_found);
+ }
+ const int nearest_matches =
+ static_cast<int>(found_row_match) + static_cast<int>(found_column_match);
+ prediction_parameters.nearest_mv_count = num_mv_found;
+ if (block.tile.frame_header().use_ref_frame_mvs) {
+    // Initialize to an invalid value; it is set during TemporalScan() based
+    // on the first temporal mv candidate.
+ contexts->zero_mv = -1;
+ TemporalScan(block, is_compound, &contexts->zero_mv, &num_mv_found);
+ } else {
+ contexts->zero_mv = 0;
+ }
+ bool dummy_bool = false;
+ ScanPoint(block, -1, -1, is_compound, &dummy_bool, &found_row_match,
+ &num_mv_found);
+ static constexpr int deltas[2] = {-3, -5};
+ for (int i = 0; i < 2; ++i) {
+ if (i == 0 || block.height4x4 > 1) {
+ ScanRow(block, block.column4x4 | 1, deltas[i] + (block.row4x4 & 1),
+ is_compound, &dummy_bool, &found_row_match, &num_mv_found);
+ }
+ if (i == 0 || block.width4x4 > 1) {
+ ScanColumn(block, block.row4x4 | 1, deltas[i] + (block.column4x4 & 1),
+ is_compound, &dummy_bool, &found_column_match, &num_mv_found);
+ }
+ }
+ if (num_mv_found < 2) {
+ ExtraSearch(block, is_compound, &num_mv_found);
+ } else {
+    // The sort of |weight_index_stack| could be moved to Tile::AssignIntraMv()
+    // and Tile::AssignInterMv(), doing only a partial sort up to the maximum
+    // index needed there. However, the speed gain is trivial.
+    // For the intra case, only the first 1 or 2 mvs in the stack are used.
+    // For the inter case, |prediction_parameters.ref_mv_index| is at most 3.
+    // So a partial sort up to the first 4 mvs is sufficient.
+ SortWeightIndexStack(prediction_parameters.nearest_mv_count, 4,
+ prediction_parameters.weight_index_stack);
+ // When there are 4 or more nearest mvs, the other mvs will not be used.
+ if (prediction_parameters.nearest_mv_count < 4) {
+ SortWeightIndexStack(
+ num_mv_found - prediction_parameters.nearest_mv_count,
+ 4 - prediction_parameters.nearest_mv_count,
+ prediction_parameters.weight_index_stack +
+ prediction_parameters.nearest_mv_count);
+ }
+ }
+ prediction_parameters.ref_mv_count = num_mv_found;
+ const int total_matches =
+ static_cast<int>(found_row_match) + static_cast<int>(found_column_match);
+ ComputeContexts(found_new_mv, nearest_matches, total_matches,
+ &contexts->new_mv, &contexts->reference_mv);
+ // The mv stack clamping process is in Tile::AssignIntraMv() and
+ // Tile::AssignInterMv(), and only up to two mvs are clamped.
+}
+
+void FindWarpSamples(const Tile::Block& block, int* const num_warp_samples,
+ int* const num_samples_scanned,
+ int candidates[kMaxLeastSquaresSamples][4]) {
+ const Tile& tile = block.tile;
+ bool top_left = true;
+ bool top_right = true;
+ int step = 1;
+ if (block.top_available[kPlaneY]) {
+ BlockSize source_size =
+ tile.Parameters(block.row4x4 - 1, block.column4x4).size;
+ const int source_width4x4 = kNum4x4BlocksWide[source_size];
+ if (block.width4x4 <= source_width4x4) {
+ // The & here is equivalent to % since source_width4x4 is a power of two.
+ const int column_offset = -(block.column4x4 & (source_width4x4 - 1));
+ if (column_offset < 0) top_left = false;
+ if (column_offset + source_width4x4 > block.width4x4) top_right = false;
+ AddSample(block, -1, 0, num_warp_samples, num_samples_scanned,
+ candidates);
+ } else {
+ for (int i = 0;
+ i < std::min(static_cast<int>(block.width4x4),
+ tile.frame_header().columns4x4 - block.column4x4);
+ i += step) {
+ source_size =
+ tile.Parameters(block.row4x4 - 1, block.column4x4 + i).size;
+ step = std::min(static_cast<int>(block.width4x4),
+ static_cast<int>(kNum4x4BlocksWide[source_size]));
+ AddSample(block, -1, i, num_warp_samples, num_samples_scanned,
+ candidates);
+ }
+ }
+ }
+ if (block.left_available[kPlaneY]) {
+ BlockSize source_size =
+ tile.Parameters(block.row4x4, block.column4x4 - 1).size;
+ const int source_height4x4 = kNum4x4BlocksHigh[source_size];
+ if (block.height4x4 <= source_height4x4) {
+ const int row_offset = -(block.row4x4 & (source_height4x4 - 1));
+ if (row_offset < 0) top_left = false;
+ AddSample(block, 0, -1, num_warp_samples, num_samples_scanned,
+ candidates);
+ } else {
+ for (int i = 0; i < std::min(static_cast<int>(block.height4x4),
+ tile.frame_header().rows4x4 - block.row4x4);
+ i += step) {
+ source_size =
+ tile.Parameters(block.row4x4 + i, block.column4x4 - 1).size;
+ step = std::min(static_cast<int>(block.height4x4),
+ static_cast<int>(kNum4x4BlocksHigh[source_size]));
+ AddSample(block, i, -1, num_warp_samples, num_samples_scanned,
+ candidates);
+ }
+ }
+ }
+ if (top_left) {
+ AddSample(block, -1, -1, num_warp_samples, num_samples_scanned, candidates);
+ }
+ if (top_right && block.size <= kBlock64x64) {
+ AddSample(block, -1, block.width4x4, num_warp_samples, num_samples_scanned,
+ candidates);
+ }
+ if (*num_warp_samples == 0 && *num_samples_scanned > 0) *num_warp_samples = 1;
+}
+
+void SetupMotionField(
+ const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
+ const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+ reference_frames,
+ int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end,
+ TemporalMotionField* const motion_field) {
+ assert(frame_header.use_ref_frame_mvs);
+ const int y8_start = DivideBy2(row4x4_start);
+ const int y8_end = DivideBy2(std::min(row4x4_end, frame_header.rows4x4));
+ const int x8_start = DivideBy2(column4x4_start);
+ const int x8_end =
+ DivideBy2(std::min(column4x4_end, frame_header.columns4x4));
+ const int last_index = frame_header.reference_frame_index[0];
+ const ReferenceInfo& reference_info = *current_frame.reference_info();
+ if (!IsIntraFrame(reference_frames[last_index]->frame_type())) {
+ const int last_alternate_order_hint =
+ reference_frames[last_index]
+ ->reference_info()
+ ->order_hint[kReferenceFrameAlternate];
+ const int current_gold_order_hint =
+ reference_info.order_hint[kReferenceFrameGolden];
+ if (last_alternate_order_hint != current_gold_order_hint) {
+ const int reference_offset_last =
+ -reference_info.relative_distance_from[kReferenceFrameLast];
+ if (std::abs(reference_offset_last) <= kMaxFrameDistance) {
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameLast, reference_offset_last, -1,
+ y8_start, y8_end, x8_start, x8_end, motion_field);
+ }
+ }
+ }
+ int ref_stamp = 1;
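+  // |ref_stamp| limits the projections below so that, together with the
+  // kReferenceFrameLast projection above, at most three reference frames
+  // contribute to the motion field.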
+ const int reference_offset_backward =
+ reference_info.relative_distance_from[kReferenceFrameBackward];
+ if (reference_offset_backward > 0 &&
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameBackward, reference_offset_backward,
+ 0, y8_start, y8_end, x8_start, x8_end,
+ motion_field)) {
+ --ref_stamp;
+ }
+ const int reference_offset_alternate2 =
+ reference_info.relative_distance_from[kReferenceFrameAlternate2];
+ if (reference_offset_alternate2 > 0 &&
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameAlternate2,
+ reference_offset_alternate2, 0, y8_start, y8_end,
+ x8_start, x8_end, motion_field)) {
+ --ref_stamp;
+ }
+ if (ref_stamp >= 0) {
+ const int reference_offset_alternate =
+ reference_info.relative_distance_from[kReferenceFrameAlternate];
+ if (reference_offset_alternate > 0 &&
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameAlternate,
+ reference_offset_alternate, 0, y8_start, y8_end,
+ x8_start, x8_end, motion_field)) {
+ --ref_stamp;
+ }
+ }
+ if (ref_stamp >= 0) {
+ const int reference_offset_last2 =
+ -reference_info.relative_distance_from[kReferenceFrameLast2];
+ if (std::abs(reference_offset_last2) <= kMaxFrameDistance) {
+ MotionFieldProjection(frame_header, reference_frames,
+ kReferenceFrameLast2, reference_offset_last2, -1,
+ y8_start, y8_end, x8_start, x8_end, motion_field);
+ }
+ }
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_MOTION_VECTOR_H_
+#define LIBGAV1_SRC_MOTION_VECTOR_H_
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+
+#include "src/buffer_pool.h"
+#include "src/obu_parser.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+constexpr bool IsGlobalMvBlock(const BlockParameters& bp,
+ GlobalMotionTransformationType type) {
+ return (bp.y_mode == kPredictionModeGlobalMv ||
+ bp.y_mode == kPredictionModeGlobalGlobalMv) &&
+ !IsBlockDimension4(bp.size) &&
+ type > kGlobalMotionTransformationTypeTranslation;
+}
+
+// |contexts| must not be null; FindMvStack() writes the computed zero_mv,
+// new_mv, and reference_mv contexts into it.
+void FindMvStack(const Tile::Block& block, bool is_compound,
+ MvContexts* contexts); // 7.10.2
+
+void FindWarpSamples(const Tile::Block& block, int* num_warp_samples,
+ int* num_samples_scanned,
+ int candidates[kMaxLeastSquaresSamples][4]); // 7.10.4.
+
+// Section 7.9.1 in the spec. But this is done per tile instead of for the whole
+// frame.
+void SetupMotionField(
+ const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
+ const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+ reference_frames,
+ int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end,
+ TemporalMotionField* motion_field);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_MOTION_VECTOR_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/obu_parser.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_impl.h"
+#include "src/motion_vector.h"
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// 5.9.16.
+// Find the smallest value of k such that block_size << k is greater than or
+// equal to target.
+//
+// NOTE: TileLog2(block_size, target) is equal to
+// CeilLog2(ceil((double)target / block_size))
+// where the division is a floating-point number division. (This equality holds
+// even when |target| is equal to 0.) In the special case of block_size == 1,
+// TileLog2(1, target) is equal to CeilLog2(target).
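+// For example, TileLog2(64, 200) == 2: 64 << 2 == 256 >= 200, and
+// CeilLog2(ceil(200.0 / 64)) == CeilLog2(4) == 2.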
+int TileLog2(int block_size, int target) {
+ int k = 0;
+ for (; (block_size << k) < target; ++k) {
+ }
+ return k;
+}
+
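+// The 5-bit seq_level_idx packs the major level (minus
+// kMinimumMajorBitstreamLevel) in the upper 3 bits and the minor level in the
+// lower 2 bits.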
+void ParseBitStreamLevel(BitStreamLevel* const level, uint8_t level_bits) {
+ level->major = kMinimumMajorBitstreamLevel + (level_bits >> 2);
+ level->minor = level_bits & 3;
+}
+
+// This function assumes |loop_filter| is zero-initialized, so it only needs
+// to set the nonzero default values.
+void SetDefaultRefDeltas(LoopFilter* const loop_filter) {
+ loop_filter->ref_deltas[kReferenceFrameIntra] = 1;
+ loop_filter->ref_deltas[kReferenceFrameGolden] = -1;
+ loop_filter->ref_deltas[kReferenceFrameAlternate] = -1;
+ loop_filter->ref_deltas[kReferenceFrameAlternate2] = -1;
+}
+
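+// In operating_point_idc, bit k (for 0 <= k < 8) indicates that temporal
+// layer k is included, and bit 8 + j indicates that spatial layer j is
+// included.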
+bool InTemporalLayer(int operating_point_idc, int temporal_id) {
+ return ((operating_point_idc >> temporal_id) & 1) != 0;
+}
+
+bool InSpatialLayer(int operating_point_idc, int spatial_id) {
+ return ((operating_point_idc >> (spatial_id + 8)) & 1) != 0;
+}
+
+// Returns the index of the last nonzero byte in the |data| buffer of |size|
+// bytes, or -1 if there is no nonzero byte.
+int GetLastNonzeroByteIndex(const uint8_t* data, size_t size) {
+ // Scan backward for a nonzero byte.
+ if (size > INT_MAX) return -1;
+ int i = static_cast<int>(size) - 1;
+ while (i >= 0 && data[i] == 0) {
+ --i;
+ }
+ return i;
+}
+
+// A cleanup helper class that releases the frame buffer reference held in
+// |frame| in the destructor.
+class RefCountedBufferPtrCleanup {
+ public:
+ explicit RefCountedBufferPtrCleanup(RefCountedBufferPtr* frame)
+ : frame_(*frame) {}
+
+ // Not copyable or movable.
+ RefCountedBufferPtrCleanup(const RefCountedBufferPtrCleanup&) = delete;
+ RefCountedBufferPtrCleanup& operator=(const RefCountedBufferPtrCleanup&) =
+ delete;
+
+ ~RefCountedBufferPtrCleanup() { frame_ = nullptr; }
+
+ private:
+ RefCountedBufferPtr& frame_;
+};
+
+} // namespace
+
+bool ObuSequenceHeader::ParametersChanged(const ObuSequenceHeader& old) const {
+ // Note that the operating_parameters field is not compared per Section 7.5:
+ // Within a particular coded video sequence, the contents of
+ // sequence_header_obu must be bit-identical each time the sequence header
+ // appears except for the contents of operating_parameters_info.
+ return memcmp(this, &old,
+ offsetof(ObuSequenceHeader, operating_parameters)) != 0;
+}
+
+// Macros to avoid repeated error checks in the parser code.
+#define OBU_LOG_AND_RETURN_FALSE \
+ do { \
+ LIBGAV1_DLOG(ERROR, "%s:%d (%s): Not enough bits.", __FILE__, __LINE__, \
+ __func__); \
+ return false; \
+ } while (false)
+#define OBU_PARSER_FAIL \
+ do { \
+ if (scratch == -1) { \
+ OBU_LOG_AND_RETURN_FALSE; \
+ } \
+ } while (false)
+#define OBU_READ_BIT_OR_FAIL \
+ scratch = bit_reader_->ReadBit(); \
+ OBU_PARSER_FAIL
+#define OBU_READ_LITERAL_OR_FAIL(n) \
+ scratch = bit_reader_->ReadLiteral(n); \
+ OBU_PARSER_FAIL
+#define OBU_READ_UVLC_OR_FAIL(x) \
+ do { \
+ if (!bit_reader_->ReadUvlc(&(x))) { \
+ OBU_LOG_AND_RETURN_FALSE; \
+ } \
+ } while (false)
+
+bool ObuParser::ParseColorConfig(ObuSequenceHeader* sequence_header) {
+ int64_t scratch;
+ ColorConfig* const color_config = &sequence_header->color_config;
+ OBU_READ_BIT_OR_FAIL;
+ const bool high_bitdepth = scratch != 0;
+ if (sequence_header->profile == kProfile2 && high_bitdepth) {
+ OBU_READ_BIT_OR_FAIL;
+ const bool is_twelve_bit = scratch != 0;
+ color_config->bitdepth = is_twelve_bit ? 12 : 10;
+ } else {
+ color_config->bitdepth = high_bitdepth ? 10 : 8;
+ }
+ if (sequence_header->profile == kProfile1) {
+ color_config->is_monochrome = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->is_monochrome = scratch != 0;
+ }
+ OBU_READ_BIT_OR_FAIL;
+ const bool color_description_present_flag = scratch != 0;
+ if (color_description_present_flag) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ color_config->color_primary = static_cast<ColorPrimary>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(8);
+ color_config->transfer_characteristics =
+ static_cast<TransferCharacteristics>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(8);
+ color_config->matrix_coefficients =
+ static_cast<MatrixCoefficients>(scratch);
+ } else {
+ color_config->color_primary = kColorPrimaryUnspecified;
+ color_config->transfer_characteristics =
+ kTransferCharacteristicsUnspecified;
+ color_config->matrix_coefficients = kMatrixCoefficientsUnspecified;
+ }
+ if (color_config->is_monochrome) {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->color_range = static_cast<ColorRange>(scratch);
+    // Set subsampling_x and subsampling_y to 1 for monochrome. This makes it
+    // easy to support monochrome in profile 0, which requires subsampling_x
+    // and subsampling_y to be 1.
+ color_config->subsampling_x = 1;
+ color_config->subsampling_y = 1;
+ color_config->chroma_sample_position = kChromaSamplePositionUnknown;
+ } else {
+ if (color_config->color_primary == kColorPrimaryBt709 &&
+ color_config->transfer_characteristics ==
+ kTransferCharacteristicsSrgb &&
+ color_config->matrix_coefficients == kMatrixCoefficientsIdentity) {
+ color_config->color_range = kColorRangeFull;
+ color_config->subsampling_x = 0;
+ color_config->subsampling_y = 0;
+ // YUV 4:4:4 is only allowed in profile 1, or profile 2 with bit depth 12.
+ // See the table at the beginning of Section 6.4.1.
+ if (sequence_header->profile != kProfile1 &&
+ (sequence_header->profile != kProfile2 ||
+ color_config->bitdepth != 12)) {
+ LIBGAV1_DLOG(ERROR,
+ "YUV 4:4:4 is not allowed in profile %d for bitdepth %d.",
+ sequence_header->profile, color_config->bitdepth);
+ return false;
+ }
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->color_range = static_cast<ColorRange>(scratch);
+ if (sequence_header->profile == kProfile0) {
+ color_config->subsampling_x = 1;
+ color_config->subsampling_y = 1;
+ } else if (sequence_header->profile == kProfile1) {
+ color_config->subsampling_x = 0;
+ color_config->subsampling_y = 0;
+ } else {
+ if (color_config->bitdepth == 12) {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->subsampling_x = scratch;
+ if (color_config->subsampling_x == 1) {
+ OBU_READ_BIT_OR_FAIL;
+ color_config->subsampling_y = scratch;
+ } else {
+ color_config->subsampling_y = 0;
+ }
+ } else {
+ color_config->subsampling_x = 1;
+ color_config->subsampling_y = 0;
+ }
+ }
+ if (color_config->subsampling_x == 1 &&
+ color_config->subsampling_y == 1) {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ color_config->chroma_sample_position =
+ static_cast<ChromaSamplePosition>(scratch);
+ }
+ }
+ OBU_READ_BIT_OR_FAIL;
+ color_config->separate_uv_delta_q = scratch != 0;
+ }
+ if (color_config->matrix_coefficients == kMatrixCoefficientsIdentity &&
+ (color_config->subsampling_x != 0 || color_config->subsampling_y != 0)) {
+ LIBGAV1_DLOG(ERROR,
+ "matrix_coefficients is MC_IDENTITY, but subsampling_x (%d) "
+ "and subsampling_y (%d) are not both 0.",
+ color_config->subsampling_x, color_config->subsampling_y);
+ return false;
+ }
+ return true;
+}
+
+bool ObuParser::ParseTimingInfo(ObuSequenceHeader* sequence_header) {
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header->timing_info_present_flag = scratch != 0;
+ if (!sequence_header->timing_info_present_flag) return true;
+ TimingInfo* const info = &sequence_header->timing_info;
+ OBU_READ_LITERAL_OR_FAIL(32);
+ info->num_units_in_tick = static_cast<uint32_t>(scratch);
+ if (info->num_units_in_tick == 0) {
+ LIBGAV1_DLOG(ERROR, "num_units_in_tick is 0.");
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(32);
+ info->time_scale = static_cast<uint32_t>(scratch);
+ if (info->time_scale == 0) {
+ LIBGAV1_DLOG(ERROR, "time_scale is 0.");
+ return false;
+ }
+ OBU_READ_BIT_OR_FAIL;
+ info->equal_picture_interval = scratch != 0;
+ if (info->equal_picture_interval) {
+ OBU_READ_UVLC_OR_FAIL(info->num_ticks_per_picture);
+ ++info->num_ticks_per_picture;
+ }
+ return true;
+}
+
+bool ObuParser::ParseDecoderModelInfo(ObuSequenceHeader* sequence_header) {
+ if (!sequence_header->timing_info_present_flag) return true;
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header->decoder_model_info_present_flag = scratch != 0;
+ if (!sequence_header->decoder_model_info_present_flag) return true;
+ DecoderModelInfo* const info = &sequence_header->decoder_model_info;
+ OBU_READ_LITERAL_OR_FAIL(5);
+ info->encoder_decoder_buffer_delay_length = 1 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(32);
+ info->num_units_in_decoding_tick = static_cast<uint32_t>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(5);
+ info->buffer_removal_time_length = 1 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(5);
+ info->frame_presentation_time_length = 1 + scratch;
+ return true;
+}
+
+bool ObuParser::ParseOperatingParameters(ObuSequenceHeader* sequence_header,
+ int index) {
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header->decoder_model_present_for_operating_point[index] =
+ scratch != 0;
+ if (!sequence_header->decoder_model_present_for_operating_point[index]) {
+ return true;
+ }
+ OperatingParameters* const params = &sequence_header->operating_parameters;
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header->decoder_model_info.encoder_decoder_buffer_delay_length);
+ params->decoder_buffer_delay[index] = static_cast<uint32_t>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header->decoder_model_info.encoder_decoder_buffer_delay_length);
+ params->encoder_buffer_delay[index] = static_cast<uint32_t>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ params->low_delay_mode_flag[index] = scratch != 0;
+ return true;
+}
+
+bool ObuParser::ParseSequenceHeader(bool seen_frame_header) {
+ ObuSequenceHeader sequence_header = {};
+ int64_t scratch;
+ OBU_READ_LITERAL_OR_FAIL(3);
+ if (scratch >= kMaxProfiles) {
+ LIBGAV1_DLOG(ERROR, "Invalid profile: %d.", static_cast<int>(scratch));
+ return false;
+ }
+ sequence_header.profile = static_cast<BitstreamProfile>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.still_picture = scratch != 0;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.reduced_still_picture_header = scratch != 0;
+ if (sequence_header.reduced_still_picture_header) {
+ if (!sequence_header.still_picture) {
+ LIBGAV1_DLOG(
+ ERROR, "reduced_still_picture_header is 1, but still_picture is 0.");
+ return false;
+ }
+ sequence_header.operating_points = 1;
+ sequence_header.operating_point_idc[0] = 0;
+ OBU_READ_LITERAL_OR_FAIL(5);
+ ParseBitStreamLevel(&sequence_header.level[0], scratch);
+ } else {
+ if (!ParseTimingInfo(&sequence_header) ||
+ !ParseDecoderModelInfo(&sequence_header)) {
+ return false;
+ }
+ OBU_READ_BIT_OR_FAIL;
+ const bool initial_display_delay_present_flag = scratch != 0;
+ OBU_READ_LITERAL_OR_FAIL(5);
+ sequence_header.operating_points = static_cast<int>(1 + scratch);
+ if (operating_point_ >= sequence_header.operating_points) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Invalid operating point: %d (valid range is [0,%d] inclusive).",
+ operating_point_, sequence_header.operating_points - 1);
+ return false;
+ }
+ for (int i = 0; i < sequence_header.operating_points; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(12);
+ sequence_header.operating_point_idc[i] = static_cast<int>(scratch);
+ for (int j = 0; j < i; ++j) {
+ if (sequence_header.operating_point_idc[i] ==
+ sequence_header.operating_point_idc[j]) {
+ LIBGAV1_DLOG(ERROR,
+ "operating_point_idc[%d] (%d) is equal to "
+ "operating_point_idc[%d] (%d).",
+ i, sequence_header.operating_point_idc[i], j,
+ sequence_header.operating_point_idc[j]);
+ return false;
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(5);
+ ParseBitStreamLevel(&sequence_header.level[i], scratch);
+ if (sequence_header.level[i].major > 3) {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.tier[i] = scratch;
+ }
+ if (sequence_header.decoder_model_info_present_flag &&
+ !ParseOperatingParameters(&sequence_header, i)) {
+ return false;
+ }
+ if (initial_display_delay_present_flag) {
+ OBU_READ_BIT_OR_FAIL;
+ if (scratch != 0) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ sequence_header.initial_display_delay[i] = 1 + scratch;
+ }
+ }
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(4);
+ sequence_header.frame_width_bits = 1 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(4);
+ sequence_header.frame_height_bits = 1 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(sequence_header.frame_width_bits);
+ sequence_header.max_frame_width = static_cast<int32_t>(1 + scratch);
+ OBU_READ_LITERAL_OR_FAIL(sequence_header.frame_height_bits);
+ sequence_header.max_frame_height = static_cast<int32_t>(1 + scratch);
+ if (!sequence_header.reduced_still_picture_header) {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.frame_id_numbers_present = scratch != 0;
+ }
+ if (sequence_header.frame_id_numbers_present) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ sequence_header.delta_frame_id_length_bits = 2 + scratch;
+ OBU_READ_LITERAL_OR_FAIL(3);
+ sequence_header.frame_id_length_bits =
+ sequence_header.delta_frame_id_length_bits + 1 + scratch;
+ // Section 6.8.2: It is a requirement of bitstream conformance that the
+ // number of bits needed to read display_frame_id does not exceed 16. This
+ // is equivalent to the constraint that idLen <= 16.
+ if (sequence_header.frame_id_length_bits > 16) {
+ LIBGAV1_DLOG(ERROR, "Invalid frame_id_length_bits: %d.",
+ sequence_header.frame_id_length_bits);
+ return false;
+ }
+ }
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.use_128x128_superblock = scratch != 0;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_filter_intra = scratch != 0;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_intra_edge_filter = scratch != 0;
+ if (sequence_header.reduced_still_picture_header) {
+ sequence_header.force_screen_content_tools = kSelectScreenContentTools;
+ sequence_header.force_integer_mv = kSelectIntegerMv;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_interintra_compound = scratch != 0;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_masked_compound = scratch != 0;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_warped_motion = scratch != 0;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_dual_filter = scratch != 0;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_order_hint = scratch != 0;
+ if (sequence_header.enable_order_hint) {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_jnt_comp = scratch != 0;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_ref_frame_mvs = scratch != 0;
+ }
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.choose_screen_content_tools = scratch != 0;
+ if (sequence_header.choose_screen_content_tools) {
+ sequence_header.force_screen_content_tools = kSelectScreenContentTools;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.force_screen_content_tools = scratch;
+ }
+ if (sequence_header.force_screen_content_tools > 0) {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.choose_integer_mv = scratch != 0;
+ if (sequence_header.choose_integer_mv) {
+ sequence_header.force_integer_mv = kSelectIntegerMv;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.force_integer_mv = scratch;
+ }
+ } else {
+ sequence_header.force_integer_mv = kSelectIntegerMv;
+ }
+ if (sequence_header.enable_order_hint) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ sequence_header.order_hint_bits = 1 + scratch;
+ sequence_header.order_hint_shift_bits =
+ Mod32(32 - sequence_header.order_hint_bits);
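+ // E.g. order_hint_bits = 7 gives order_hint_shift_bits = 25: shifting a
+ // hint difference left then right by 25 sign-extends the 7-bit modular
+ // distance (this is how GetRelativeDistance() is assumed to use it).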
+ }
+ }
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_superres = scratch != 0;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_cdef = scratch != 0;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.enable_restoration = scratch != 0;
+ if (!ParseColorConfig(&sequence_header)) return false;
+ OBU_READ_BIT_OR_FAIL;
+ sequence_header.film_grain_params_present = scratch != 0;
+ // Compare new sequence header with old sequence header.
+ if (has_sequence_header_ &&
+ sequence_header.ParametersChanged(sequence_header_)) {
+ // Between the frame header OBU and the last tile group OBU of the frame,
+ // do not allow the sequence header to change.
+ if (seen_frame_header) {
+ LIBGAV1_DLOG(ERROR, "Sequence header changed in the middle of a frame.");
+ return false;
+ }
+ sequence_header_changed_ = true;
+ decoder_state_.ClearReferenceFrames();
+ }
+ sequence_header_ = sequence_header;
+ if (!has_sequence_header_) {
+ sequence_header_changed_ = true;
+ }
+ has_sequence_header_ = true;
+ // Section 6.4.1: It is a requirement of bitstream conformance that if
+ // OperatingPointIdc is equal to 0, then obu_extension_flag is equal to 0 for
+ // all OBUs that follow this sequence header until the next sequence header.
+ extension_disallowed_ =
+ (sequence_header_.operating_point_idc[operating_point_] == 0);
+ return true;
+}
+
+// Marks reference frames as invalid when they are too far in the past to be
+// identified by the frame id mechanism.
+void ObuParser::MarkInvalidReferenceFrames() {
+ // The current lower bound of the frame ids for reference frames.
+ int lower_bound = decoder_state_.current_frame_id -
+ (1 << sequence_header_.delta_frame_id_length_bits);
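+ // For example, with delta_frame_id_length_bits = 14 and
+ // frame_id_length_bits = 15, any reference whose id lags the current id by
+ // more than 1 << 14 (modulo 1 << 15) can no longer be referenced.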
+ // True if lower_bound is smaller than current_frame_id. False if lower_bound
+ // wraps around (in modular arithmetic) to the other side of current_frame_id.
+ bool lower_bound_is_smaller = true;
+ if (lower_bound <= 0) {
+ lower_bound += 1 << sequence_header_.frame_id_length_bits;
+ lower_bound_is_smaller = false;
+ }
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const uint16_t reference_frame_id = decoder_state_.reference_frame_id[i];
+ if (lower_bound_is_smaller) {
+ if (reference_frame_id > decoder_state_.current_frame_id ||
+ reference_frame_id < lower_bound) {
+ decoder_state_.reference_frame[i] = nullptr;
+ }
+ } else {
+ if (reference_frame_id > decoder_state_.current_frame_id &&
+ reference_frame_id < lower_bound) {
+ decoder_state_.reference_frame[i] = nullptr;
+ }
+ }
+ }
+}
+
+bool ObuParser::ParseFrameSizeAndRenderSize() {
+ int64_t scratch;
+ // Frame Size.
+ if (frame_header_.frame_size_override_flag) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_width_bits);
+ frame_header_.width = static_cast<int32_t>(1 + scratch);
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_height_bits);
+ frame_header_.height = static_cast<int32_t>(1 + scratch);
+ if (frame_header_.width > sequence_header_.max_frame_width ||
+ frame_header_.height > sequence_header_.max_frame_height) {
+ LIBGAV1_DLOG(ERROR,
+ "Frame dimensions are larger than the maximum values");
+ return false;
+ }
+ } else {
+ frame_header_.width = sequence_header_.max_frame_width;
+ frame_header_.height = sequence_header_.max_frame_height;
+ }
+ if (!ParseSuperResParametersAndComputeImageSize()) return false;
+
+ // Render Size.
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.render_and_frame_size_different = scratch != 0;
+ if (frame_header_.render_and_frame_size_different) {
+ OBU_READ_LITERAL_OR_FAIL(16);
+ frame_header_.render_width = static_cast<int32_t>(1 + scratch);
+ OBU_READ_LITERAL_OR_FAIL(16);
+ frame_header_.render_height = static_cast<int32_t>(1 + scratch);
+ } else {
+ frame_header_.render_width = frame_header_.upscaled_width;
+ frame_header_.render_height = frame_header_.height;
+ }
+
+ return true;
+}
+
+bool ObuParser::ParseSuperResParametersAndComputeImageSize() {
+ int64_t scratch;
+ // SuperRes.
+ frame_header_.upscaled_width = frame_header_.width;
+ frame_header_.use_superres = false;
+ if (sequence_header_.enable_superres) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.use_superres = scratch != 0;
+ }
+ if (frame_header_.use_superres) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ // 9 is the smallest value for the denominator.
+ frame_header_.superres_scale_denominator = scratch + 9;
+ frame_header_.width =
+ (frame_header_.upscaled_width * kSuperResScaleNumerator +
+ (frame_header_.superres_scale_denominator / 2)) /
+ frame_header_.superres_scale_denominator;
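+ // E.g. upscaled_width = 1920 with the largest denominator (16) yields
+ // width = (1920 * 8 + 8) / 16 = 960, assuming kSuperResScaleNumerator is 8
+ // as in the spec (SUPERRES_NUM).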
+ } else {
+ frame_header_.superres_scale_denominator = kSuperResScaleNumerator;
+ }
+ assert(frame_header_.width != 0);
+ assert(frame_header_.height != 0);
+ // Check if multiplying upscaled_width by height would overflow.
+ assert(frame_header_.upscaled_width >= frame_header_.width);
+ if (frame_header_.upscaled_width > INT32_MAX / frame_header_.height) {
+ LIBGAV1_DLOG(ERROR, "Frame dimensions too big: width=%d height=%d.",
+ frame_header_.width, frame_header_.height);
+ return false;
+ }
+ frame_header_.columns4x4 = ((frame_header_.width + 7) >> 3) << 1;
+ frame_header_.rows4x4 = ((frame_header_.height + 7) >> 3) << 1;
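+ // columns4x4/rows4x4 count 4x4 units, rounded up to a multiple of two (an
+ // 8-pixel boundary); e.g. width = 1920 gives ((1927 >> 3) << 1) = 480.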
+ return true;
+}
+
+bool ObuParser::ValidateInterFrameSize() const {
+ for (int index : frame_header_.reference_frame_index) {
+ const RefCountedBuffer* reference_frame =
+ decoder_state_.reference_frame[index].get();
+ if (2 * frame_header_.width < reference_frame->upscaled_width() ||
+ 2 * frame_header_.height < reference_frame->frame_height() ||
+ frame_header_.width > 16 * reference_frame->upscaled_width() ||
+ frame_header_.height > 16 * reference_frame->frame_height()) {
+ LIBGAV1_DLOG(ERROR,
+ "Invalid inter frame size: width=%d, height=%d. Reference "
+ "frame: index=%d, upscaled width=%d, height=%d.",
+ frame_header_.width, frame_header_.height, index,
+ reference_frame->upscaled_width(),
+ reference_frame->frame_height());
+ return false;
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseReferenceOrderHint() {
+ if (!frame_header_.error_resilient_mode ||
+ !sequence_header_.enable_order_hint) {
+ return true;
+ }
+ int64_t scratch;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.order_hint_bits);
+ frame_header_.reference_order_hint[i] = scratch;
+ if (frame_header_.reference_order_hint[i] !=
+ decoder_state_.reference_order_hint[i]) {
+ decoder_state_.reference_frame[i] = nullptr;
+ }
+ }
+ return true;
+}
+
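+// The four helpers below scan shifted_order_hints (built in
+// SetFrameReferences()) for the unused reference whose expected output order
+// best fits the requested direction: hints >= current_frame_hint are
+// backward (future) references; hints < current_frame_hint are forward
+// (past) references.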
+// static
+int ObuParser::FindLatestBackwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+ int ref = -1;
+ int latest_order_hint = INT_MIN;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int hint = shifted_order_hints[i];
+ if (!used_frame[i] && hint >= current_frame_hint &&
+ hint >= latest_order_hint) {
+ ref = i;
+ latest_order_hint = hint;
+ }
+ }
+ return ref;
+}
+
+// static
+int ObuParser::FindEarliestBackwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+ int ref = -1;
+ int earliest_order_hint = INT_MAX;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int hint = shifted_order_hints[i];
+ if (!used_frame[i] && hint >= current_frame_hint &&
+ hint < earliest_order_hint) {
+ ref = i;
+ earliest_order_hint = hint;
+ }
+ }
+ return ref;
+}
+
+// static
+int ObuParser::FindLatestForwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+ int ref = -1;
+ int latest_order_hint = INT_MIN;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int hint = shifted_order_hints[i];
+ if (!used_frame[i] && hint < current_frame_hint &&
+ hint >= latest_order_hint) {
+ ref = i;
+ latest_order_hint = hint;
+ }
+ }
+ return ref;
+}
+
+// static
+int ObuParser::FindReferenceWithSmallestOutputOrder(
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints) {
+ int ref = -1;
+ int earliest_order_hint = INT_MAX;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int hint = shifted_order_hints[i];
+ if (hint < earliest_order_hint) {
+ ref = i;
+ earliest_order_hint = hint;
+ }
+ }
+ return ref;
+}
+
+// Computes the elements in the frame_header_.reference_frame_index array
+// based on:
+// * the syntax elements last_frame_idx and gold_frame_idx, and
+// * the values stored within the decoder_state_.reference_order_hint array
+// (these values represent the least significant bits of the expected output
+// order of the frames).
+//
+// Frame type: {
+//   libgav1_name               spec_name      int
+//   kReferenceFrameLast,       LAST_FRAME      1
+//   kReferenceFrameLast2,      LAST2_FRAME     2
+//   kReferenceFrameLast3,      LAST3_FRAME     3
+//   kReferenceFrameGolden,     GOLDEN_FRAME    4
+//   kReferenceFrameBackward,   BWDREF_FRAME    5
+//   kReferenceFrameAlternate2, ALTREF2_FRAME   6
+//   kReferenceFrameAlternate,  ALTREF_FRAME    7
+// }
+//
+// A typical case of a group of pictures (frames) in display order
+// (more complex arrangements are, however, allowed by bitstream
+// conformance):
+//
+//   |     |     |     |        |         |     |     |
+//   4     3     2     1  current_frame   5     6     7
+//
+bool ObuParser::SetFrameReferences(const int8_t last_frame_idx,
+ const int8_t gold_frame_idx) {
+ // Set the ref_frame_idx entries for kReferenceFrameLast and
+ // kReferenceFrameGolden to last_frame_idx and gold_frame_idx. Initialize
+ // the other entries to -1.
+ for (int8_t& reference_frame_index : frame_header_.reference_frame_index) {
+ reference_frame_index = -1;
+ }
+ frame_header_
+ .reference_frame_index[kReferenceFrameLast - kReferenceFrameLast] =
+ last_frame_idx;
+ frame_header_
+ .reference_frame_index[kReferenceFrameGolden - kReferenceFrameLast] =
+ gold_frame_idx;
+
+ // used_frame records which reference frames have been used.
+ std::array<bool, kNumReferenceFrameTypes> used_frame;
+ used_frame.fill(false);
+ used_frame[last_frame_idx] = true;
+ used_frame[gold_frame_idx] = true;
+
+ assert(sequence_header_.order_hint_bits >= 1);
+ const int current_frame_hint = 1 << (sequence_header_.order_hint_bits - 1);
+ // shifted_order_hints contains the expected output order shifted such that
+ // the current frame has hint equal to current_frame_hint.
+ std::array<int, kNumReferenceFrameTypes> shifted_order_hints;
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ const int relative_distance = GetRelativeDistance(
+ decoder_state_.reference_order_hint[i], frame_header_.order_hint,
+ sequence_header_.order_hint_shift_bits);
+ shifted_order_hints[i] = current_frame_hint + relative_distance;
+ }
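+ // E.g. with order_hint_bits = 7, current_frame_hint is 64 and every
+ // shifted hint lies in [0, 127]: past frames fall below 64, future frames
+ // at or above it.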
+
+ // The expected output orders for kReferenceFrameLast and
+ // kReferenceFrameGolden.
+ const int last_order_hint = shifted_order_hints[last_frame_idx];
+ const int gold_order_hint = shifted_order_hints[gold_frame_idx];
+
+ // Section 7.8: It is a requirement of bitstream conformance that
+ // lastOrderHint and goldOrderHint are strictly less than curFrameHint.
+ if (last_order_hint >= current_frame_hint ||
+ gold_order_hint >= current_frame_hint) {
+ return false;
+ }
+
+ // Find a backward reference to the frame with highest output order. If
+ // found, set the kReferenceFrameAlternate reference to that backward
+ // reference.
+ int ref = FindLatestBackwardReference(current_frame_hint, shifted_order_hints,
+ used_frame);
+ if (ref >= 0) {
+ frame_header_
+ .reference_frame_index[kReferenceFrameAlternate - kReferenceFrameLast] =
+ ref;
+ used_frame[ref] = true;
+ }
+
+ // Find a backward reference to the closest frame. If found, set the
+ // kReferenceFrameBackward reference to that backward reference.
+ ref = FindEarliestBackwardReference(current_frame_hint, shifted_order_hints,
+ used_frame);
+ if (ref >= 0) {
+ frame_header_
+ .reference_frame_index[kReferenceFrameBackward - kReferenceFrameLast] =
+ ref;
+ used_frame[ref] = true;
+ }
+
+ // Set the kReferenceFrameAlternate2 reference to the next closest backward
+ // reference.
+ ref = FindEarliestBackwardReference(current_frame_hint, shifted_order_hints,
+ used_frame);
+ if (ref >= 0) {
+ frame_header_.reference_frame_index[kReferenceFrameAlternate2 -
+ kReferenceFrameLast] = ref;
+ used_frame[ref] = true;
+ }
+
+ // The remaining references are set to be forward references in
+ // reverse chronological order.
+ static constexpr ReferenceFrameType
+ kRefFrameList[kNumInterReferenceFrameTypes - 2] = {
+ kReferenceFrameLast2, kReferenceFrameLast3, kReferenceFrameBackward,
+ kReferenceFrameAlternate2, kReferenceFrameAlternate};
+ for (const ReferenceFrameType ref_frame : kRefFrameList) {
+ if (frame_header_.reference_frame_index[ref_frame - kReferenceFrameLast] <
+ 0) {
+ ref = FindLatestForwardReference(current_frame_hint, shifted_order_hints,
+ used_frame);
+ if (ref >= 0) {
+ frame_header_.reference_frame_index[ref_frame - kReferenceFrameLast] =
+ ref;
+ used_frame[ref] = true;
+ }
+ }
+ }
+
+ // Finally, any remaining references are set to the reference frame with
+ // smallest output order.
+ ref = FindReferenceWithSmallestOutputOrder(shifted_order_hints);
+ assert(ref >= 0);
+ for (int8_t& reference_frame_index : frame_header_.reference_frame_index) {
+ if (reference_frame_index < 0) {
+ reference_frame_index = ref;
+ }
+ }
+
+ return true;
+}
+
+bool ObuParser::ParseLoopFilterParameters() {
+ LoopFilter* const loop_filter = &frame_header_.loop_filter;
+ if (frame_header_.coded_lossless || frame_header_.allow_intrabc) {
+ SetDefaultRefDeltas(loop_filter);
+ return true;
+ }
+ // IsIntraFrame implies kPrimaryReferenceNone.
+ assert(!IsIntraFrame(frame_header_.frame_type) ||
+ frame_header_.primary_reference_frame == kPrimaryReferenceNone);
+ if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) {
+ // Part of the setup_past_independence() function in the spec. It is not
+ // necessary to set loop_filter->delta_enabled to true. See
+ // https://crbug.com/aomedia/2305.
+ SetDefaultRefDeltas(loop_filter);
+ } else {
+ // Part of the load_previous() function in the spec.
+ const int prev_frame_index =
+ frame_header_
+ .reference_frame_index[frame_header_.primary_reference_frame];
+ const RefCountedBuffer* prev_frame =
+ decoder_state_.reference_frame[prev_frame_index].get();
+ loop_filter->ref_deltas = prev_frame->loop_filter_ref_deltas();
+ loop_filter->mode_deltas = prev_frame->loop_filter_mode_deltas();
+ }
+ int64_t scratch;
+ for (int i = 0; i < 2; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(6);
+ loop_filter->level[i] = scratch;
+ }
+ if (!sequence_header_.color_config.is_monochrome &&
+ (loop_filter->level[0] != 0 || loop_filter->level[1] != 0)) {
+ for (int i = 2; i < 4; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(6);
+ loop_filter->level[i] = scratch;
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(3);
+ loop_filter->sharpness = scratch;
+ OBU_READ_BIT_OR_FAIL;
+ loop_filter->delta_enabled = scratch != 0;
+ if (loop_filter->delta_enabled) {
+ OBU_READ_BIT_OR_FAIL;
+ loop_filter->delta_update = scratch != 0;
+ if (loop_filter->delta_update) {
+ for (auto& ref_delta : loop_filter->ref_deltas) {
+ OBU_READ_BIT_OR_FAIL;
+ const bool update_ref_delta = scratch != 0;
+ if (update_ref_delta) {
+ int scratch_int;
+ if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ ref_delta = scratch_int;
+ }
+ }
+ for (auto& mode_delta : loop_filter->mode_deltas) {
+ OBU_READ_BIT_OR_FAIL;
+ const bool update_mode_delta = scratch != 0;
+ if (update_mode_delta) {
+ int scratch_int;
+ if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ mode_delta = scratch_int;
+ }
+ }
+ }
+ } else {
+ loop_filter->delta_update = false;
+ }
+ return true;
+}
+
+bool ObuParser::ParseDeltaQuantizer(int8_t* const delta) {
+ int64_t scratch;
+ *delta = 0;
+ OBU_READ_BIT_OR_FAIL;
+ const bool delta_coded = scratch != 0;
+ if (delta_coded) {
+ int scratch_int;
+ if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ *delta = scratch_int;
+ }
+ return true;
+}
+
+bool ObuParser::ParseQuantizerParameters() {
+ int64_t scratch;
+ QuantizerParameters* const quantizer = &frame_header_.quantizer;
+ OBU_READ_LITERAL_OR_FAIL(8);
+ quantizer->base_index = scratch;
+ if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneY])) return false;
+ if (!sequence_header_.color_config.is_monochrome) {
+ bool diff_uv_delta = false;
+ if (sequence_header_.color_config.separate_uv_delta_q) {
+ OBU_READ_BIT_OR_FAIL;
+ diff_uv_delta = scratch != 0;
+ }
+ if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneU]) ||
+ !ParseDeltaQuantizer(&quantizer->delta_ac[kPlaneU])) {
+ return false;
+ }
+ if (diff_uv_delta) {
+ if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneV]) ||
+ !ParseDeltaQuantizer(&quantizer->delta_ac[kPlaneV])) {
+ return false;
+ }
+ } else {
+ quantizer->delta_dc[kPlaneV] = quantizer->delta_dc[kPlaneU];
+ quantizer->delta_ac[kPlaneV] = quantizer->delta_ac[kPlaneU];
+ }
+ }
+ OBU_READ_BIT_OR_FAIL;
+ quantizer->use_matrix = scratch != 0;
+ if (quantizer->use_matrix) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ quantizer->matrix_level[kPlaneY] = scratch;
+ OBU_READ_LITERAL_OR_FAIL(4);
+ quantizer->matrix_level[kPlaneU] = scratch;
+ if (sequence_header_.color_config.separate_uv_delta_q) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ quantizer->matrix_level[kPlaneV] = scratch;
+ } else {
+ quantizer->matrix_level[kPlaneV] = quantizer->matrix_level[kPlaneU];
+ }
+ }
+ return true;
+}
+
+// This method implements the following functions in the spec:
+// - segmentation_params()
+// - part of setup_past_independence(): Set the FeatureData and FeatureEnabled
+// arrays to all 0.
+// - part of load_previous(): Call load_segmentation_params().
+//
+// A careful analysis of the spec shows the part of setup_past_independence()
+// can be optimized away and the part of load_previous() only needs to be
+// invoked under a specific condition. Although the logic looks different from
+// the spec, it is equivalent and more efficient.
+bool ObuParser::ParseSegmentationParameters() {
+ int64_t scratch;
+ Segmentation* const segmentation = &frame_header_.segmentation;
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->enabled = scratch != 0;
+ if (!segmentation->enabled) return true;
+ if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) {
+ segmentation->update_map = true;
+ segmentation->update_data = true;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->update_map = scratch != 0;
+ if (segmentation->update_map) {
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->temporal_update = scratch != 0;
+ }
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->update_data = scratch != 0;
+ if (!segmentation->update_data) {
+ // Part of the load_previous() function in the spec.
+ const int prev_frame_index =
+ frame_header_
+ .reference_frame_index[frame_header_.primary_reference_frame];
+ decoder_state_.reference_frame[prev_frame_index]
+ ->GetSegmentationParameters(segmentation);
+ return true;
+ }
+ }
+ for (int8_t i = 0; i < kMaxSegments; ++i) {
+ for (int8_t j = 0; j < kSegmentFeatureMax; ++j) {
+ OBU_READ_BIT_OR_FAIL;
+ segmentation->feature_enabled[i][j] = scratch != 0;
+ if (segmentation->feature_enabled[i][j]) {
+ if (Segmentation::FeatureSigned(static_cast<SegmentFeature>(j))) {
+ int scratch_int;
+ if (!bit_reader_->ReadInverseSignedLiteral(
+ kSegmentationFeatureBits[j], &scratch_int)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ segmentation->feature_data[i][j] =
+ Clip3(scratch_int, -kSegmentationFeatureMaxValues[j],
+ kSegmentationFeatureMaxValues[j]);
+ } else {
+ if (kSegmentationFeatureBits[j] > 0) {
+ OBU_READ_LITERAL_OR_FAIL(kSegmentationFeatureBits[j]);
+ segmentation->feature_data[i][j] = Clip3(
+ static_cast<int>(scratch), 0, kSegmentationFeatureMaxValues[j]);
+ } else {
+ segmentation->feature_data[i][j] = 0;
+ }
+ }
+ segmentation->last_active_segment_id = i;
+ if (j >= kSegmentFeatureReferenceFrame) {
+ segmentation->segment_id_pre_skip = true;
+ }
+ }
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseQuantizerIndexDeltaParameters() {
+ int64_t scratch;
+ if (frame_header_.quantizer.base_index > 0) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.delta_q.present = scratch != 0;
+ if (frame_header_.delta_q.present) {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ frame_header_.delta_q.scale = scratch;
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseLoopFilterDeltaParameters() {
+ int64_t scratch;
+ if (frame_header_.delta_q.present) {
+ if (!frame_header_.allow_intrabc) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.delta_lf.present = scratch != 0;
+ }
+ if (frame_header_.delta_lf.present) {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ frame_header_.delta_lf.scale = scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.delta_lf.multi = scratch != 0;
+ }
+ }
+ return true;
+}
+
+void ObuParser::ComputeSegmentLosslessAndQIndex() {
+ frame_header_.coded_lossless = true;
+ Segmentation* const segmentation = &frame_header_.segmentation;
+ const QuantizerParameters* const quantizer = &frame_header_.quantizer;
+ for (int i = 0; i < kMaxSegments; ++i) {
+ segmentation->qindex[i] =
+ GetQIndex(*segmentation, i, quantizer->base_index);
+ segmentation->lossless[i] =
+ segmentation->qindex[i] == 0 && quantizer->delta_dc[kPlaneY] == 0 &&
+ quantizer->delta_dc[kPlaneU] == 0 &&
+ quantizer->delta_ac[kPlaneU] == 0 &&
+ quantizer->delta_dc[kPlaneV] == 0 && quantizer->delta_ac[kPlaneV] == 0;
+ if (!segmentation->lossless[i]) frame_header_.coded_lossless = false;
+ // The spec calls for setting up a two-dimensional SegQMLevel array here.
+ // We avoid the SegQMLevel array by using segmentation->lossless[i] and
+ // quantizer->matrix_level[plane] directly in the reconstruct process of
+ // Section 7.12.3.
+ }
+ frame_header_.upscaled_lossless =
+ frame_header_.coded_lossless &&
+ frame_header_.width == frame_header_.upscaled_width;
+}
+
+bool ObuParser::ParseCdefParameters() {
+ const int coeff_shift = sequence_header_.color_config.bitdepth - 8;
+ if (frame_header_.coded_lossless || frame_header_.allow_intrabc ||
+ !sequence_header_.enable_cdef) {
+ frame_header_.cdef.damping = 3 + coeff_shift;
+ return true;
+ }
+ Cdef* const cdef = &frame_header_.cdef;
+ int64_t scratch;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ cdef->damping = scratch + 3 + coeff_shift;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ cdef->bits = scratch;
+ for (int i = 0; i < (1 << cdef->bits); ++i) {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ cdef->y_primary_strength[i] = scratch << coeff_shift;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ cdef->y_secondary_strength[i] = scratch;
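+ // A coded secondary strength of 3 stands for 4 (the spec skips the value
+ // 3), hence the conditional increment here and for chroma below.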
+ if (cdef->y_secondary_strength[i] == 3) ++cdef->y_secondary_strength[i];
+ cdef->y_secondary_strength[i] <<= coeff_shift;
+ if (sequence_header_.color_config.is_monochrome) continue;
+ OBU_READ_LITERAL_OR_FAIL(4);
+ cdef->uv_primary_strength[i] = scratch << coeff_shift;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ cdef->uv_secondary_strength[i] = scratch;
+ if (cdef->uv_secondary_strength[i] == 3) ++cdef->uv_secondary_strength[i];
+ cdef->uv_secondary_strength[i] <<= coeff_shift;
+ }
+ return true;
+}
+
+bool ObuParser::ParseLoopRestorationParameters() {
+ if (frame_header_.upscaled_lossless || frame_header_.allow_intrabc ||
+ !sequence_header_.enable_restoration) {
+ return true;
+ }
+ int64_t scratch;
+ bool uses_loop_restoration = false;
+ bool uses_chroma_loop_restoration = false;
+ LoopRestoration* const loop_restoration = &frame_header_.loop_restoration;
+ const int num_planes = sequence_header_.color_config.is_monochrome
+ ? kMaxPlanesMonochrome
+ : kMaxPlanes;
+ for (int i = 0; i < num_planes; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ loop_restoration->type[i] = static_cast<LoopRestorationType>(scratch);
+ if (loop_restoration->type[i] != kLoopRestorationTypeNone) {
+ uses_loop_restoration = true;
+ if (i > 0) uses_chroma_loop_restoration = true;
+ }
+ }
+ if (uses_loop_restoration) {
+ uint8_t unit_shift;
+ if (sequence_header_.use_128x128_superblock) {
+ OBU_READ_BIT_OR_FAIL;
+ unit_shift = scratch + 1;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ unit_shift = scratch;
+ if (unit_shift != 0) {
+ OBU_READ_BIT_OR_FAIL;
+ const uint8_t unit_extra_shift = scratch;
+ unit_shift += unit_extra_shift;
+ }
+ }
+ loop_restoration->unit_size_log2[kPlaneY] = 6 + unit_shift;
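+ // unit_shift is at most 2, so unit_size_log2 is 6, 7, or 8, i.e. luma
+ // restoration units of 64x64, 128x128, or 256x256 pixels.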
+ uint8_t uv_shift = 0;
+ if (sequence_header_.color_config.subsampling_x != 0 &&
+ sequence_header_.color_config.subsampling_y != 0 &&
+ uses_chroma_loop_restoration) {
+ OBU_READ_BIT_OR_FAIL;
+ uv_shift = scratch;
+ }
+ loop_restoration->unit_size_log2[kPlaneU] =
+ loop_restoration->unit_size_log2[kPlaneV] =
+ loop_restoration->unit_size_log2[0] - uv_shift;
+ }
+ return true;
+}
+
+bool ObuParser::ParseTxModeSyntax() {
+ if (frame_header_.coded_lossless) {
+ frame_header_.tx_mode = kTxModeOnly4x4;
+ return true;
+ }
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.tx_mode = (scratch == 1) ? kTxModeSelect : kTxModeLargest;
+ return true;
+}
+
+bool ObuParser::ParseFrameReferenceModeSyntax() {
+ int64_t scratch;
+ if (!IsIntraFrame(frame_header_.frame_type)) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.reference_mode_select = scratch != 0;
+ }
+ return true;
+}
+
+bool ObuParser::IsSkipModeAllowed() {
+ if (IsIntraFrame(frame_header_.frame_type) ||
+ !frame_header_.reference_mode_select ||
+ !sequence_header_.enable_order_hint) {
+ return false;
+ }
+ // Identify the nearest forward and backward references.
+ int forward_index = -1;
+ int backward_index = -1;
+ int forward_hint = -1;
+ int backward_hint = -1;
+ for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+ const unsigned int reference_hint =
+ decoder_state_
+ .reference_order_hint[frame_header_.reference_frame_index[i]];
+ // TODO(linfengz): |relative_distance| equals
+ // current_frame_->reference_info()->
+ // relative_distance_from[i + kReferenceFrameLast];
+ // However, the unit test ObuParserTest.SkipModeParameters() would fail.
+ // Will figure out how to initialize |current_frame_.reference_info_| in the
+ // RefCountedBuffer later.
+ const int relative_distance =
+ GetRelativeDistance(reference_hint, frame_header_.order_hint,
+ sequence_header_.order_hint_shift_bits);
+ if (relative_distance < 0) {
+ if (forward_index < 0 ||
+ GetRelativeDistance(reference_hint, forward_hint,
+ sequence_header_.order_hint_shift_bits) > 0) {
+ forward_index = i;
+ forward_hint = reference_hint;
+ }
+ } else if (relative_distance > 0) {
+ if (backward_index < 0 ||
+ GetRelativeDistance(reference_hint, backward_hint,
+ sequence_header_.order_hint_shift_bits) < 0) {
+ backward_index = i;
+ backward_hint = reference_hint;
+ }
+ }
+ }
+ if (forward_index < 0) return false;
+ if (backward_index >= 0) {
+ // Bidirectional prediction.
+ frame_header_.skip_mode_frame[0] = static_cast<ReferenceFrameType>(
+ kReferenceFrameLast + std::min(forward_index, backward_index));
+ frame_header_.skip_mode_frame[1] = static_cast<ReferenceFrameType>(
+ kReferenceFrameLast + std::max(forward_index, backward_index));
+ return true;
+ }
+ // Forward prediction only. Identify the second nearest forward reference.
+ int second_forward_index = -1;
+ int second_forward_hint = -1;
+ for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+ const unsigned int reference_hint =
+ decoder_state_
+ .reference_order_hint[frame_header_.reference_frame_index[i]];
+ if (GetRelativeDistance(reference_hint, forward_hint,
+ sequence_header_.order_hint_shift_bits) < 0) {
+ if (second_forward_index < 0 ||
+ GetRelativeDistance(reference_hint, second_forward_hint,
+ sequence_header_.order_hint_shift_bits) > 0) {
+ second_forward_index = i;
+ second_forward_hint = reference_hint;
+ }
+ }
+ }
+ if (second_forward_index < 0) return false;
+ frame_header_.skip_mode_frame[0] = static_cast<ReferenceFrameType>(
+ kReferenceFrameLast + std::min(forward_index, second_forward_index));
+ frame_header_.skip_mode_frame[1] = static_cast<ReferenceFrameType>(
+ kReferenceFrameLast + std::max(forward_index, second_forward_index));
+ return true;
+}
+
+bool ObuParser::ParseSkipModeParameters() {
+ if (!IsSkipModeAllowed()) return true;
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.skip_mode_present = scratch != 0;
+ return true;
+}
+
+// Sets frame_header_.global_motion[ref].params[index].
+bool ObuParser::ParseGlobalParamSyntax(
+ int ref, int index,
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>&
+ prev_global_motions) {
+ GlobalMotion* const global_motion = &frame_header_.global_motion[ref];
+ const GlobalMotion* const prev_global_motion = &prev_global_motions[ref];
+ int abs_bits = kGlobalMotionAlphaBits;
+ int precision_bits = kGlobalMotionAlphaPrecisionBits;
+ if (index < 2) {
+ if (global_motion->type == kGlobalMotionTransformationTypeTranslation) {
+ const auto high_precision_mv_factor =
+ static_cast<int>(!frame_header_.allow_high_precision_mv);
+ abs_bits = kGlobalMotionTranslationOnlyBits - high_precision_mv_factor;
+ precision_bits =
+ kGlobalMotionTranslationOnlyPrecisionBits - high_precision_mv_factor;
+ } else {
+ abs_bits = kGlobalMotionTranslationBits;
+ precision_bits = kGlobalMotionTranslationPrecisionBits;
+ }
+ }
+ const int precision_diff = kWarpedModelPrecisionBits - precision_bits;
+ const int round = (index % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0;
+ const int sub = (index % 3 == 2) ? 1 << precision_bits : 0;
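+ // Indices 2 and 5 hold the diagonal terms of the warp matrix; they are
+ // coded as an offset from 1.0 (in warped-model precision), which is what
+ // the round/sub pair accounts for.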
+ const int mx = 1 << abs_bits;
+ const int reference =
+ (prev_global_motion->params[index] >> precision_diff) - sub;
+ int scratch;
+ if (!bit_reader_->DecodeSignedSubexpWithReference(
+ -mx, mx + 1, reference, kGlobalMotionReadControl, &scratch)) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ global_motion->params[index] = LeftShift(scratch, precision_diff) + round;
+ return true;
+}
+
+bool ObuParser::ParseGlobalMotionParameters() {
+ for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+ frame_header_.global_motion[ref].type =
+ kGlobalMotionTransformationTypeIdentity;
+ for (int i = 0; i < 6; ++i) {
+ frame_header_.global_motion[ref].params[i] =
+ (i % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0;
+ }
+ }
+ if (IsIntraFrame(frame_header_.frame_type)) return true;
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>* prev_global_motions =
+ nullptr;
+ if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) {
+ // Part of the setup_past_independence() function in the spec. The value
+ // that the spec says PrevGmParams[ref][i] should be set to is exactly
+ // the value frame_header_.global_motion[ref].params[i] is set to by the
+ // for loop above. Therefore prev_global_motions can simply point to
+ // frame_header_.global_motion.
+ prev_global_motions = &frame_header_.global_motion;
+ } else {
+ // Part of the load_previous() function in the spec.
+ const int prev_frame_index =
+ frame_header_
+ .reference_frame_index[frame_header_.primary_reference_frame];
+ prev_global_motions =
+ &decoder_state_.reference_frame[prev_frame_index]->GlobalMotions();
+ }
+ for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+ GlobalMotion* const global_motion = &frame_header_.global_motion[ref];
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ const bool is_global = scratch != 0;
+ if (is_global) {
+ OBU_READ_BIT_OR_FAIL;
+ const bool is_rot_zoom = scratch != 0;
+ if (is_rot_zoom) {
+ global_motion->type = kGlobalMotionTransformationTypeRotZoom;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ const bool is_translation = scratch != 0;
+ global_motion->type = is_translation
+ ? kGlobalMotionTransformationTypeTranslation
+ : kGlobalMotionTransformationTypeAffine;
+ }
+ } else {
+ global_motion->type = kGlobalMotionTransformationTypeIdentity;
+ }
+ if (global_motion->type >= kGlobalMotionTransformationTypeRotZoom) {
+ if (!ParseGlobalParamSyntax(ref, 2, *prev_global_motions) ||
+ !ParseGlobalParamSyntax(ref, 3, *prev_global_motions)) {
+ return false;
+ }
+ if (global_motion->type == kGlobalMotionTransformationTypeAffine) {
+ if (!ParseGlobalParamSyntax(ref, 4, *prev_global_motions) ||
+ !ParseGlobalParamSyntax(ref, 5, *prev_global_motions)) {
+ return false;
+ }
+ } else {
+ global_motion->params[4] = -global_motion->params[3];
+ global_motion->params[5] = global_motion->params[2];
+ }
+ }
+ if (global_motion->type >= kGlobalMotionTransformationTypeTranslation) {
+ if (!ParseGlobalParamSyntax(ref, 0, *prev_global_motions) ||
+ !ParseGlobalParamSyntax(ref, 1, *prev_global_motions)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseFilmGrainParameters() {
+ if (!sequence_header_.film_grain_params_present ||
+ (!frame_header_.show_frame && !frame_header_.showable_frame)) {
+ // frame_header_.film_grain_params is already zero-initialized.
+ return true;
+ }
+
+ FilmGrainParams& film_grain_params = frame_header_.film_grain_params;
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.apply_grain = scratch != 0;
+ if (!film_grain_params.apply_grain) {
+ // film_grain_params is already zero-initialized.
+ return true;
+ }
+
+ OBU_READ_LITERAL_OR_FAIL(16);
+ film_grain_params.grain_seed = static_cast<int>(scratch);
+ film_grain_params.update_grain = true;
+ if (frame_header_.frame_type == kFrameInter) {
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.update_grain = scratch != 0;
+ }
+ if (!film_grain_params.update_grain) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ film_grain_params.reference_index = static_cast<int>(scratch);
+ bool found = false;
+ for (const auto index : frame_header_.reference_frame_index) {
+ if (film_grain_params.reference_index == index) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ static_assert(sizeof(frame_header_.reference_frame_index) /
+ sizeof(frame_header_.reference_frame_index[0]) ==
+ 7,
+ "");
+ LIBGAV1_DLOG(ERROR,
+ "Invalid value for film_grain_params_ref_idx (%d). "
+ "ref_frame_idx = {%d, %d, %d, %d, %d, %d, %d}",
+ film_grain_params.reference_index,
+ frame_header_.reference_frame_index[0],
+ frame_header_.reference_frame_index[1],
+ frame_header_.reference_frame_index[2],
+ frame_header_.reference_frame_index[3],
+ frame_header_.reference_frame_index[4],
+ frame_header_.reference_frame_index[5],
+ frame_header_.reference_frame_index[6]);
+ return false;
+ }
+ const RefCountedBuffer* grain_params_reference_frame =
+ decoder_state_.reference_frame[film_grain_params.reference_index].get();
+ if (grain_params_reference_frame == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a decoded frame",
+ film_grain_params.reference_index);
+ return false;
+ }
+ const int temp_grain_seed = film_grain_params.grain_seed;
+ const bool temp_update_grain = film_grain_params.update_grain;
+ const int temp_reference_index = film_grain_params.reference_index;
+ film_grain_params = grain_params_reference_frame->film_grain_params();
+ film_grain_params.grain_seed = temp_grain_seed;
+ film_grain_params.update_grain = temp_update_grain;
+ film_grain_params.reference_index = temp_reference_index;
+ return true;
+ }
+
+ OBU_READ_LITERAL_OR_FAIL(4);
+ film_grain_params.num_y_points = scratch;
+ if (film_grain_params.num_y_points > 14) {
+ LIBGAV1_DLOG(ERROR, "Invalid value for num_y_points (%d).",
+ film_grain_params.num_y_points);
+ return false;
+ }
+ for (int i = 0; i < film_grain_params.num_y_points; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_y_value[i] = scratch;
+ if (i != 0 && film_grain_params.point_y_value[i - 1] >=
+ film_grain_params.point_y_value[i]) {
+ LIBGAV1_DLOG(ERROR, "point_y_value[%d] (%d) >= point_y_value[%d] (%d).",
+ i - 1, film_grain_params.point_y_value[i - 1], i,
+ film_grain_params.point_y_value[i]);
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_y_scaling[i] = scratch;
+ }
+ if (sequence_header_.color_config.is_monochrome) {
+ film_grain_params.chroma_scaling_from_luma = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.chroma_scaling_from_luma = scratch != 0;
+ }
+ if (sequence_header_.color_config.is_monochrome ||
+ film_grain_params.chroma_scaling_from_luma ||
+ (sequence_header_.color_config.subsampling_x == 1 &&
+ sequence_header_.color_config.subsampling_y == 1 &&
+ film_grain_params.num_y_points == 0)) {
+ film_grain_params.num_u_points = 0;
+ film_grain_params.num_v_points = 0;
+ } else {
+ OBU_READ_LITERAL_OR_FAIL(4);
+ film_grain_params.num_u_points = scratch;
+ if (film_grain_params.num_u_points > 10) {
+ LIBGAV1_DLOG(ERROR, "Invalid value for num_u_points (%d).",
+ film_grain_params.num_u_points);
+ return false;
+ }
+ for (int i = 0; i < film_grain_params.num_u_points; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_u_value[i] = scratch;
+ if (i != 0 && film_grain_params.point_u_value[i - 1] >=
+ film_grain_params.point_u_value[i]) {
+ LIBGAV1_DLOG(ERROR, "point_u_value[%d] (%d) >= point_u_value[%d] (%d).",
+ i - 1, film_grain_params.point_u_value[i - 1], i,
+ film_grain_params.point_u_value[i]);
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_u_scaling[i] = scratch;
+ }
+ OBU_READ_LITERAL_OR_FAIL(4);
+ film_grain_params.num_v_points = scratch;
+ if (film_grain_params.num_v_points > 10) {
+ LIBGAV1_DLOG(ERROR, "Invalid value for num_v_points (%d).",
+ film_grain_params.num_v_points);
+ return false;
+ }
+ if (sequence_header_.color_config.subsampling_x == 1 &&
+ sequence_header_.color_config.subsampling_y == 1 &&
+ (film_grain_params.num_u_points == 0) !=
+ (film_grain_params.num_v_points == 0)) {
+ LIBGAV1_DLOG(ERROR,
+ "Invalid values for num_u_points (%d) and num_v_points (%d) "
+ "for 4:2:0 chroma subsampling.",
+ film_grain_params.num_u_points,
+ film_grain_params.num_v_points);
+ return false;
+ }
+ for (int i = 0; i < film_grain_params.num_v_points; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_v_value[i] = scratch;
+ if (i != 0 && film_grain_params.point_v_value[i - 1] >=
+ film_grain_params.point_v_value[i]) {
+ LIBGAV1_DLOG(ERROR, "point_v_value[%d] (%d) >= point_v_value[%d] (%d).",
+ i - 1, film_grain_params.point_v_value[i - 1], i,
+ film_grain_params.point_v_value[i]);
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.point_v_scaling[i] = scratch;
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(2);
+ film_grain_params.chroma_scaling = scratch + 8;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ film_grain_params.auto_regression_coeff_lag = scratch;
+
+ const int num_pos_y =
+ MultiplyBy2(film_grain_params.auto_regression_coeff_lag) *
+ (film_grain_params.auto_regression_coeff_lag + 1);
+ int num_pos_uv = num_pos_y;
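+ // E.g. auto_regression_coeff_lag = 3 gives num_pos_y = 2 * 3 * (3 + 1) =
+ // 24 luma coefficients; num_pos_uv gains one extra coefficient for the
+ // luma-driven term when num_y_points > 0.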
+ if (film_grain_params.num_y_points > 0) {
+ ++num_pos_uv;
+ for (int i = 0; i < num_pos_y; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.auto_regression_coeff_y[i] =
+ static_cast<int8_t>(scratch - 128);
+ }
+ }
+ if (film_grain_params.chroma_scaling_from_luma ||
+ film_grain_params.num_u_points > 0) {
+ for (int i = 0; i < num_pos_uv; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.auto_regression_coeff_u[i] =
+ static_cast<int8_t>(scratch - 128);
+ }
+ }
+ if (film_grain_params.chroma_scaling_from_luma ||
+ film_grain_params.num_v_points > 0) {
+ for (int i = 0; i < num_pos_uv; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.auto_regression_coeff_v[i] =
+ static_cast<int8_t>(scratch - 128);
+ }
+ }
+ OBU_READ_LITERAL_OR_FAIL(2);
+ film_grain_params.auto_regression_shift = static_cast<uint8_t>(scratch + 6);
+ OBU_READ_LITERAL_OR_FAIL(2);
+ film_grain_params.grain_scale_shift = static_cast<int>(scratch);
+ if (film_grain_params.num_u_points > 0) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.u_multiplier = static_cast<int8_t>(scratch - 128);
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.u_luma_multiplier = static_cast<int8_t>(scratch - 128);
+ OBU_READ_LITERAL_OR_FAIL(9);
+ film_grain_params.u_offset = static_cast<int16_t>(scratch - 256);
+ }
+ if (film_grain_params.num_v_points > 0) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.v_multiplier = static_cast<int8_t>(scratch - 128);
+ OBU_READ_LITERAL_OR_FAIL(8);
+ film_grain_params.v_luma_multiplier = static_cast<int8_t>(scratch - 128);
+ OBU_READ_LITERAL_OR_FAIL(9);
+ film_grain_params.v_offset = static_cast<int16_t>(scratch - 256);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.overlap_flag = scratch != 0;
+ OBU_READ_BIT_OR_FAIL;
+ film_grain_params.clip_to_restricted_range = scratch != 0;
+ return true;
+}
+
+bool ObuParser::ParseTileInfoSyntax() {
+ TileInfo* const tile_info = &frame_header_.tile_info;
+ const int sb_columns = sequence_header_.use_128x128_superblock
+ ? ((frame_header_.columns4x4 + 31) >> 5)
+ : ((frame_header_.columns4x4 + 15) >> 4);
+ const int sb_rows = sequence_header_.use_128x128_superblock
+ ? ((frame_header_.rows4x4 + 31) >> 5)
+ : ((frame_header_.rows4x4 + 15) >> 4);
+ tile_info->sb_columns = sb_columns;
+ tile_info->sb_rows = sb_rows;
+ const int sb_shift = sequence_header_.use_128x128_superblock ? 5 : 4;
+ const int sb_size = 2 + sb_shift;
+ const int sb_max_tile_width = kMaxTileWidth >> sb_size;
+ const int sb_max_tile_area = kMaxTileArea >> MultiplyBy2(sb_size);
+ const int minlog2_tile_columns = TileLog2(sb_max_tile_width, sb_columns);
+ const int maxlog2_tile_columns =
+ CeilLog2(std::min(sb_columns, static_cast<int>(kMaxTileColumns)));
+ const int maxlog2_tile_rows =
+ CeilLog2(std::min(sb_rows, static_cast<int>(kMaxTileRows)));
+ const int min_log2_tiles = std::max(
+ minlog2_tile_columns, TileLog2(sb_max_tile_area, sb_rows * sb_columns));
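+ // minlog2_tile_columns forces enough column splits that no tile exceeds
+ // kMaxTileWidth; min_log2_tiles additionally forces enough total tiles
+ // that no tile exceeds kMaxTileArea.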
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ tile_info->uniform_spacing = scratch != 0;
+ if (tile_info->uniform_spacing) {
+ // Read tile columns.
+ tile_info->tile_columns_log2 = minlog2_tile_columns;
+ while (tile_info->tile_columns_log2 < maxlog2_tile_columns) {
+ OBU_READ_BIT_OR_FAIL;
+ if (scratch == 0) break;
+ ++tile_info->tile_columns_log2;
+ }
+
+ // Compute tile column starts.
+ const int sb_tile_width =
+ (sb_columns + (1 << tile_info->tile_columns_log2) - 1) >>
+ tile_info->tile_columns_log2;
+ if (sb_tile_width <= 0) return false;
+ int i = 0;
+ for (int sb_start = 0; sb_start < sb_columns; sb_start += sb_tile_width) {
+ if (i >= kMaxTileColumns) {
+ LIBGAV1_DLOG(ERROR,
+ "tile_columns would be greater than kMaxTileColumns.");
+ return false;
+ }
+ tile_info->tile_column_start[i++] = sb_start << sb_shift;
+ }
+ tile_info->tile_column_start[i] = frame_header_.columns4x4;
+ tile_info->tile_columns = i;
+
+ // Read tile rows.
+ const int minlog2_tile_rows =
+ std::max(min_log2_tiles - tile_info->tile_columns_log2, 0);
+ tile_info->tile_rows_log2 = minlog2_tile_rows;
+ while (tile_info->tile_rows_log2 < maxlog2_tile_rows) {
+ OBU_READ_BIT_OR_FAIL;
+ if (scratch == 0) break;
+ ++tile_info->tile_rows_log2;
+ }
+
+ // Compute tile row starts.
+ const int sb_tile_height =
+ (sb_rows + (1 << tile_info->tile_rows_log2) - 1) >>
+ tile_info->tile_rows_log2;
+ if (sb_tile_height <= 0) return false;
+ i = 0;
+ for (int sb_start = 0; sb_start < sb_rows; sb_start += sb_tile_height) {
+ if (i >= kMaxTileRows) {
+ LIBGAV1_DLOG(ERROR, "tile_rows would be greater than kMaxTileRows.");
+ return false;
+ }
+ tile_info->tile_row_start[i++] = sb_start << sb_shift;
+ }
+ tile_info->tile_row_start[i] = frame_header_.rows4x4;
+ tile_info->tile_rows = i;
+ } else {
+ int widest_tile_sb = 1;
+ int i = 0;
+ for (int sb_start = 0; sb_start < sb_columns; ++i) {
+ if (i >= kMaxTileColumns) {
+ LIBGAV1_DLOG(ERROR,
+ "tile_columns would be greater than kMaxTileColumns.");
+ return false;
+ }
+ tile_info->tile_column_start[i] = sb_start << sb_shift;
+ const int max_width =
+ std::min(sb_columns - sb_start, static_cast<int>(sb_max_tile_width));
+ if (!bit_reader_->DecodeUniform(
+ max_width, &tile_info->tile_column_width_in_superblocks[i])) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ ++tile_info->tile_column_width_in_superblocks[i];
+ widest_tile_sb = std::max(tile_info->tile_column_width_in_superblocks[i],
+ widest_tile_sb);
+ sb_start += tile_info->tile_column_width_in_superblocks[i];
+ }
+ tile_info->tile_column_start[i] = frame_header_.columns4x4;
+ tile_info->tile_columns = i;
+ tile_info->tile_columns_log2 = CeilLog2(tile_info->tile_columns);
+
+ int max_tile_area_sb = sb_rows * sb_columns;
+ if (min_log2_tiles > 0) max_tile_area_sb >>= min_log2_tiles + 1;
+ const int max_tile_height_sb =
+ std::max(max_tile_area_sb / widest_tile_sb, 1);
+
+ i = 0;
+ for (int sb_start = 0; sb_start < sb_rows; ++i) {
+ if (i >= kMaxTileRows) {
+ LIBGAV1_DLOG(ERROR, "tile_rows would be greater than kMaxTileRows.");
+ return false;
+ }
+ tile_info->tile_row_start[i] = sb_start << sb_shift;
+ const int max_height = std::min(sb_rows - sb_start, max_tile_height_sb);
+ if (!bit_reader_->DecodeUniform(
+ max_height, &tile_info->tile_row_height_in_superblocks[i])) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits.");
+ return false;
+ }
+ ++tile_info->tile_row_height_in_superblocks[i];
+ sb_start += tile_info->tile_row_height_in_superblocks[i];
+ }
+ tile_info->tile_row_start[i] = frame_header_.rows4x4;
+ tile_info->tile_rows = i;
+ tile_info->tile_rows_log2 = CeilLog2(tile_info->tile_rows);
+ }
+ tile_info->tile_count = tile_info->tile_rows * tile_info->tile_columns;
+ if (!tile_buffers_.reserve(tile_info->tile_count)) {
+ LIBGAV1_DLOG(ERROR, "Unable to allocate memory for tile_buffers_.");
+ return false;
+ }
+ tile_info->context_update_id = 0;
+ const int tile_bits =
+ tile_info->tile_columns_log2 + tile_info->tile_rows_log2;
+ if (tile_bits != 0) {
+ OBU_READ_LITERAL_OR_FAIL(tile_bits);
+ tile_info->context_update_id = static_cast<int16_t>(scratch);
+ if (tile_info->context_update_id >= tile_info->tile_count) {
+ LIBGAV1_DLOG(ERROR, "Invalid context_update_tile_id (%d) >= %d.",
+ tile_info->context_update_id, tile_info->tile_count);
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(2);
+ tile_info->tile_size_bytes = 1 + scratch;
+ }
+ return true;
+}
+
+bool ObuParser::ReadAllowWarpedMotion() {
+ if (IsIntraFrame(frame_header_.frame_type) ||
+ frame_header_.error_resilient_mode ||
+ !sequence_header_.enable_warped_motion) {
+ return true;
+ }
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.allow_warped_motion = scratch != 0;
+ return true;
+}
+
+bool ObuParser::ParseFrameParameters() {
+ int64_t scratch;
+ if (sequence_header_.reduced_still_picture_header) {
+ frame_header_.show_frame = true;
+ if (!EnsureCurrentFrameIsNotNull()) return false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.show_existing_frame = scratch != 0;
+ if (frame_header_.show_existing_frame) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ frame_header_.frame_to_show = scratch;
+ if (sequence_header_.decoder_model_info_present_flag &&
+ !sequence_header_.timing_info.equal_picture_interval) {
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header_.decoder_model_info.frame_presentation_time_length);
+ frame_header_.frame_presentation_time = static_cast<uint32_t>(scratch);
+ }
+ if (sequence_header_.frame_id_numbers_present) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_id_length_bits);
+ frame_header_.display_frame_id = static_cast<uint16_t>(scratch);
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // whenever display_frame_id is read, the value matches
+ // RefFrameId[ frame_to_show_map_idx ] ..., and that
+ // RefValid[ frame_to_show_map_idx ] is equal to 1.
+ //
+ // The current_frame_ == nullptr check below is equivalent to checking
+ // if RefValid[ frame_to_show_map_idx ] is equal to 1.
+ if (frame_header_.display_frame_id !=
+ decoder_state_.reference_frame_id[frame_header_.frame_to_show]) {
+ LIBGAV1_DLOG(ERROR,
+ "Reference buffer %d has a frame id number mismatch.",
+ frame_header_.frame_to_show);
+ return false;
+ }
+ }
+ // Section 7.18.2. Note: This is also needed for Section 7.21 if
+ // frame_type is kFrameKey.
+ current_frame_ =
+ decoder_state_.reference_frame[frame_header_.frame_to_show];
+ if (current_frame_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a decoded frame",
+ frame_header_.frame_to_show);
+ return false;
+ }
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // when show_existing_frame is used to show a previous frame, that the
+ // value of showable_frame for the previous frame was equal to 1.
+ if (!current_frame_->showable_frame()) {
+ LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a showable frame",
+ frame_header_.frame_to_show);
+ return false;
+ }
+ if (current_frame_->frame_type() == kFrameKey) {
+ frame_header_.refresh_frame_flags = 0xff;
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // when show_existing_frame is used to show a previous frame with
+ // RefFrameType[ frame_to_show_map_idx ] equal to KEY_FRAME, that
+ // the frame is output via the show_existing_frame mechanism at most
+ // once.
+ current_frame_->set_showable_frame(false);
+
+ // Section 7.21. Note: decoder_state_.current_frame_id must be set
+ // only when frame_type is kFrameKey per the spec. Among all the
+ // variables set in Section 7.21, current_frame_id is the only one
+ // whose value lives across frames. (PrevFrameID is set equal to the
+ // current_frame_id value for the previous frame.)
+ decoder_state_.current_frame_id =
+ decoder_state_.reference_frame_id[frame_header_.frame_to_show];
+ decoder_state_.order_hint =
+ decoder_state_.reference_order_hint[frame_header_.frame_to_show];
+ }
+ return true;
+ }
+ if (!EnsureCurrentFrameIsNotNull()) return false;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ frame_header_.frame_type = static_cast<FrameType>(scratch);
+ current_frame_->set_frame_type(frame_header_.frame_type);
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.show_frame = scratch != 0;
+ if (frame_header_.show_frame &&
+ sequence_header_.decoder_model_info_present_flag &&
+ !sequence_header_.timing_info.equal_picture_interval) {
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header_.decoder_model_info.frame_presentation_time_length);
+ frame_header_.frame_presentation_time = static_cast<uint32_t>(scratch);
+ }
+ if (frame_header_.show_frame) {
+ frame_header_.showable_frame = (frame_header_.frame_type != kFrameKey);
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.showable_frame = scratch != 0;
+ }
+ current_frame_->set_showable_frame(frame_header_.showable_frame);
+ if (frame_header_.frame_type == kFrameSwitch ||
+ (frame_header_.frame_type == kFrameKey && frame_header_.show_frame)) {
+ frame_header_.error_resilient_mode = true;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.error_resilient_mode = scratch != 0;
+ }
+ }
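+ // A shown key frame resets the reference frame state: all saved order
+ // hints and reference buffers are invalidated.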
+ if (frame_header_.frame_type == kFrameKey && frame_header_.show_frame) {
+ decoder_state_.reference_order_hint.fill(0);
+ decoder_state_.reference_frame.fill(nullptr);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.enable_cdf_update = scratch == 0;
+ if (sequence_header_.force_screen_content_tools ==
+ kSelectScreenContentTools) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.allow_screen_content_tools = scratch != 0;
+ } else {
+ frame_header_.allow_screen_content_tools =
+ sequence_header_.force_screen_content_tools != 0;
+ }
+ if (frame_header_.allow_screen_content_tools) {
+ if (sequence_header_.force_integer_mv == kSelectIntegerMv) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.force_integer_mv = scratch;
+ } else {
+ frame_header_.force_integer_mv = sequence_header_.force_integer_mv;
+ }
+ } else {
+ frame_header_.force_integer_mv = 0;
+ }
+ if (IsIntraFrame(frame_header_.frame_type)) {
+ frame_header_.force_integer_mv = 1;
+ }
+ if (sequence_header_.frame_id_numbers_present) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_id_length_bits);
+ frame_header_.current_frame_id = static_cast<uint16_t>(scratch);
+ const int previous_frame_id = decoder_state_.current_frame_id;
+ decoder_state_.current_frame_id = frame_header_.current_frame_id;
+ if (frame_header_.frame_type != kFrameKey || !frame_header_.show_frame) {
+ if (previous_frame_id >= 0) {
+ // Section 6.8.2: ..., it is a requirement of bitstream conformance
+ // that all of the following conditions are true:
+ // * current_frame_id is not equal to PrevFrameID,
+ // * DiffFrameID is less than 1 << ( idLen - 1 )
+ int diff_frame_id = decoder_state_.current_frame_id - previous_frame_id;
+ const int id_length_max_value =
+ 1 << sequence_header_.frame_id_length_bits;
+ if (diff_frame_id <= 0) {
+ diff_frame_id += id_length_max_value;
+ }
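+ // diff_frame_id is now DiffFrameID in the range [1, 1 << idLen); the
+ // conformance requirement above caps it below 1 << (idLen - 1), which is
+ // DivideBy2(id_length_max_value).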
+ if (diff_frame_id >= DivideBy2(id_length_max_value)) {
+ LIBGAV1_DLOG(ERROR,
+ "current_frame_id (%d) equals or differs too much from "
+ "previous_frame_id (%d).",
+ decoder_state_.current_frame_id, previous_frame_id);
+ return false;
+ }
+ }
+ MarkInvalidReferenceFrames();
+ }
+ } else {
+ frame_header_.current_frame_id = 0;
+ decoder_state_.current_frame_id = frame_header_.current_frame_id;
+ }
+ if (frame_header_.frame_type == kFrameSwitch) {
+ frame_header_.frame_size_override_flag = true;
+ } else if (!sequence_header_.reduced_still_picture_header) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.frame_size_override_flag = scratch != 0;
+ }
+ if (sequence_header_.order_hint_bits > 0) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.order_hint_bits);
+ frame_header_.order_hint = scratch;
+ }
+ decoder_state_.order_hint = frame_header_.order_hint;
+ if (IsIntraFrame(frame_header_.frame_type) ||
+ frame_header_.error_resilient_mode) {
+ frame_header_.primary_reference_frame = kPrimaryReferenceNone;
+ } else {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ frame_header_.primary_reference_frame = scratch;
+ }
+ if (sequence_header_.decoder_model_info_present_flag) {
+ OBU_READ_BIT_OR_FAIL;
+ const bool buffer_removal_time_present = scratch != 0;
+ if (buffer_removal_time_present) {
+ for (int i = 0; i < sequence_header_.operating_points; ++i) {
+ if (!sequence_header_.decoder_model_present_for_operating_point[i]) {
+ continue;
+ }
+ const int index = sequence_header_.operating_point_idc[i];
+ if (index == 0 ||
+ (InTemporalLayer(index, obu_headers_.back().temporal_id) &&
+ InSpatialLayer(index, obu_headers_.back().spatial_id))) {
+ OBU_READ_LITERAL_OR_FAIL(
+ sequence_header_.decoder_model_info.buffer_removal_time_length);
+ frame_header_.buffer_removal_time[i] = static_cast<uint32_t>(scratch);
+ }
+ }
+ }
+ }
+ if (frame_header_.frame_type == kFrameSwitch ||
+ (frame_header_.frame_type == kFrameKey && frame_header_.show_frame)) {
+ frame_header_.refresh_frame_flags = 0xff;
+ } else {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ frame_header_.refresh_frame_flags = scratch;
+ // Section 6.8.2: If frame_type is equal to INTRA_ONLY_FRAME, it is a
+ // requirement of bitstream conformance that refresh_frame_flags is not
+ // equal to 0xff.
+ if (frame_header_.frame_type == kFrameIntraOnly &&
+ frame_header_.refresh_frame_flags == 0xff) {
+ LIBGAV1_DLOG(ERROR, "Intra only frames cannot have refresh flags 0xFF.");
+ return false;
+ }
+ }
+ if ((!IsIntraFrame(frame_header_.frame_type) ||
+ frame_header_.refresh_frame_flags != 0xff) &&
+ !ParseReferenceOrderHint()) {
+ return false;
+ }
+ if (IsIntraFrame(frame_header_.frame_type)) {
+ if (!ParseFrameSizeAndRenderSize()) return false;
+ if (frame_header_.allow_screen_content_tools &&
+ frame_header_.width == frame_header_.upscaled_width) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.allow_intrabc = scratch != 0;
+ }
+ } else {
+ if (!sequence_header_.enable_order_hint) {
+ frame_header_.frame_refs_short_signaling = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.frame_refs_short_signaling = scratch != 0;
+ if (frame_header_.frame_refs_short_signaling) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ const int8_t last_frame_idx = scratch;
+ OBU_READ_LITERAL_OR_FAIL(3);
+ const int8_t gold_frame_idx = scratch;
+ if (!SetFrameReferences(last_frame_idx, gold_frame_idx)) {
+ return false;
+ }
+ }
+ }
+ for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+ if (!frame_header_.frame_refs_short_signaling) {
+ OBU_READ_LITERAL_OR_FAIL(3);
+ frame_header_.reference_frame_index[i] = scratch;
+ }
+ const int reference_frame_index = frame_header_.reference_frame_index[i];
+ assert(reference_frame_index >= 0);
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // RefValid[ ref_frame_idx[ i ] ] is equal to 1 ...
+ // The remainder of the statement is handled by ParseSequenceHeader().
+ // Note if support for Annex C: Error resilience behavior is added this
+ // check should be omitted per C.5 Decoder consequences of processable
+ // frames.
+ if (decoder_state_.reference_frame[reference_frame_index] == nullptr) {
+ LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i,
+ reference_frame_index);
+ return false;
+ }
+ if (sequence_header_.frame_id_numbers_present) {
+ OBU_READ_LITERAL_OR_FAIL(sequence_header_.delta_frame_id_length_bits);
+ const int delta_frame_id = static_cast<int>(1 + scratch);
+ const int id_length_max_value =
+ 1 << sequence_header_.frame_id_length_bits;
+ frame_header_.expected_frame_id[i] =
+ (frame_header_.current_frame_id + id_length_max_value -
+ delta_frame_id) %
+ id_length_max_value;
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // whenever expectedFrameId[ i ] is calculated, the value matches
+ // RefFrameId[ ref_frame_idx[ i ] ] ...
+ if (frame_header_.expected_frame_id[i] !=
+ decoder_state_.reference_frame_id[reference_frame_index]) {
+ LIBGAV1_DLOG(ERROR,
+ "Reference buffer %d has a frame id number mismatch.",
+ reference_frame_index);
+ return false;
+ }
+ }
+ }
+ if (frame_header_.frame_size_override_flag &&
+ !frame_header_.error_resilient_mode) {
+ // Section 5.9.7.
+ for (int index : frame_header_.reference_frame_index) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.found_reference = scratch != 0;
+ if (frame_header_.found_reference) {
+ const RefCountedBuffer* reference_frame =
+ decoder_state_.reference_frame[index].get();
+ // frame_header_.upscaled_width will be set in the
+ // ParseSuperResParametersAndComputeImageSize() call below.
+ frame_header_.width = reference_frame->upscaled_width();
+ frame_header_.height = reference_frame->frame_height();
+ frame_header_.render_width = reference_frame->render_width();
+ frame_header_.render_height = reference_frame->render_height();
+ if (!ParseSuperResParametersAndComputeImageSize()) return false;
+ break;
+ }
+ }
+ if (!frame_header_.found_reference && !ParseFrameSizeAndRenderSize()) {
+ return false;
+ }
+ } else {
+ if (!ParseFrameSizeAndRenderSize()) return false;
+ }
+ if (!ValidateInterFrameSize()) return false;
+ if (frame_header_.force_integer_mv != 0) {
+ frame_header_.allow_high_precision_mv = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.allow_high_precision_mv = scratch != 0;
+ }
+ OBU_READ_BIT_OR_FAIL;
+ const bool is_filter_switchable = scratch != 0;
+ if (is_filter_switchable) {
+ frame_header_.interpolation_filter = kInterpolationFilterSwitchable;
+ } else {
+ OBU_READ_LITERAL_OR_FAIL(2);
+ frame_header_.interpolation_filter =
+ static_cast<InterpolationFilter>(scratch);
+ }
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.is_motion_mode_switchable = scratch != 0;
+ if (frame_header_.error_resilient_mode ||
+ !sequence_header_.enable_ref_frame_mvs) {
+ frame_header_.use_ref_frame_mvs = false;
+ } else {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.use_ref_frame_mvs = scratch != 0;
+ }
+ }
+ // At this point, we have parsed the frame and render sizes and computed
+ // the image size, whether it's an intra or inter frame. So we can save
+ // the sizes in the current frame now.
+ if (!current_frame_->SetFrameDimensions(frame_header_)) {
+ LIBGAV1_DLOG(ERROR, "Setting current frame dimensions failed.");
+ return false;
+ }
+ if (!IsIntraFrame(frame_header_.frame_type)) {
+ // Initialize the kReferenceFrameIntra type reference frame information to
+ // simplify the frame type validation in motion field projection.
+ // Setting the kReferenceFrameIntra entry of |order_hint| to
+ // |frame_header_.order_hint| guarantees that, in SIMD implementations, the
+ // remaining kReferenceFrameIntra reference frame information can be
+ // initialized correctly by the loop below, with |frame_header_.order_hint|
+ // serving as the |hint|.
+ ReferenceInfo* const reference_info = current_frame_->reference_info();
+ reference_info->order_hint[kReferenceFrameIntra] = frame_header_.order_hint;
+ reference_info->relative_distance_from[kReferenceFrameIntra] = 0;
+ reference_info->relative_distance_to[kReferenceFrameIntra] = 0;
+ reference_info->skip_references[kReferenceFrameIntra] = true;
+ reference_info->projection_divisions[kReferenceFrameIntra] = 0;
+
+ for (int i = kReferenceFrameLast; i <= kNumInterReferenceFrameTypes; ++i) {
+ const auto reference_frame = static_cast<ReferenceFrameType>(i);
+ const uint8_t hint =
+ decoder_state_.reference_order_hint
+ [frame_header_.reference_frame_index[i - kReferenceFrameLast]];
+ reference_info->order_hint[reference_frame] = hint;
+ const int relative_distance_from =
+ GetRelativeDistance(hint, frame_header_.order_hint,
+ sequence_header_.order_hint_shift_bits);
+ const int relative_distance_to =
+ GetRelativeDistance(frame_header_.order_hint, hint,
+ sequence_header_.order_hint_shift_bits);
+ reference_info->relative_distance_from[reference_frame] =
+ relative_distance_from;
+ reference_info->relative_distance_to[reference_frame] =
+ relative_distance_to;
+ reference_info->skip_references[reference_frame] =
+ relative_distance_to > kMaxFrameDistance || relative_distance_to <= 0;
+ reference_info->projection_divisions[reference_frame] =
+ reference_info->skip_references[reference_frame]
+ ? 0
+ : kProjectionMvDivisionLookup[relative_distance_to];
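+ // A positive relative distance means the reference frame is ahead of the
+ // current frame in display order (a backward reference), which determines
+ // the sign bias.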
+ decoder_state_.reference_frame_sign_bias[reference_frame] =
+ relative_distance_from > 0;
+ }
+ }
+ if (frame_header_.enable_cdf_update &&
+ !sequence_header_.reduced_still_picture_header) {
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.enable_frame_end_update_cdf = scratch == 0;
+ } else {
+ frame_header_.enable_frame_end_update_cdf = false;
+ }
+ return true;
+}
+
+bool ObuParser::ParseFrameHeader() {
+ // Section 6.8.1: It is a requirement of bitstream conformance that a
+ // sequence header OBU has been received before a frame header OBU.
+ if (!has_sequence_header_) return false;
+ if (!ParseFrameParameters()) return false;
+ if (frame_header_.show_existing_frame) return true;
+ assert(!obu_headers_.empty());
+ current_frame_->set_spatial_id(obu_headers_.back().spatial_id);
+ current_frame_->set_temporal_id(obu_headers_.back().temporal_id);
+ bool status = ParseTileInfoSyntax() && ParseQuantizerParameters() &&
+ ParseSegmentationParameters();
+ if (!status) return false;
+ current_frame_->SetSegmentationParameters(frame_header_.segmentation);
+ status =
+ ParseQuantizerIndexDeltaParameters() && ParseLoopFilterDeltaParameters();
+ if (!status) return false;
+ ComputeSegmentLosslessAndQIndex();
+ // Section 6.8.2: It is a requirement of bitstream conformance that
+ // delta_q_present is equal to 0 when CodedLossless is equal to 1.
+ if (frame_header_.coded_lossless && frame_header_.delta_q.present) {
+ return false;
+ }
+ status = ParseLoopFilterParameters();
+ if (!status) return false;
+ current_frame_->SetLoopFilterDeltas(frame_header_.loop_filter);
+ status = ParseCdefParameters() && ParseLoopRestorationParameters() &&
+ ParseTxModeSyntax() && ParseFrameReferenceModeSyntax() &&
+ ParseSkipModeParameters() && ReadAllowWarpedMotion();
+ if (!status) return false;
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ frame_header_.reduced_tx_set = scratch != 0;
+ status = ParseGlobalMotionParameters();
+ if (!status) return false;
+ current_frame_->SetGlobalMotions(frame_header_.global_motion);
+ status = ParseFilmGrainParameters();
+ if (!status) return false;
+ if (sequence_header_.film_grain_params_present) {
+ current_frame_->set_film_grain_params(frame_header_.film_grain_params);
+ }
+ return true;
+}
+
+bool ObuParser::ParsePadding(const uint8_t* data, size_t size) {
+ // The spec allows a padding OBU to be header-only (i.e., |size| = 0). So
+ // check trailing bits only if |size| > 0.
+ if (size == 0) return true;
+ // The payload of a padding OBU is byte aligned. Therefore the first
+ // trailing byte should be 0x80. See https://crbug.com/aomedia/2393.
+ const int i = GetLastNonzeroByteIndex(data, size);
+ if (i < 0) {
+ LIBGAV1_DLOG(ERROR, "Trailing bit is missing.");
+ return false;
+ }
+ if (data[i] != 0x80) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "The last nonzero byte of the payload data is 0x%x, should be 0x80.",
+ data[i]);
+ return false;
+ }
+ // Skip all bits before the trailing bit.
+ bit_reader_->SkipBytes(i);
+ return true;
+}
+
+bool ObuParser::ParseMetadataScalability() {
+ int64_t scratch;
+ // scalability_mode_idc
+ OBU_READ_LITERAL_OR_FAIL(8);
+ const auto scalability_mode_idc = static_cast<int>(scratch);
+ if (scalability_mode_idc == kScalabilitySS) {
+ // Parse scalability_structure().
+ // spatial_layers_cnt_minus_1
+ OBU_READ_LITERAL_OR_FAIL(2);
+ const auto spatial_layers_count = static_cast<int>(scratch) + 1;
+ // spatial_layer_dimensions_present_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto spatial_layer_dimensions_present_flag = scratch != 0;
+ // spatial_layer_description_present_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto spatial_layer_description_present_flag = scratch != 0;
+ // temporal_group_description_present_flag
+ OBU_READ_BIT_OR_FAIL;
+ const auto temporal_group_description_present_flag = scratch != 0;
+ // scalability_structure_reserved_3bits
+ OBU_READ_LITERAL_OR_FAIL(3);
+ if (scratch != 0) {
+ LIBGAV1_DLOG(WARNING,
+ "scalability_structure_reserved_3bits is not zero.");
+ }
+ if (spatial_layer_dimensions_present_flag) {
+ for (int i = 0; i < spatial_layers_count; ++i) {
+ // spatial_layer_max_width[i]
+ OBU_READ_LITERAL_OR_FAIL(16);
+ // spatial_layer_max_height[i]
+ OBU_READ_LITERAL_OR_FAIL(16);
+ }
+ }
+ if (spatial_layer_description_present_flag) {
+ for (int i = 0; i < spatial_layers_count; ++i) {
+ // spatial_layer_ref_id[i]
+ OBU_READ_LITERAL_OR_FAIL(8);
+ }
+ }
+ if (temporal_group_description_present_flag) {
+ // temporal_group_size
+ OBU_READ_LITERAL_OR_FAIL(8);
+ const auto temporal_group_size = static_cast<int>(scratch);
+ for (int i = 0; i < temporal_group_size; ++i) {
+ // temporal_group_temporal_id[i]
+ OBU_READ_LITERAL_OR_FAIL(3);
+ // temporal_group_temporal_switching_up_point_flag[i]
+ OBU_READ_BIT_OR_FAIL;
+ // temporal_group_spatial_switching_up_point_flag[i]
+ OBU_READ_BIT_OR_FAIL;
+ // temporal_group_ref_cnt[i]
+ OBU_READ_LITERAL_OR_FAIL(3);
+ const auto temporal_group_ref_count = static_cast<int>(scratch);
+ for (int j = 0; j < temporal_group_ref_count; ++j) {
+ // temporal_group_ref_pic_diff[i][j]
+ OBU_READ_LITERAL_OR_FAIL(8);
+ }
+ }
+ }
+ }
+ return true;
+}
+
+bool ObuParser::ParseMetadataTimecode() {
+ int64_t scratch;
+ // counting_type: should be the same for all pictures in the coded video
+ // sequence. 7..31 are reserved.
+ OBU_READ_LITERAL_OR_FAIL(5);
+ // full_timestamp_flag
+ OBU_READ_BIT_OR_FAIL;
+ const bool full_timestamp_flag = scratch != 0;
+ // discontinuity_flag
+ OBU_READ_BIT_OR_FAIL;
+ // cnt_dropped_flag
+ OBU_READ_BIT_OR_FAIL;
+ // n_frames
+ OBU_READ_LITERAL_OR_FAIL(9);
+ if (full_timestamp_flag) {
+ // seconds_value
+ OBU_READ_LITERAL_OR_FAIL(6);
+ const auto seconds_value = static_cast<int>(scratch);
+ if (seconds_value > 59) {
+ LIBGAV1_DLOG(ERROR, "Invalid seconds_value %d.", seconds_value);
+ return false;
+ }
+ // minutes_value
+ OBU_READ_LITERAL_OR_FAIL(6);
+ const auto minutes_value = static_cast<int>(scratch);
+ if (minutes_value > 59) {
+ LIBGAV1_DLOG(ERROR, "Invalid minutes_value %d.", minutes_value);
+ return false;
+ }
+ // hours_value
+ OBU_READ_LITERAL_OR_FAIL(5);
+ const auto hours_value = static_cast<int>(scratch);
+ if (hours_value > 23) {
+ LIBGAV1_DLOG(ERROR, "Invalid hours_value %d.", hours_value);
+ return false;
+ }
+ } else {
+ // seconds_flag
+ OBU_READ_BIT_OR_FAIL;
+ const bool seconds_flag = scratch != 0;
+ if (seconds_flag) {
+ // seconds_value
+ OBU_READ_LITERAL_OR_FAIL(6);
+ const auto seconds_value = static_cast<int>(scratch);
+ if (seconds_value > 59) {
+ LIBGAV1_DLOG(ERROR, "Invalid seconds_value %d.", seconds_value);
+ return false;
+ }
+ // minutes_flag
+ OBU_READ_BIT_OR_FAIL;
+ const bool minutes_flag = scratch != 0;
+ if (minutes_flag) {
+ // minutes_value
+ OBU_READ_LITERAL_OR_FAIL(6);
+ const auto minutes_value = static_cast<int>(scratch);
+ if (minutes_value > 59) {
+ LIBGAV1_DLOG(ERROR, "Invalid minutes_value %d.", minutes_value);
+ return false;
+ }
+ // hours_flag
+ OBU_READ_BIT_OR_FAIL;
+ const bool hours_flag = scratch != 0;
+ if (hours_flag) {
+ // hours_value
+ OBU_READ_LITERAL_OR_FAIL(5);
+ const auto hours_value = static_cast<int>(scratch);
+ if (hours_value > 23) {
+ LIBGAV1_DLOG(ERROR, "Invalid hours_value %d.", hours_value);
+ return false;
+ }
+ }
+ }
+ }
+ }
+ // time_offset_length: should be the same for all pictures in the coded
+ // video sequence.
+ OBU_READ_LITERAL_OR_FAIL(5);
+ const auto time_offset_length = static_cast<int>(scratch);
+ if (time_offset_length > 0) {
+ // time_offset_value
+ OBU_READ_LITERAL_OR_FAIL(time_offset_length);
+ }
+ // Compute clockTimestamp. Section 6.7.7:
+ // When timing_info_present_flag is equal to 1 and discontinuity_flag is
+ // equal to 0, the value of clockTimestamp shall be greater than or equal
+ // to the value of clockTimestamp for the previous set of clock timestamp
+ // syntax elements in output order.
+ return true;
+}
+
+bool ObuParser::ParseMetadata(const uint8_t* data, size_t size) {
+ const size_t start_offset = bit_reader_->byte_offset();
+ size_t metadata_type;
+ if (!bit_reader_->ReadUnsignedLeb128(&metadata_type)) {
+ LIBGAV1_DLOG(ERROR, "Could not read metadata_type.");
+ return false;
+ }
+ const size_t metadata_type_size = bit_reader_->byte_offset() - start_offset;
+ if (size < metadata_type_size) {
+ LIBGAV1_DLOG(
+ ERROR, "metadata_type is longer than metadata OBU payload %zu vs %zu.",
+ metadata_type_size, size);
+ return false;
+ }
+ data += metadata_type_size;
+ size -= metadata_type_size;
+ int64_t scratch;
+ switch (metadata_type) {
+ case kMetadataTypeHdrContentLightLevel: {
+ ObuMetadataHdrCll hdr_cll;
+ OBU_READ_LITERAL_OR_FAIL(16);
+ hdr_cll.max_cll = scratch;
+ OBU_READ_LITERAL_OR_FAIL(16);
+ hdr_cll.max_fall = scratch;
+ if (!EnsureCurrentFrameIsNotNull()) return false;
+ current_frame_->set_hdr_cll(hdr_cll);
+ break;
+ }
+ case kMetadataTypeHdrMasteringDisplayColorVolume: {
+ ObuMetadataHdrMdcv hdr_mdcv;
+ for (int i = 0; i < 3; ++i) {
+ OBU_READ_LITERAL_OR_FAIL(16);
+ hdr_mdcv.primary_chromaticity_x[i] = scratch;
+ OBU_READ_LITERAL_OR_FAIL(16);
+ hdr_mdcv.primary_chromaticity_y[i] = scratch;
+ }
+ OBU_READ_LITERAL_OR_FAIL(16);
+ hdr_mdcv.white_point_chromaticity_x = scratch;
+ OBU_READ_LITERAL_OR_FAIL(16);
+ hdr_mdcv.white_point_chromaticity_y = scratch;
+ OBU_READ_LITERAL_OR_FAIL(32);
+ hdr_mdcv.luminance_max = static_cast<uint32_t>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(32);
+ hdr_mdcv.luminance_min = static_cast<uint32_t>(scratch);
+ if (!EnsureCurrentFrameIsNotNull()) return false;
+ current_frame_->set_hdr_mdcv(hdr_mdcv);
+ break;
+ }
+ case kMetadataTypeScalability:
+ if (!ParseMetadataScalability()) return false;
+ break;
+ case kMetadataTypeItutT35: {
+ ObuMetadataItutT35 itut_t35;
+ OBU_READ_LITERAL_OR_FAIL(8);
+ itut_t35.country_code = static_cast<uint8_t>(scratch);
+ ++data;
+ --size;
+ if (itut_t35.country_code == 0xFF) {
+ OBU_READ_LITERAL_OR_FAIL(8);
+ itut_t35.country_code_extension_byte = static_cast<uint8_t>(scratch);
+ ++data;
+ --size;
+ }
+ // Read itut_t35.payload_bytes. Section 6.7.2 of the spec says:
+ // itut_t35.payload_bytes shall be bytes containing data registered as
+ // specified in Recommendation ITU-T T.35.
+ // Therefore itut_t35.payload_bytes is byte aligned and the first trailing
+ // byte should be 0x80. Since the exact syntax of itut_t35.payload_bytes
+ // is not defined in the AV1 spec, identify the end of
+ // itut_t35.payload_bytes by searching for the trailing bit.
+ const int i = GetLastNonzeroByteIndex(data, size);
+ if (i < 0) {
+ LIBGAV1_DLOG(ERROR, "Trailing bit is missing.");
+ return false;
+ }
+ if (data[i] != 0x80) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "itut_t35.payload_bytes is not byte aligned. The last nonzero byte "
+ "of the payload data is 0x%x, should be 0x80.",
+ data[i]);
+ return false;
+ }
+ itut_t35.payload_size = i;
+ if (!EnsureCurrentFrameIsNotNull() ||
+ !current_frame_->set_itut_t35(itut_t35, data)) {
+ return false;
+ }
+ // Skip all bits before the trailing bit.
+ bit_reader_->SkipBytes(i);
+ break;
+ }
+ case kMetadataTypeTimecode:
+ if (!ParseMetadataTimecode()) return false;
+ break;
+ default: {
+ // metadata_type is equal to a value reserved for future use or a user
+ // private value.
+ //
+ // The Note in Section 5.8.1 says "Decoders should ignore the entire OBU
+ // if they do not understand the metadata_type." Find the trailing bit
+ // and skip all bits before the trailing bit.
+ const int i = GetLastNonzeroByteIndex(data, size);
+ if (i >= 0) {
+ // The last 1 bit in the last nonzero byte is the trailing bit. Skip
+ // all bits before the trailing bit.
+ const int n = CountTrailingZeros(data[i]);
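+ // data[i] has n trailing zero bits, so in MSB-first bit order the
+ // trailing one bit sits at bit index 7 - n within byte i.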
+ bit_reader_->SkipBits(i * 8 + 7 - n);
+ }
+ break;
+ }
+ }
+ return true;
+}
+
+bool ObuParser::AddTileBuffers(int start, int end, size_t total_size,
+ size_t tg_header_size,
+ size_t bytes_consumed_so_far) {
+ // Validate that the tile group start and end are within the allowed range.
+ if (start != next_tile_group_start_ || start > end ||
+ end >= frame_header_.tile_info.tile_count) {
+ LIBGAV1_DLOG(ERROR,
+ "Invalid tile group start %d or end %d: expected tile group "
+ "start %d, tile_count %d.",
+ start, end, next_tile_group_start_,
+ frame_header_.tile_info.tile_count);
+ return false;
+ }
+ next_tile_group_start_ = end + 1;
+
+ if (total_size < tg_header_size) {
+ LIBGAV1_DLOG(ERROR, "total_size (%zu) is less than tg_header_size (%zu).)",
+ total_size, tg_header_size);
+ return false;
+ }
+ size_t bytes_left = total_size - tg_header_size;
+ const uint8_t* data = data_ + bytes_consumed_so_far + tg_header_size;
+ for (int tile_number = start; tile_number <= end; ++tile_number) {
+ size_t tile_size = 0;
+ if (tile_number != end) {
+ RawBitReader bit_reader(data, bytes_left);
+ if (!bit_reader.ReadLittleEndian(frame_header_.tile_info.tile_size_bytes,
+ &tile_size)) {
+ LIBGAV1_DLOG(ERROR, "Could not read tile size for tile #%d",
+ tile_number);
+ return false;
+ }
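+ // The bitstream codes tile_size_minus_1, so add one for the actual size.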
+ ++tile_size;
+ data += frame_header_.tile_info.tile_size_bytes;
+ bytes_left -= frame_header_.tile_info.tile_size_bytes;
+ if (tile_size > bytes_left) {
+ LIBGAV1_DLOG(ERROR, "Invalid tile size %zu for tile #%d", tile_size,
+ tile_number);
+ return false;
+ }
+ } else {
+ tile_size = bytes_left;
+ if (tile_size == 0) {
+ LIBGAV1_DLOG(ERROR, "Invalid tile size %zu for tile #%d", tile_size,
+ tile_number);
+ return false;
+ }
+ }
+ // The memory for this has been allocated in ParseTileInfoSyntax(). So it is
+ // safe to use push_back_unchecked here.
+ tile_buffers_.push_back_unchecked({data, tile_size});
+ data += tile_size;
+ bytes_left -= tile_size;
+ }
+ bit_reader_->SkipBytes(total_size - tg_header_size);
+ return true;
+}
+
+bool ObuParser::ParseTileGroup(size_t size, size_t bytes_consumed_so_far) {
+ const TileInfo* const tile_info = &frame_header_.tile_info;
+ const size_t start_offset = bit_reader_->byte_offset();
+ const int tile_bits =
+ tile_info->tile_columns_log2 + tile_info->tile_rows_log2;
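+ // A single-tile frame has no tile group header; the entire payload is the
+ // one tile.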
+ if (tile_bits == 0) {
+ return AddTileBuffers(0, 0, size, 0, bytes_consumed_so_far);
+ }
+ int64_t scratch;
+ OBU_READ_BIT_OR_FAIL;
+ const bool tile_start_and_end_present_flag = scratch != 0;
+ if (!tile_start_and_end_present_flag) {
+ if (!bit_reader_->AlignToNextByte()) {
+ LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+ return false;
+ }
+ return AddTileBuffers(0, tile_info->tile_count - 1, size, 1,
+ bytes_consumed_so_far);
+ }
+ if (obu_headers_.back().type == kObuFrame) {
+ // 6.10.1: If obu_type is equal to OBU_FRAME, it is a requirement of
+ // bitstream conformance that the value of tile_start_and_end_present_flag
+ // is equal to 0.
+ LIBGAV1_DLOG(ERROR,
+ "tile_start_and_end_present_flag must be 0 in Frame OBU");
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(tile_bits);
+ const int start = static_cast<int>(scratch);
+ OBU_READ_LITERAL_OR_FAIL(tile_bits);
+ const int end = static_cast<int>(scratch);
+ if (!bit_reader_->AlignToNextByte()) {
+ LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+ return false;
+ }
+ const size_t tg_header_size = bit_reader_->byte_offset() - start_offset;
+ return AddTileBuffers(start, end, size, tg_header_size,
+ bytes_consumed_so_far);
+}
+
+bool ObuParser::ParseHeader() {
+ ObuHeader obu_header;
+ int64_t scratch = bit_reader_->ReadBit();
+ if (scratch != 0) {
+ LIBGAV1_DLOG(ERROR, "forbidden_bit is not zero.");
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(4);
+ obu_header.type = static_cast<libgav1::ObuType>(scratch);
+ OBU_READ_BIT_OR_FAIL;
+ const bool extension_flag = scratch != 0;
+ OBU_READ_BIT_OR_FAIL;
+ obu_header.has_size_field = scratch != 0;
+ OBU_READ_BIT_OR_FAIL; // reserved.
+ if (scratch != 0) {
+ LIBGAV1_DLOG(WARNING, "obu_reserved_1bit is not zero.");
+ }
+ obu_header.has_extension = extension_flag;
+ if (extension_flag) {
+ if (extension_disallowed_) {
+ LIBGAV1_DLOG(ERROR,
+ "OperatingPointIdc is 0, but obu_extension_flag is 1.");
+ return false;
+ }
+ OBU_READ_LITERAL_OR_FAIL(3);
+ obu_header.temporal_id = scratch;
+ OBU_READ_LITERAL_OR_FAIL(2);
+ obu_header.spatial_id = scratch;
+ OBU_READ_LITERAL_OR_FAIL(3); // reserved.
+ if (scratch != 0) {
+ LIBGAV1_DLOG(WARNING, "extension_header_reserved_3bits is not zero.");
+ }
+ } else {
+ obu_header.temporal_id = 0;
+ obu_header.spatial_id = 0;
+ }
+ return obu_headers_.push_back(obu_header);
+}
+
+#undef OBU_READ_UVLC_OR_FAIL
+#undef OBU_READ_LITERAL_OR_FAIL
+#undef OBU_READ_BIT_OR_FAIL
+#undef OBU_PARSER_FAIL
+#undef OBU_LOG_AND_RETURN_FALSE
+
+bool ObuParser::InitBitReader(const uint8_t* const data, size_t size) {
+ bit_reader_.reset(new (std::nothrow) RawBitReader(data, size));
+ return bit_reader_ != nullptr;
+}
+
+bool ObuParser::EnsureCurrentFrameIsNotNull() {
+ if (current_frame_ != nullptr) return true;
+ current_frame_ = buffer_pool_->GetFreeBuffer();
+ if (current_frame_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
+ return false;
+ }
+ return true;
+}
+
+bool ObuParser::HasData() const { return size_ > 0; }
+
+StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) {
+ if (data_ == nullptr || size_ == 0) return kStatusInvalidArgument;
+
+ assert(current_frame_ == nullptr);
+ // This is used to release any references held in case of parsing failure.
+ RefCountedBufferPtrCleanup current_frame_cleanup(&current_frame_);
+
+ const uint8_t* data = data_;
+ size_t size = size_;
+
+ // Clear everything except the sequence header.
+ obu_headers_.clear();
+ frame_header_ = {};
+ tile_buffers_.clear();
+ next_tile_group_start_ = 0;
+ sequence_header_changed_ = false;
+
+ bool parsed_one_full_frame = false;
+ bool seen_frame_header = false;
+ const uint8_t* frame_header = nullptr;
+ size_t frame_header_size_in_bits = 0;
+ while (size > 0 && !parsed_one_full_frame) {
+ if (!InitBitReader(data, size)) {
+ LIBGAV1_DLOG(ERROR, "Failed to initialize bit reader.");
+ return kStatusOutOfMemory;
+ }
+ if (!ParseHeader()) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU Header.");
+ return kStatusBitstreamError;
+ }
+ const ObuHeader& obu_header = obu_headers_.back();
+ if (!obu_header.has_size_field) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "has_size_field is zero. libgav1 does not support such streams.");
+ return kStatusUnimplemented;
+ }
+ const size_t obu_header_size = bit_reader_->byte_offset();
+ size_t obu_size;
+ if (!bit_reader_->ReadUnsignedLeb128(&obu_size)) {
+ LIBGAV1_DLOG(ERROR, "Could not read OBU size.");
+ return kStatusBitstreamError;
+ }
+ const size_t obu_length_size = bit_reader_->byte_offset() - obu_header_size;
+ if (size - bit_reader_->byte_offset() < obu_size) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits left to parse OBU %zu vs %zu.",
+ size - bit_reader_->bit_offset(), obu_size);
+ return kStatusBitstreamError;
+ }
+
+ const ObuType obu_type = obu_header.type;
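+ // Drop layer-specific OBUs whose temporal or spatial ids fall outside the
+ // operating point being decoded. Sequence header and temporal delimiter
+ // OBUs apply to all operating points.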
+ if (obu_type != kObuSequenceHeader && obu_type != kObuTemporalDelimiter &&
+ has_sequence_header_ &&
+ sequence_header_.operating_point_idc[operating_point_] != 0 &&
+ obu_header.has_extension &&
+ (!InTemporalLayer(
+ sequence_header_.operating_point_idc[operating_point_],
+ obu_header.temporal_id) ||
+ !InSpatialLayer(sequence_header_.operating_point_idc[operating_point_],
+ obu_header.spatial_id))) {
+ obu_headers_.pop_back();
+ bit_reader_->SkipBytes(obu_size);
+ data += bit_reader_->byte_offset();
+ size -= bit_reader_->byte_offset();
+ continue;
+ }
+
+ const size_t obu_start_position = bit_reader_->bit_offset();
+ // The bit_reader_ is byte aligned after reading obu_header and obu_size.
+ // Therefore the byte offset can be computed as obu_start_position >> 3
+ // below.
+ assert((obu_start_position & 7) == 0);
+ bool obu_skipped = false;
+ switch (obu_type) {
+ case kObuTemporalDelimiter:
+ break;
+ case kObuSequenceHeader:
+ if (!ParseSequenceHeader(seen_frame_header)) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse SequenceHeader OBU.");
+ return kStatusBitstreamError;
+ }
+ if (sequence_header_.color_config.bitdepth > LIBGAV1_MAX_BITDEPTH) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Bitdepth %d is not supported. The maximum bitdepth is %d.",
+ sequence_header_.color_config.bitdepth, LIBGAV1_MAX_BITDEPTH);
+ return kStatusUnimplemented;
+ }
+ break;
+ case kObuFrameHeader:
+ if (seen_frame_header) {
+ LIBGAV1_DLOG(ERROR,
+ "Frame header found but frame header was already seen.");
+ return kStatusBitstreamError;
+ }
+ if (!ParseFrameHeader()) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse FrameHeader OBU.");
+ return kStatusBitstreamError;
+ }
+ frame_header = &data[obu_start_position >> 3];
+ frame_header_size_in_bits =
+ bit_reader_->bit_offset() - obu_start_position;
+ seen_frame_header = true;
+ parsed_one_full_frame = frame_header_.show_existing_frame;
+ break;
+ case kObuRedundantFrameHeader: {
+ if (!seen_frame_header) {
+ LIBGAV1_DLOG(ERROR,
+ "Redundant frame header found but frame header was not "
+ "yet seen.");
+ return kStatusBitstreamError;
+ }
+ const size_t fh_size = (frame_header_size_in_bits + 7) >> 3;
+ if (obu_size < fh_size ||
+ memcmp(frame_header, &data[obu_start_position >> 3], fh_size) !=
+ 0) {
+ LIBGAV1_DLOG(ERROR,
+ "Redundant frame header differs from frame header.");
+ return kStatusBitstreamError;
+ }
+ bit_reader_->SkipBits(frame_header_size_in_bits);
+ break;
+ }
+ case kObuFrame: {
+ const size_t fh_start_offset = bit_reader_->byte_offset();
+ if (seen_frame_header) {
+ LIBGAV1_DLOG(ERROR,
+ "Frame header found but frame header was already seen.");
+ return kStatusBitstreamError;
+ }
+ if (!ParseFrameHeader()) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse FrameHeader in Frame OBU.");
+ return kStatusBitstreamError;
+ }
+ // Section 6.8.2: If obu_type is equal to OBU_FRAME, it is a
+ // requirement of bitstream conformance that show_existing_frame is
+ // equal to 0.
+ if (frame_header_.show_existing_frame) {
+ LIBGAV1_DLOG(ERROR, "Frame OBU cannot set show_existing_frame to 1.");
+ return kStatusBitstreamError;
+ }
+ if (!bit_reader_->AlignToNextByte()) {
+ LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+ return kStatusBitstreamError;
+ }
+ const size_t fh_size = bit_reader_->byte_offset() - fh_start_offset;
+ if (fh_size >= obu_size) {
+ LIBGAV1_DLOG(ERROR, "Frame header size (%zu) >= obu_size (%zu).",
+ fh_size, obu_size);
+ return kStatusBitstreamError;
+ }
+ if (!ParseTileGroup(obu_size - fh_size,
+ size_ - size + bit_reader_->byte_offset())) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse TileGroup in Frame OBU.");
+ return kStatusBitstreamError;
+ }
+ parsed_one_full_frame = true;
+ break;
+ }
+ case kObuTileGroup:
+ if (!ParseTileGroup(obu_size,
+ size_ - size + bit_reader_->byte_offset())) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse TileGroup OBU.");
+ return kStatusBitstreamError;
+ }
+ parsed_one_full_frame =
+ (next_tile_group_start_ == frame_header_.tile_info.tile_count);
+ break;
+ case kObuTileList:
+ LIBGAV1_DLOG(ERROR, "Decoding of tile list OBUs is not supported.");
+ return kStatusUnimplemented;
+ case kObuPadding:
+ if (!ParsePadding(&data[obu_start_position >> 3], obu_size)) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse Padding OBU.");
+ return kStatusBitstreamError;
+ }
+ break;
+ case kObuMetadata:
+ if (!ParseMetadata(&data[obu_start_position >> 3], obu_size)) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse Metadata OBU.");
+ return kStatusBitstreamError;
+ }
+ break;
+ default:
+ // Skip reserved OBUs. Section 6.2.2: Reserved units are for future use
+ // and shall be ignored by AV1 decoders.
+ bit_reader_->SkipBytes(obu_size);
+ obu_skipped = true;
+ break;
+ }
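+ // Verify the trailing bits for OBUs that were parsed bit by bit above.
+ // Frame and tile group OBUs consume their payloads themselves, and skipped
+ // OBUs have already been advanced past.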
+ if (obu_size > 0 && !obu_skipped && obu_type != kObuFrame &&
+ obu_type != kObuTileGroup) {
+ const size_t parsed_obu_size_in_bits =
+ bit_reader_->bit_offset() - obu_start_position;
+ if (obu_size * 8 < parsed_obu_size_in_bits) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Parsed OBU size (%zu bits) is greater than expected OBU size "
+ "(%zu bytes) obu_type: %d.",
+ parsed_obu_size_in_bits, obu_size, obu_type);
+ return kStatusBitstreamError;
+ }
+ if (!bit_reader_->VerifyAndSkipTrailingBits(obu_size * 8 -
+ parsed_obu_size_in_bits)) {
+ LIBGAV1_DLOG(ERROR,
+ "Error when verifying trailing bits for obu type: %d",
+ obu_type);
+ return kStatusBitstreamError;
+ }
+ }
+ const size_t bytes_consumed = bit_reader_->byte_offset();
+ const size_t consumed_obu_size =
+ bytes_consumed - obu_length_size - obu_header_size;
+ if (consumed_obu_size != obu_size) {
+ LIBGAV1_DLOG(ERROR,
+ "OBU size (%zu) and consumed size (%zu) does not match for "
+ "obu_type: %d.",
+ obu_size, consumed_obu_size, obu_type);
+ return kStatusBitstreamError;
+ }
+ data += bytes_consumed;
+ size -= bytes_consumed;
+ }
+ if (!parsed_one_full_frame && seen_frame_header) {
+ LIBGAV1_DLOG(ERROR, "The last tile group in the frame was not received.");
+ return kStatusBitstreamError;
+ }
+ data_ = data;
+ size_ = size;
+ *current_frame = std::move(current_frame_);
+ return kStatusOk;
+}
+
+// AV1CodecConfigurationBox specification:
+// https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox.
+// static
+std::unique_ptr<uint8_t[]> ObuParser::GetAV1CodecConfigurationBox(
+ const uint8_t* data, size_t size, size_t* const av1c_size) {
+ if (data == nullptr || av1c_size == nullptr) return nullptr;
+
+ ObuSequenceHeader sequence_header;
+ size_t sequence_header_offset;
+ size_t sequence_header_size;
+ const StatusCode status =
+ ParseBasicStreamInfo(data, size, &sequence_header,
+ &sequence_header_offset, &sequence_header_size);
+ if (status != kStatusOk) {
+ *av1c_size = 0;
+ return nullptr;
+ }
+
+ *av1c_size = 4 + sequence_header_size;
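+ // The av1C payload is a fixed 4-byte header followed by the raw sequence
+ // header OBU (the configOBUs field).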
+ std::unique_ptr<uint8_t[]> av1c_ptr(new (std::nothrow) uint8_t[*av1c_size]);
+ if (av1c_ptr == nullptr) {
+ *av1c_size = 0;
+ return nullptr;
+ }
+ uint8_t* av1c = av1c_ptr.get();
+ // unsigned int (1) marker = 1;
+ // unsigned int (7) version = 1;
+ av1c[0] = 0x81;
+
+ // unsigned int (3) seq_profile;
+ // unsigned int (5) seq_level_idx_0;
+ const uint8_t seq_level_idx_0 = ((sequence_header.level[0].major - 2) << 2) |
+ sequence_header.level[0].minor;
+ av1c[1] = (sequence_header.profile << 5) | seq_level_idx_0;
+
+ // unsigned int (1) seq_tier_0;
+ // unsigned int (1) high_bitdepth;
+ // unsigned int (1) twelve_bit;
+ // unsigned int (1) monochrome;
+ // unsigned int (1) chroma_subsampling_x;
+ // unsigned int (1) chroma_subsampling_y;
+ // unsigned int (2) chroma_sample_position;
+ const auto high_bitdepth =
+ static_cast<uint8_t>(sequence_header.color_config.bitdepth > 8);
+ const auto twelve_bit =
+ static_cast<uint8_t>(sequence_header.color_config.bitdepth == 12);
+ av1c[2] =
+ (sequence_header.tier[0] << 7) | (high_bitdepth << 6) |
+ (twelve_bit << 5) |
+ (static_cast<uint8_t>(sequence_header.color_config.is_monochrome) << 4) |
+ (sequence_header.color_config.subsampling_x << 3) |
+ (sequence_header.color_config.subsampling_y << 2) |
+ sequence_header.color_config.chroma_sample_position;
+
+ // unsigned int (3) reserved = 0;
+ // unsigned int (1) initial_presentation_delay_present;
+ // if (initial_presentation_delay_present) {
+ // unsigned int (4) initial_presentation_delay_minus_one;
+ // } else {
+ // unsigned int (4) reserved = 0;
+ // }
+ av1c[3] = 0;
+
+ // unsigned int (8) configOBUs[];
+ memcpy(av1c + 4, data + sequence_header_offset, sequence_header_size);
+
+ return av1c_ptr;
+}
+
+// static
+StatusCode ObuParser::ParseBasicStreamInfo(const uint8_t* data, size_t size,
+ ObuSequenceHeader* sequence_header,
+ size_t* sequence_header_offset,
+ size_t* sequence_header_size) {
+ DecoderState state;
+ ObuParser parser(nullptr, 0, 0, nullptr, &state);
+ if (!parser.InitBitReader(data, size)) {
+ LIBGAV1_DLOG(ERROR, "Failed to initialize bit reader.");
+ return kStatusOutOfMemory;
+ }
+ while (!parser.bit_reader_->Finished()) {
+ const size_t obu_start_offset = parser.bit_reader_->byte_offset();
+ if (!parser.ParseHeader()) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU Header.");
+ return kStatusBitstreamError;
+ }
+ const ObuHeader& obu_header = parser.obu_headers_.back();
+ if (!obu_header.has_size_field) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "has_size_field is zero. libgav1 does not support such streams.");
+ return kStatusUnimplemented;
+ }
+ size_t obu_size;
+ if (!parser.bit_reader_->ReadUnsignedLeb128(&obu_size)) {
+ LIBGAV1_DLOG(ERROR, "Could not read OBU size.");
+ return kStatusBitstreamError;
+ }
+ if (size - parser.bit_reader_->byte_offset() < obu_size) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits left to parse OBU %zu vs %zu.",
+ size - parser.bit_reader_->bit_offset(), obu_size);
+ return kStatusBitstreamError;
+ }
+ if (obu_header.type != kObuSequenceHeader) {
+ parser.obu_headers_.pop_back();
+ parser.bit_reader_->SkipBytes(obu_size);
+ continue;
+ }
+ const size_t obu_start_position = parser.bit_reader_->bit_offset();
+ if (!parser.ParseSequenceHeader(false)) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse SequenceHeader OBU.");
+ return kStatusBitstreamError;
+ }
+ const size_t parsed_obu_size_in_bits =
+ parser.bit_reader_->bit_offset() - obu_start_position;
+ const uint64_t obu_size_in_bits = static_cast<uint64_t>(obu_size) * 8;
+ if (obu_size_in_bits < parsed_obu_size_in_bits) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Parsed OBU size (%zu bits) is greater than expected OBU size "
+ "(%zu bytes)..",
+ parsed_obu_size_in_bits, obu_size);
+ return kStatusBitstreamError;
+ }
+ if (!parser.bit_reader_->VerifyAndSkipTrailingBits(
+ static_cast<size_t>(obu_size_in_bits - parsed_obu_size_in_bits))) {
+ LIBGAV1_DLOG(
+ ERROR, "Error when verifying trailing bits for the sequence header.");
+ return kStatusBitstreamError;
+ }
+ *sequence_header = parser.sequence_header_;
+ *sequence_header_offset = obu_start_offset;
+ *sequence_header_size =
+ parser.bit_reader_->byte_offset() - obu_start_offset;
+ return kStatusOk;
+ }
+ // Sequence header was never found.
+ return kStatusBitstreamError;
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_OBU_PARSER_H_
+#define LIBGAV1_SRC_OBU_PARSER_H_
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/common.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/status_code.h"
+#include "src/quantizer.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/raw_bit_reader.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+// structs and enums related to Open Bitstream Units (OBU).
+
+enum {
+ kMinimumMajorBitstreamLevel = 2,
+ kSelectScreenContentTools = 2,
+ kSelectIntegerMv = 2,
+ kLoopRestorationTileSizeMax = 256,
+ kGlobalMotionAlphaBits = 12,
+ kGlobalMotionTranslationBits = 12,
+ kGlobalMotionTranslationOnlyBits = 9,
+ kGlobalMotionAlphaPrecisionBits = 15,
+ kGlobalMotionTranslationPrecisionBits = 6,
+ kGlobalMotionTranslationOnlyPrecisionBits = 3,
+ kMaxTileWidth = 4096,
+ kMaxTileArea = 4096 * 2304,
+ kPrimaryReferenceNone = 7,
+ // A special value of the scalability_mode_idc syntax element that indicates
+ // the picture prediction structure is specified in scalability_structure().
+ kScalabilitySS = 14
+}; // anonymous enum
+
+struct ObuHeader {
+ ObuType type;
+ bool has_extension;
+ bool has_size_field;
+ int8_t temporal_id;
+ int8_t spatial_id;
+};
+
+enum BitstreamProfile : uint8_t {
+ kProfile0,
+ kProfile1,
+ kProfile2,
+ kMaxProfiles
+};
+
+// In the bitstream the level is encoded in five bits: the first three bits
+// encode |major| - 2 and the last two bits encode |minor|.
+//
+// If the mapped level (major.minor) is in the tables in Annex A.3, there are
+// bitstream conformance requirements on the maximum or minimum values of
+// several variables. The encoded value of 31 (which corresponds to the mapped
+// level 9.3) is the "maximum parameters" level and imposes no level-based
+// constraints on the bitstream.
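+//
+// For example, the encoded value 13 (0b01101) yields major = 0b011 + 2 = 5 and
+// minor = 0b01 = 1, which maps to level 5.1.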
+struct BitStreamLevel {
+ uint8_t major; // Range: 2-9.
+ uint8_t minor; // Range: 0-3.
+};
+
+struct ColorConfig {
+ int8_t bitdepth;
+ bool is_monochrome;
+ ColorPrimary color_primary;
+ TransferCharacteristics transfer_characteristics;
+ MatrixCoefficients matrix_coefficients;
+ // A binary value (0 or 1) that is associated with the VideoFullRangeFlag
+ // variable specified in ISO/IEC 23091-4/ITU-T H.273.
+ // * 0: the studio swing representation.
+ // * 1: the full swing representation.
+ ColorRange color_range;
+ int8_t subsampling_x;
+ int8_t subsampling_y;
+ ChromaSamplePosition chroma_sample_position;
+ bool separate_uv_delta_q;
+};
+
+struct TimingInfo {
+ uint32_t num_units_in_tick;
+ uint32_t time_scale;
+ bool equal_picture_interval;
+ uint32_t num_ticks_per_picture;
+};
+
+struct DecoderModelInfo {
+ uint8_t encoder_decoder_buffer_delay_length;
+ uint32_t num_units_in_decoding_tick;
+ uint8_t buffer_removal_time_length;
+ uint8_t frame_presentation_time_length;
+};
+
+struct OperatingParameters {
+ uint32_t decoder_buffer_delay[kMaxOperatingPoints];
+ uint32_t encoder_buffer_delay[kMaxOperatingPoints];
+ bool low_delay_mode_flag[kMaxOperatingPoints];
+};
+
+struct ObuSequenceHeader {
+ // Section 7.5:
+ // Within a particular coded video sequence, the contents of
+ // sequence_header_obu must be bit-identical each time the sequence header
+ // appears except for the contents of operating_parameters_info. A new
+ // coded video sequence is required if the sequence header parameters
+ // change.
+ //
+ // IMPORTANT: ParametersChanged() is implemented with a memcmp() call. For
+ // this to work, this object and the |old| object must be initialized with
+ // an empty brace-enclosed list, which initializes any padding to zero bits.
+ // See https://en.cppreference.com/w/cpp/language/zero_initialization.
+ bool ParametersChanged(const ObuSequenceHeader& old) const;
+
+ BitstreamProfile profile;
+ bool still_picture;
+ bool reduced_still_picture_header;
+ int operating_points;
+ int operating_point_idc[kMaxOperatingPoints];
+ BitStreamLevel level[kMaxOperatingPoints];
+ int8_t tier[kMaxOperatingPoints];
+ int8_t frame_width_bits;
+ int8_t frame_height_bits;
+ int32_t max_frame_width;
+ int32_t max_frame_height;
+ bool frame_id_numbers_present;
+ int8_t frame_id_length_bits;
+ int8_t delta_frame_id_length_bits;
+ bool use_128x128_superblock;
+ bool enable_filter_intra;
+ bool enable_intra_edge_filter;
+ bool enable_interintra_compound;
+ bool enable_masked_compound;
+ bool enable_warped_motion;
+ bool enable_dual_filter;
+ bool enable_order_hint;
+ // If enable_order_hint is true, order_hint_bits is in the range [1, 8].
+ // If enable_order_hint is false, order_hint_bits is 0.
+ int8_t order_hint_bits;
+ // order_hint_shift_bits equals (32 - order_hint_bits) % 32.
+ // This is used frequently in GetRelativeDistance().
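+ // (The shift is used to sign-extend the modular difference of two order
+ // hints.)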
+ uint8_t order_hint_shift_bits;
+ bool enable_jnt_comp;
+ bool enable_ref_frame_mvs;
+ bool choose_screen_content_tools;
+ int8_t force_screen_content_tools;
+ bool choose_integer_mv;
+ int8_t force_integer_mv;
+ bool enable_superres;
+ bool enable_cdef;
+ bool enable_restoration;
+ ColorConfig color_config;
+ bool timing_info_present_flag;
+ TimingInfo timing_info;
+ bool decoder_model_info_present_flag;
+ DecoderModelInfo decoder_model_info;
+ bool decoder_model_present_for_operating_point[kMaxOperatingPoints];
+ bool initial_display_delay_present_flag;
+ uint8_t initial_display_delay[kMaxOperatingPoints];
+ bool film_grain_params_present;
+
+ // IMPORTANT: the operating_parameters member must be at the end of the
+ // struct so that ParametersChanged() can be implemented with a memcmp()
+ // call.
+ OperatingParameters operating_parameters;
+};
+// Verify it is safe to use offsetof with ObuSequenceHeader and to use memcmp
+// to compare two ObuSequenceHeader objects.
+static_assert(std::is_standard_layout<ObuSequenceHeader>::value, "");
+// Verify operating_parameters is the last member of ObuSequenceHeader. The
+// second assertion assumes that ObuSequenceHeader has no padding after the
+// operating_parameters field. The first assertion is a sufficient condition
+// for ObuSequenceHeader to have no padding after the operating_parameters
+// field.
+static_assert(alignof(ObuSequenceHeader) == alignof(OperatingParameters), "");
+static_assert(sizeof(ObuSequenceHeader) ==
+ offsetof(ObuSequenceHeader, operating_parameters) +
+ sizeof(OperatingParameters),
+ "");
+
+struct TileBuffer {
+ const uint8_t* data;
+ size_t size;
+};
+
+enum MetadataType : uint8_t {
+ // 0 is reserved for AOM use.
+ kMetadataTypeHdrContentLightLevel = 1,
+ kMetadataTypeHdrMasteringDisplayColorVolume = 2,
+ kMetadataTypeScalability = 3,
+ kMetadataTypeItutT35 = 4,
+ kMetadataTypeTimecode = 5,
+ // 6-31 are unregistered user private.
+ // 32 and greater are reserved for AOM use.
+};
+
+class ObuParser : public Allocable {
+ public:
+ ObuParser(const uint8_t* const data, size_t size, int operating_point,
+ BufferPool* const buffer_pool, DecoderState* const decoder_state)
+ : data_(data),
+ size_(size),
+ operating_point_(operating_point),
+ buffer_pool_(buffer_pool),
+ decoder_state_(*decoder_state) {}
+
+ // Not copyable or movable.
+ ObuParser(const ObuParser& rhs) = delete;
+ ObuParser& operator=(const ObuParser& rhs) = delete;
+
+ // Returns true if there is more data that needs to be parsed.
+ bool HasData() const;
+
+ // Parses a sequence of Open Bitstream Units until a decodable frame is found
+ // (or until the end of stream is reached). A decodable frame is considered to
+ // be found when one of the following happens:
+ // * A kObuFrame is seen.
+ // * The kObuTileGroup containing the last tile is seen.
+ // * A kObuFrameHeader with show_existing_frame = true is seen.
+ //
+ // If the parsing is successful, relevant fields will be populated. The fields
+ // are valid only if the return value is kStatusOk. Returns kStatusOk on
+ // success, an error status otherwise. On success, |current_frame| will be
+ // populated with a valid frame buffer.
+ StatusCode ParseOneFrame(RefCountedBufferPtr* current_frame);
+
+ // Get the AV1CodecConfigurationBox as described in
+ // https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox. This
+ // does minimal bitstream parsing to obtain the necessary information to
+ // generate the av1c box. Returns a std::unique_ptr that contains the av1c
+ // data on success, nullptr otherwise. |av1c_size| must not be nullptr and
+ // will contain the size of the buffer pointed to by the std::unique_ptr.
+ static std::unique_ptr<uint8_t[]> GetAV1CodecConfigurationBox(
+ const uint8_t* data, size_t size, size_t* av1c_size);
+
+ // Getters. Only valid if ParseOneFrame() completes successfully.
+ const Vector<ObuHeader>& obu_headers() const { return obu_headers_; }
+ const ObuSequenceHeader& sequence_header() const { return sequence_header_; }
+ const ObuFrameHeader& frame_header() const { return frame_header_; }
+ const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; }
+ // Returns true if the last call to ParseOneFrame() encountered a sequence
+ // header change.
+ bool sequence_header_changed() const { return sequence_header_changed_; }
+
+ // Setters.
+ void set_sequence_header(const ObuSequenceHeader& sequence_header) {
+ sequence_header_ = sequence_header;
+ has_sequence_header_ = true;
+ }
+
+ // Moves |tile_buffers_| into |tile_buffers|.
+ void MoveTileBuffers(Vector<TileBuffer>* tile_buffers) {
+ *tile_buffers = std::move(tile_buffers_);
+ }
+
+ private:
+ // Initializes the bit reader. This is a function of its own to make unit
+ // testing of private functions simpler.
+ LIBGAV1_MUST_USE_RESULT bool InitBitReader(const uint8_t* data, size_t size);
+
+ // Parse helper functions.
+ bool ParseHeader(); // 5.3.2 and 5.3.3.
+ bool ParseColorConfig(ObuSequenceHeader* sequence_header); // 5.5.2.
+ bool ParseTimingInfo(ObuSequenceHeader* sequence_header); // 5.5.3.
+ bool ParseDecoderModelInfo(ObuSequenceHeader* sequence_header); // 5.5.4.
+ bool ParseOperatingParameters(ObuSequenceHeader* sequence_header,
+ int index); // 5.5.5.
+ bool ParseSequenceHeader(bool seen_frame_header); // 5.5.1.
+ bool ParseFrameParameters(); // 5.9.2, 5.9.7 and 5.9.10.
+ void MarkInvalidReferenceFrames(); // 5.9.4.
+ bool ParseFrameSizeAndRenderSize(); // 5.9.5 and 5.9.6.
+ bool ParseSuperResParametersAndComputeImageSize(); // 5.9.8 and 5.9.9.
+ // Checks the bitstream conformance requirement in Section 6.8.6.
+ bool ValidateInterFrameSize() const;
+ bool ParseReferenceOrderHint();
+ static int FindLatestBackwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+ static int FindEarliestBackwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+ static int FindLatestForwardReference(
+ const int current_frame_hint,
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+ const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+ static int FindReferenceWithSmallestOutputOrder(
+ const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints);
+ bool SetFrameReferences(int8_t last_frame_idx,
+ int8_t gold_frame_idx); // 7.8.
+ bool ParseLoopFilterParameters(); // 5.9.11.
+ bool ParseDeltaQuantizer(int8_t* delta); // 5.9.13.
+ bool ParseQuantizerParameters(); // 5.9.12.
+ bool ParseSegmentationParameters(); // 5.9.14.
+ bool ParseQuantizerIndexDeltaParameters(); // 5.9.17.
+ bool ParseLoopFilterDeltaParameters(); // 5.9.18.
+ void ComputeSegmentLosslessAndQIndex();
+ bool ParseCdefParameters(); // 5.9.19.
+ bool ParseLoopRestorationParameters(); // 5.9.20.
+ bool ParseTxModeSyntax(); // 5.9.21.
+ bool ParseFrameReferenceModeSyntax(); // 5.9.23.
+ // Returns whether skip mode is allowed. When it returns true, it also sets
+ // the frame_header_.skip_mode_frame array.
+ bool IsSkipModeAllowed();
+ bool ParseSkipModeParameters(); // 5.9.22.
+ bool ReadAllowWarpedMotion();
+ bool ParseGlobalParamSyntax(
+ int ref, int index,
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>&
+ prev_global_motions); // 5.9.25.
+ bool ParseGlobalMotionParameters(); // 5.9.24.
+ bool ParseFilmGrainParameters(); // 5.9.30.
+ bool ParseTileInfoSyntax(); // 5.9.15.
+ bool ParseFrameHeader(); // 5.9.
+ // |data| and |size| specify the payload data of the padding OBU.
+ // NOTE: Although the payload data is available in the bit_reader_ member,
+ // it is also passed to ParsePadding() as function parameters so that
+ // ParsePadding() can find the trailing bit of the OBU and skip over the
+ // payload data as an opaque chunk of data.
+ bool ParsePadding(const uint8_t* data, size_t size); // 5.7.
+ bool ParseMetadataScalability(); // 5.8.5 and 5.8.6.
+ bool ParseMetadataTimecode(); // 5.8.7.
+ // |data| and |size| specify the payload data of the metadata OBU.
+ // NOTE: Although the payload data is available in the bit_reader_ member,
+ // it is also passed to ParseMetadata() as function parameters so that
+ // ParseMetadata() can find the trailing bit of the OBU and either extract
+ // or skip over the payload data as an opaque chunk of data.
+ bool ParseMetadata(const uint8_t* data, size_t size); // 5.8.
+ // Adds and populates the TileBuffer for each tile in the tile group and
+ // updates |next_tile_group_start_|.
+ bool AddTileBuffers(int start, int end, size_t total_size,
+ size_t tg_header_size, size_t bytes_consumed_so_far);
+ bool ParseTileGroup(size_t size, size_t bytes_consumed_so_far); // 5.11.1.
+
+ // Populates |current_frame_| from the |buffer_pool_| if |current_frame_| is
+ // nullptr. Does not do anything otherwise. Returns true on success, false
+ // otherwise.
+ bool EnsureCurrentFrameIsNotNull();
+
+ // Parses the basic bitstream information from the given AV1 stream in |data|.
+ // This is used for generating the AV1CodecConfigurationBox.
+ static StatusCode ParseBasicStreamInfo(const uint8_t* data, size_t size,
+ ObuSequenceHeader* sequence_header,
+ size_t* sequence_header_offset,
+ size_t* sequence_header_size);
+
+ // Parser elements.
+ std::unique_ptr<RawBitReader> bit_reader_;
+ const uint8_t* data_;
+ size_t size_;
+ const int operating_point_;
+
+ // OBU elements. Only valid if ParseOneFrame() completes successfully.
+ Vector<ObuHeader> obu_headers_;
+ ObuSequenceHeader sequence_header_ = {};
+ ObuFrameHeader frame_header_ = {};
+ Vector<TileBuffer> tile_buffers_;
+ // The expected starting tile number of the next Tile Group.
+ int next_tile_group_start_ = 0;
+ // If true, the sequence_header_ field is valid.
+ bool has_sequence_header_ = false;
+ // If true, it means that the last call to ParseOneFrame() encountered a
+ // sequence header change.
+ bool sequence_header_changed_ = false;
+ // If true, the obu_extension_flag syntax element in the OBU header must be
+ // 0. Set to true when parsing a sequence header if OperatingPointIdc is 0.
+ bool extension_disallowed_ = false;
+
+ BufferPool* const buffer_pool_;
+ DecoderState& decoder_state_;
+ // Used by ParseOneFrame() to populate the current frame that is being
+ // decoded. The invariant maintained is that this variable will be nullptr at
+ // the beginning and at the end of each call to ParseOneFrame(). This ensures
+ // that the ObuParser is not holding on to any references to the current
+ // frame once the ParseOneFrame() call is complete.
+ RefCountedBufferPtr current_frame_;
+
+ // For unit testing private functions.
+ friend class ObuParserTest;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_OBU_PARSER_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/obu_parser.h"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "src/buffer_pool.h"
+#include "src/decoder_impl.h"
+#include "src/decoder_state.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/status_code.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/types.h"
+#include "src/utils/vector.h"
+#include "tests/third_party/libvpx/acm_random.h"
+
+// Note the following test classes access private functions/members of
+// ObuParser. To be declared friends of ObuParser they must not have internal
+// linkage (they must be outside the anonymous namespace).
+namespace libgav1 {
+
+// Helper class to manipulate individual bits and generate a byte string.
+class BytesAndBits {
+ public:
+ // Append a bit to the end.
+ void AppendBit(uint8_t bit) { bits_.push_back(bit != 0); }
+
+ // Append a byte to the end.
+ void AppendByte(uint8_t byte) {
+ for (int i = 0; i < 8; ++i) {
+ AppendBit(GetNthBit(byte, i, 8));
+ }
+ }
+
+ // Append a literal of size |bits| to the end.
+ void AppendLiteral(int bits, int value) {
+ InsertLiteral(static_cast<int>(bits_.size()), bits, value);
+ }
+
+ // Append an inverse signed literal to the end. |bits + 1| bits are appended.
+ void AppendInverseSignedLiteral(int bits, int value) {
+ InsertInverseSignedLiteral(static_cast<int>(bits_.size()), bits, value);
+ }
+
+ // Append a sequence of bytes to the end.
+ void AppendBytes(const std::vector<uint8_t>& bytes) {
+ for (const auto& byte : bytes) {
+ AppendByte(byte);
+ }
+ }
+
+ // Insert |bit| at |offset|. Moves the bits at and after |offset| to the
+ // right by 1.
+ void InsertBit(int offset, uint8_t bit) {
+ auto iterator = bits_.begin();
+ bits_.insert(iterator + offset, bit != 0);
+ }
+
+ // Insert |value| of size |bits| at offset |offset|. Moves the bits at and
+ // after |offset| to the right by |bits|.
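+ // Bits are inserted most significant bit first; for example,
+ // InsertLiteral(0, 3, 5) inserts the bits 1, 0, 1 at offsets 0, 1 and 2.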
+ void InsertLiteral(int offset, int bits, int value) {
+ for (int i = 0; i < bits; ++i) {
+ InsertBit(i + offset, GetNthBit(value, i, bits));
+ }
+ }
+
+ // Insert |value| of size |bits| at offset |offset| as an inverse signed
+ // literal. Move all other bits to the right by |bits + 1|.
+ //
+ // Note: This is denoted su(1+bits) in the spec.
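+ //
+ // For example, |value| -20 with |bits| 6 is written as the sign bit 1
+ // followed by 101100 (the low 6 bits of -20 in two's complement); a decoder
+ // reading su(1+6) sees the 7-bit literal 108 and maps it back to
+ // 108 - 128 = -20.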
+ void InsertInverseSignedLiteral(int offset, int bits, int value) {
+ InsertBit(offset, (value >= 0) ? 0 : 1);
+ InsertLiteral(offset + 1, bits, value);
+ }
+
+ // Insert |value| at |offset| as an unsigned variable length code (uvlc).
+ // Return the number of bits inserted.
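+ //
+ // For example, |value| 6 is inserted as the 5 bits 00111: two leading zeros
+ // followed by the 3-bit literal 7 (|value| + 1).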
+ int InsertUvlc(int offset, int value) {
+ int leading_zeros = 1;
+ int shift_value = ++value;
+ while ((shift_value >>= 1) != 0) leading_zeros += 2;
+ int bits = 0;
+ InsertLiteral(offset, leading_zeros >> 1, 0);
+ bits += leading_zeros >> 1;
+ InsertLiteral(offset + bits, (leading_zeros + 1) >> 1, value);
+ bits += (leading_zeros + 1) >> 1;
+ return bits;
+ }
+
+ // Set the bit at |offset| to |bit|. The bit should already exist.
+ void SetBit(int offset, uint8_t bit) { bits_[offset] = bit != 0; }
+
+ // Set |bits| starting at |offset| to |value|. The bits should already exist.
+ void SetLiteral(int offset, int bits, int value) {
+ for (int i = 0; i < bits; ++i) {
+ SetBit(offset + i, GetNthBit(value, i, bits));
+ }
+ }
+
+ // Remove the bit at |offset|. Moves all the following bits to the left by
+ // 1.
+ void RemoveBit(int offset) { RemoveLiteral(offset, 1); }
+
+ // Remove a literal of size |bits| starting at |offset|. Moves all the
+ // following bits to the left by |bits|.
+ void RemoveLiteral(int offset, int bits) {
+ bits_.erase(bits_.begin() + offset, bits_.begin() + offset + bits);
+ }
+
+ // Remove all bits from |offset| to the end.
+ void RemoveAllBitsAfter(int offset) {
+ RemoveLiteral(offset, static_cast<int>(bits_.size()) - offset);
+ }
+
+ // Clear all the bits stored.
+ void Clear() { bits_.clear(); }
+
+ // Generate the data vector from the bits. Pads the last byte with 0 bits if
+ // necessary.
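+ //
+ // For example, the nine bits 1 1 1 1 0 0 0 0 1 produce the two bytes
+ // {0xf0, 0x80}; bits are packed most significant bit first.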
+ const std::vector<uint8_t>& GenerateData() {
+ data_.clear();
+ for (size_t i = 0; i < bits_.size(); i += 8) {
+ uint8_t byte = 0;
+ for (int j = 0; j < 8; ++j) {
+ const uint8_t bit =
+ ((i + j) < bits_.size()) ? static_cast<uint8_t>(bits_[i + j]) : 0;
+ byte |= bit << (7 - j);
+ }
+ data_.push_back(byte);
+ }
+ return data_;
+ }
+
+ private:
+ // Get the |n|th MSB from |value| with the assumption that |value| has |size|
+ // bits.
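+ // For example, GetNthBit(0b110, 0, 3) is 1 and GetNthBit(0b110, 2, 3) is 0.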
+ static uint8_t GetNthBit(int value, int n, int size) {
+ return (value >> (size - n - 1)) & 0x01;
+ }
+
+ std::vector<uint8_t> data_;
+ std::vector<bool> bits_;
+};
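+
+ // A minimal usage sketch of BytesAndBits (illustrative only; the values here
+ // are arbitrary): append one flag bit and a 7-bit literal, then read back the
+ // packed byte.
+ //
+ //   BytesAndBits bits;
+ //   bits.AppendBit(1); // e.g. a flag such as show_frame.
+ //   bits.AppendLiteral(7, 0x12); // 1 followed by 0010010 packs to 0x92.
+ //   const std::vector<uint8_t>& data = bits.GenerateData(); // {0x92}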
+
+class ObuParserTest : public testing::Test {
+ protected:
+ // Constants for unit tests.
+ static constexpr int kFrameWidthBits = 9;
+ static constexpr int kFrameHeightBits = 8;
+ static constexpr int kHeight = 240;
+ static constexpr int kWidth = 426;
+ static constexpr int kRows4x4 = 60;
+ static constexpr int kColumns4x4 = 108;
+ static constexpr int kFrameToShow = 2;
+ static constexpr int kDisplayFrameId = 10;
+ static constexpr int kFrameIdLengthBits = 15;
+ static constexpr int kDeltaFrameIdLengthBits = 14;
+
+ // Bit streams for testing. These may contain trailing bits, and tests may
+ // have to remove some of them to keep the byte-boundary alignment.
+ const std::vector<uint8_t> kDefaultTemporalDelimiter = {0x12, 0x00};
+ // Bits Syntax element Value
+ // 1 obu_forbidden_bit 0
+ // 4 obu_type 2 (OBU_TEMPORAL_DELIMITER)
+ // 1 obu_extension_flag 1
+ // 1 obu_has_size_field 1
+ // 1 obu_reserved_1bit 0
+ // 3 temporal_id 6
+ // 2 spatial_id 2
+ // 3 extension_header_reserved_3bits 0
+ // 8 obu_size 0
+ const std::vector<uint8_t> kDefaultTemporalDelimiterWithExtension = {
+ 0x16, 0xd0, 0x00};
+ const std::vector<uint8_t> kDefaultHeaderWithoutSizeField = {0x10};
+ // Offset Bits Syntax element Value
+ // 0 3 seq_profile 0
+ // 3 1 still_picture 0
+ // 4 1 reduced_still_picture_header 0
+ // 5 1 timing_info_present_flag 0
+ // 6 1 initial_display_delay_present_flag 0
+ // 7 5 operating_points_cnt_minus_1 0
+ // 12 12 operating_point_idc[ 0 ] 0
+ // 24 5 seq_level_idx[ 0 ] 0
+ // 29 4 frame_width_bits_minus_1 8
+ // 33 4 frame_height_bits_minus_1 7
+ // 37 9 max_frame_width_minus_1 425
+ // 46 8 max_frame_height_minus_1 239
+ // 54 1 frame_id_numbers_present_flag 0
+ // 55 1 use_128x128_superblock 1
+ // 56 1 enable_filter_intra 1
+ // 57 1 enable_intra_edge_filter 1
+ // 58 1 enable_interintra_compound 1
+ // 59 1 enable_masked_compound 1
+ // 60 1 enable_warped_motion 0
+ // 61 1 enable_dual_filter 1
+ // 62 1 enable_order_hint 1
+ // 63 1 enable_jnt_comp 1
+ // 64 1 enable_ref_frame_mvs 1
+ // 65 1 seq_choose_screen_content_tools 1
+ // 66 1 seq_choose_integer_mv 1
+ // 67 3 order_hint_bits_minus_1 6
+ // 70 1 enable_superres 0
+ // 71 1 enable_cdef 1
+ // 72 1 enable_restoration 1
+ // ...
+ const std::vector<uint8_t> kDefaultSequenceHeader = {
+ 0x00, 0x00, 0x00, 0x04, 0x3e, 0xa7, 0xbd, 0xf7, 0xf9, 0x80, 0x40};
+ const std::vector<uint8_t> kDefaultFrameHeaderKeyFrame = {0x10, 0x00};
+ // Bits Syntax element Value
+ // 1 show_existing_frame 0
+ // 2 frame_type 2 (kFrameIntraOnly)
+ // 1 show_frame 1
+ // 1 error_resilient_mode 0
+ // 1 disable_cdf_update 0
+ // 1 frame_size_override_flag 0
+ // 8 refresh_frame_flags 4
+ // ...
+ const std::vector<uint8_t> kDefaultFrameHeaderIntraOnlyFrame = {0x50, 0x08,
+ 0x00};
+ // Bits Syntax element Value
+ // 1 show_existing_frame 0
+ // 2 frame_type 1 (kFrameInter)
+ // 1 show_frame 1
+ // 1 error_resilient_mode 0
+ // 1 disable_cdf_update 0
+ // 1 frame_size_override_flag 0
+ // 3 primary_ref_frame 1
+ // 8 refresh_frame_flags 4
+ // 3 ref_frame_idx[0] 0
+ // 3 ref_frame_idx[1] 1
+ // 3 ref_frame_idx[2] 2
+ // 3 ref_frame_idx[3] 3
+ // 3 ref_frame_idx[4] 4
+ // 3 ref_frame_idx[5] 5
+ // 3 ref_frame_idx[6] 6
+ // ...
+ const std::vector<uint8_t> kDefaultFrameHeaderInterFrame = {0x30, 0x41, 0x01,
+ 0x4e, 0x5c, 0x60};
+ const std::vector<uint8_t> kDefaultGlobalMotionParametersRotZoom = {
+ 0xff, 0x50, 0x77, 0x7e, 0x1f, 0xcd};
+ const std::vector<uint8_t> kDefaultGlobalMotionParametersAffine = {
+ 0x3f, 0x50, 0x77, 0x7b, 0xbf, 0xa8, 0x3e, 0x1f, 0xcd};
+
+ void SetUp() override {
+ buffer_pool_.reset(new (std::nothrow)
+ BufferPool(nullptr, nullptr, nullptr, nullptr));
+ ASSERT_NE(buffer_pool_, nullptr);
+ }
+
+ bool Init() {
+ obu_.reset(new (std::nothrow) ObuParser(nullptr, 0, 0, buffer_pool_.get(),
+ &decoder_state_));
+ if (obu_ == nullptr) return false;
+ obu_headers_ = &obu_->obu_headers_;
+ obu_frame_header_ = &obu_->frame_header_;
+ obu_sequence_header_ = &obu_->sequence_header_;
+ return true;
+ }
+
+ bool Init(const std::vector<uint8_t>& data, bool init_bit_reader = true) {
+ obu_.reset(new (std::nothrow) ObuParser(
+ data.data(), data.size(), 0, buffer_pool_.get(), &decoder_state_));
+ if (obu_ == nullptr) return false;
+ obu_headers_ = &obu_->obu_headers_;
+ obu_frame_header_ = &obu_->frame_header_;
+ obu_sequence_header_ = &obu_->sequence_header_;
+ return init_bit_reader ? obu_->InitBitReader(data.data(), data.size())
+ : true;
+ }
+
+ bool Parse(const std::string& input,
+ const ObuSequenceHeader* const sequence_header = nullptr) {
+ std::vector<uint8_t> data(input.begin(), input.end());
+ return Parse(data, sequence_header);
+ }
+
+ bool Parse(const std::vector<uint8_t>& data,
+ const ObuSequenceHeader* const sequence_header = nullptr) {
+ EXPECT_TRUE(Init(data, false));
+ if (sequence_header != nullptr) obu_->set_sequence_header(*sequence_header);
+ return obu_->ParseOneFrame(&current_frame_) == kStatusOk;
+ }
+
+ bool ParseSequenceHeader(const std::vector<uint8_t>& data) {
+ EXPECT_TRUE(Init(data));
+ return obu_->ParseSequenceHeader(/*seen_frame_header=*/false);
+ }
+
+ bool ParseFrameParameters(const std::vector<uint8_t>& data,
+ bool id_bits_present = false,
+ int force_screen_content_tools = 0,
+ int force_integer_mv = 0,
+ bool enable_superres = false) {
+ EXPECT_TRUE(Init(data));
+ if (id_bits_present) {
+ obu_->sequence_header_.frame_id_numbers_present = true;
+ obu_->sequence_header_.frame_id_length_bits = kFrameIdLengthBits;
+ obu_->sequence_header_.delta_frame_id_length_bits =
+ kDeltaFrameIdLengthBits;
+ }
+ obu_->sequence_header_.force_screen_content_tools =
+ force_screen_content_tools;
+ obu_->sequence_header_.force_integer_mv = force_integer_mv;
+ obu_->sequence_header_.enable_superres = enable_superres;
+ obu_->sequence_header_.frame_width_bits = kFrameWidthBits;
+ obu_->sequence_header_.frame_height_bits = kFrameHeightBits;
+ obu_->sequence_header_.max_frame_width = kWidth;
+ obu_->sequence_header_.max_frame_height = kHeight;
+ return obu_->ParseFrameParameters();
+ }
+
+ bool ParseSegmentationParameters(const std::vector<uint8_t>& data,
+ int primary_reference_frame,
+ int prev_frame_index) {
+ EXPECT_TRUE(Init(data));
+ obu_->frame_header_.primary_reference_frame = primary_reference_frame;
+ if (primary_reference_frame != kPrimaryReferenceNone) {
+ obu_->frame_header_.reference_frame_index[primary_reference_frame] =
+ prev_frame_index;
+ }
+ return obu_->ParseSegmentationParameters();
+ }
+
+ bool ParseFrameReferenceModeSyntax(const std::vector<uint8_t>& data,
+ FrameType frame_type) {
+ EXPECT_TRUE(Init(data));
+ obu_->frame_header_.frame_type = frame_type;
+ return obu_->ParseFrameReferenceModeSyntax();
+ }
+
+ bool ParseGlobalMotionParameters(const std::vector<uint8_t>& data,
+ FrameType frame_type) {
+ EXPECT_TRUE(Init(data));
+ obu_->frame_header_.frame_type = frame_type;
+ obu_->frame_header_.primary_reference_frame = kPrimaryReferenceNone;
+ return obu_->ParseGlobalMotionParameters();
+ }
+
+ bool ParseFilmGrainParameters(const std::vector<uint8_t>& data,
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header) {
+ EXPECT_TRUE(Init(data));
+ obu_->set_sequence_header(sequence_header);
+ obu_->frame_header_ = frame_header;
+ return obu_->ParseFilmGrainParameters();
+ }
+
+ bool ParseTileInfoSyntax(const std::vector<uint8_t>& data, int columns4x4,
+ int rows4x4, bool use_128x128_superblock) {
+ EXPECT_TRUE(Init(data));
+ obu_->frame_header_.columns4x4 = columns4x4;
+ obu_->frame_header_.rows4x4 = rows4x4;
+ obu_->sequence_header_.use_128x128_superblock = use_128x128_superblock;
+ return obu_->ParseTileInfoSyntax();
+ }
+
+ bool ParseMetadata(const std::vector<uint8_t>& data) {
+ EXPECT_TRUE(Init(data));
+ return obu_->ParseMetadata(data.data(), data.size());
+ }
+
+ void DefaultSequenceHeader(ObuSequenceHeader* const gold) {
+ memset(gold, 0, sizeof(*gold));
+ gold->profile = kProfile0;
+ gold->level[0].major = kMinimumMajorBitstreamLevel;
+ gold->operating_points = 1;
+ gold->max_frame_width = kWidth;
+ gold->max_frame_height = kHeight;
+ gold->frame_width_bits = kFrameWidthBits;
+ gold->frame_height_bits = kFrameHeightBits;
+ gold->use_128x128_superblock = true;
+ gold->enable_filter_intra = true;
+ gold->enable_intra_edge_filter = true;
+ gold->enable_interintra_compound = true;
+ gold->enable_masked_compound = true;
+ gold->enable_dual_filter = true;
+ gold->enable_order_hint = true;
+ gold->enable_jnt_comp = true;
+ gold->enable_ref_frame_mvs = true;
+ gold->choose_screen_content_tools = true;
+ gold->force_screen_content_tools = 2;
+ gold->choose_integer_mv = true;
+ gold->force_integer_mv = 2;
+ gold->order_hint_bits = 7;
+ gold->enable_cdef = true;
+ gold->enable_restoration = true;
+ gold->color_config.bitdepth = 8;
+ gold->color_config.color_primary = kColorPrimaryUnspecified;
+ gold->color_config.transfer_characteristics =
+ kTransferCharacteristicsUnspecified;
+ gold->color_config.matrix_coefficients = kMatrixCoefficientsUnspecified;
+ gold->color_config.subsampling_x = 1;
+ gold->color_config.subsampling_y = 1;
+ }
+
+ void DefaultFrameHeader(ObuFrameHeader* const gold, FrameType frame_type) {
+ memset(gold, 0, sizeof(*gold));
+ gold->frame_type = frame_type;
+ gold->show_frame = true;
+ gold->showable_frame = (frame_type != kFrameKey);
+ gold->enable_cdf_update = true;
+ gold->width = kWidth;
+ gold->height = kHeight;
+ gold->render_width = kWidth;
+ gold->render_height = kHeight;
+ gold->upscaled_width = kWidth;
+ gold->primary_reference_frame = kPrimaryReferenceNone;
+ gold->enable_frame_end_update_cdf = true;
+ gold->rows4x4 = kRows4x4;
+ gold->columns4x4 = kColumns4x4;
+ if (frame_type == kFrameKey) {
+ gold->refresh_frame_flags = 0xff;
+ gold->error_resilient_mode = true;
+ gold->force_integer_mv = 1;
+ } else if (frame_type == kFrameIntraOnly) {
+ gold->refresh_frame_flags = 4;
+ gold->force_integer_mv = 1;
+ } else if (frame_type == kFrameInter) {
+ gold->refresh_frame_flags = 4;
+ gold->primary_reference_frame = 1;
+ for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+ gold->reference_frame_index[i] = i;
+ }
+ gold->is_motion_mode_switchable = true;
+ }
+ }
+
+ void OverrideFrameSize(BytesAndBits* const data, ObuFrameHeader* const gold,
+ int flag_offset, int size_offset) {
+ data->SetBit(flag_offset, 1); // frame_size_override_flag.
+ data->InsertLiteral(size_offset, kFrameWidthBits,
+ kWidth - 2); // frame_width_minus_1.
+ data->InsertLiteral(size_offset + kFrameWidthBits, kFrameHeightBits,
+ kHeight - 2); // frame_height_minus_1.
+ gold->frame_size_override_flag = true;
+ gold->width = kWidth - 1;
+ gold->height = kHeight - 1;
+ gold->render_width = gold->width;
+ gold->render_height = gold->height;
+ gold->upscaled_width = gold->width;
+ }
+
+ void OverrideRenderSize(BytesAndBits* const data, ObuFrameHeader* const gold,
+ int flag_offset) {
+ data->SetBit(flag_offset, 1); // render_and_frame_size_different.
+ data->InsertLiteral(flag_offset + 1, 16,
+ kWidth - 10); // render_width_minus_1.
+ data->InsertLiteral(flag_offset + 17, 16,
+ kHeight - 10); // render_height_minus_1.
+ gold->render_width = kWidth - 9;
+ gold->render_height = kHeight - 9;
+ gold->render_and_frame_size_different = true;
+ }
+
+ void OverrideSegmentation(BytesAndBits* const data, Segmentation* const gold,
+ int offset) {
+ gold->update_data = true;
+ data->SetBit(offset++, static_cast<uint8_t>(gold->update_data));
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ gold->segment_id_pre_skip = false;
+ gold->last_active_segment_id = 0;
+ for (int i = 0; i < kMaxSegments; ++i) {
+ for (int j = 0; j < kSegmentFeatureMax; ++j) {
+ gold->feature_enabled[i][j] = static_cast<bool>(rnd.Rand8() & 1);
+ data->InsertBit(offset++,
+ static_cast<uint8_t>(gold->feature_enabled[i][j]));
+ if (gold->feature_enabled[i][j]) {
+ gold->feature_data[i][j] = rnd(1 << kSegmentationFeatureBits[j]);
+ if (Segmentation::FeatureSigned(static_cast<SegmentFeature>(j))) {
+ if (static_cast<bool>(rnd.Rand8() & 1)) {
+ gold->feature_data[i][j] *= -1;
+ }
+ data->InsertInverseSignedLiteral(
+ offset, kSegmentationFeatureBits[j], gold->feature_data[i][j]);
+ offset += kSegmentationFeatureBits[j] + 1;
+ } else {
+ data->InsertLiteral(offset, kSegmentationFeatureBits[j],
+ gold->feature_data[i][j]);
+ offset += kSegmentationFeatureBits[j];
+ }
+ gold->last_active_segment_id = i;
+ if (j >= kSegmentFeatureReferenceFrame) {
+ gold->segment_id_pre_skip = true;
+ }
+ }
+ }
+ }
+ }
+
+ void VerifyObuHeader(bool extension) {
+ EXPECT_EQ(obu_->obu_headers().back().temporal_id, extension ? 6 : 0);
+ EXPECT_EQ(obu_->obu_headers().back().spatial_id, extension ? 2 : 0);
+ }
+
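+ // Compares a single member |x| of the locals |expected| and |actual|, which
+ // must be defined in the enclosing scope.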
+#define OBU_TEST_COMPARE(x) EXPECT_EQ(expected.x, actual.x)
+ void VerifyFrameParameters(const ObuFrameHeader& expected,
+ bool id_bits_present = false) {
+ const ObuFrameHeader& actual = obu_->frame_header();
+ OBU_TEST_COMPARE(show_existing_frame);
+ if (actual.show_existing_frame) {
+ OBU_TEST_COMPARE(frame_to_show);
+ OBU_TEST_COMPARE(frame_presentation_time);
+ if (id_bits_present) {
+ OBU_TEST_COMPARE(display_frame_id);
+ }
+ return;
+ }
+ OBU_TEST_COMPARE(frame_type);
+ OBU_TEST_COMPARE(show_frame);
+ OBU_TEST_COMPARE(frame_presentation_time);
+ OBU_TEST_COMPARE(showable_frame);
+ OBU_TEST_COMPARE(error_resilient_mode);
+ OBU_TEST_COMPARE(enable_cdf_update);
+ OBU_TEST_COMPARE(current_frame_id);
+ OBU_TEST_COMPARE(frame_size_override_flag);
+ OBU_TEST_COMPARE(order_hint);
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ OBU_TEST_COMPARE(reference_order_hint[i]);
+ }
+ OBU_TEST_COMPARE(primary_reference_frame);
+ OBU_TEST_COMPARE(width);
+ OBU_TEST_COMPARE(height);
+ OBU_TEST_COMPARE(render_and_frame_size_different);
+ OBU_TEST_COMPARE(render_width);
+ OBU_TEST_COMPARE(render_height);
+ OBU_TEST_COMPARE(upscaled_width);
+ OBU_TEST_COMPARE(coded_lossless);
+ OBU_TEST_COMPARE(upscaled_lossless);
+ OBU_TEST_COMPARE(allow_screen_content_tools);
+ OBU_TEST_COMPARE(is_motion_mode_switchable);
+ OBU_TEST_COMPARE(refresh_frame_flags);
+ OBU_TEST_COMPARE(enable_frame_end_update_cdf);
+ OBU_TEST_COMPARE(force_integer_mv);
+ if (actual.frame_type == kFrameInter) {
+ for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+ OBU_TEST_COMPARE(reference_frame_index[i]);
+ }
+ }
+ OBU_TEST_COMPARE(use_superres);
+ OBU_TEST_COMPARE(rows4x4);
+ OBU_TEST_COMPARE(columns4x4);
+ }
+
+ void VerifyLoopFilterParameters(const LoopFilter& expected) {
+ const LoopFilter& actual = obu_->frame_header().loop_filter;
+ for (int i = 0; i < 4; ++i) {
+ OBU_TEST_COMPARE(level[i]);
+ }
+ OBU_TEST_COMPARE(sharpness);
+ OBU_TEST_COMPARE(delta_enabled);
+ OBU_TEST_COMPARE(delta_update);
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ OBU_TEST_COMPARE(ref_deltas[i]);
+ }
+ for (int i = 0; i < kLoopFilterMaxModeDeltas; ++i) {
+ OBU_TEST_COMPARE(mode_deltas[i]);
+ }
+ }
+
+ void VerifyQuantizerParameters(const QuantizerParameters& expected) {
+ const QuantizerParameters& actual = obu_->frame_header().quantizer;
+ OBU_TEST_COMPARE(base_index);
+ OBU_TEST_COMPARE(delta_dc[kPlaneY]);
+ OBU_TEST_COMPARE(delta_dc[kPlaneU]);
+ OBU_TEST_COMPARE(delta_dc[kPlaneV]);
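+ // The Y plane AC delta is not signaled in the bitstream (base_index itself
+ // is the Y AC quantizer index), so delta_ac[kPlaneY] must always remain 0.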
+ EXPECT_EQ(0, actual.delta_ac[kPlaneY]);
+ OBU_TEST_COMPARE(delta_ac[kPlaneY]);
+ OBU_TEST_COMPARE(delta_ac[kPlaneU]);
+ OBU_TEST_COMPARE(delta_ac[kPlaneV]);
+ OBU_TEST_COMPARE(use_matrix);
+ OBU_TEST_COMPARE(matrix_level[kPlaneY]);
+ OBU_TEST_COMPARE(matrix_level[kPlaneU]);
+ OBU_TEST_COMPARE(matrix_level[kPlaneV]);
+ }
+
+ void VerifySegmentationParameters(const Segmentation& expected) {
+ const Segmentation& actual = obu_->frame_header().segmentation;
+ OBU_TEST_COMPARE(enabled);
+ OBU_TEST_COMPARE(update_map);
+ OBU_TEST_COMPARE(update_data);
+ OBU_TEST_COMPARE(temporal_update);
+ OBU_TEST_COMPARE(segment_id_pre_skip);
+ OBU_TEST_COMPARE(last_active_segment_id);
+ for (int i = 0; i < kMaxSegments; ++i) {
+ for (int j = 0; j < kSegmentFeatureMax; ++j) {
+ OBU_TEST_COMPARE(feature_enabled[i][j]);
+ OBU_TEST_COMPARE(feature_data[i][j]);
+ }
+ }
+ }
+
+ void VerifyDeltaParameters(const Delta& expected, const Delta& actual) {
+ OBU_TEST_COMPARE(present);
+ OBU_TEST_COMPARE(scale);
+ OBU_TEST_COMPARE(multi);
+ }
+
+ void VerifyCdefParameters(const Cdef& expected) {
+ const Cdef& actual = obu_->frame_header().cdef;
+ OBU_TEST_COMPARE(damping);
+ OBU_TEST_COMPARE(bits);
+ for (int i = 0; i < (1 << actual.bits); ++i) {
+ OBU_TEST_COMPARE(y_primary_strength[i]);
+ OBU_TEST_COMPARE(y_secondary_strength[i]);
+ OBU_TEST_COMPARE(uv_primary_strength[i]);
+ OBU_TEST_COMPARE(uv_secondary_strength[i]);
+ }
+ }
+
+ void VerifyLoopRestorationParameters(const LoopRestoration& expected) {
+ const LoopRestoration& actual = obu_->frame_header().loop_restoration;
+ for (int i = 0; i < kMaxPlanes; ++i) {
+ OBU_TEST_COMPARE(type[i]);
+ OBU_TEST_COMPARE(unit_size_log2[i]);
+ }
+ }
+
+ void VerifyGlobalMotionParameters(
+ const std::array<GlobalMotion, kNumReferenceFrameTypes>& gold) {
+ for (int i = kReferenceFrameLast; i <= kReferenceFrameAlternate; ++i) {
+ const GlobalMotion& expected = gold[i];
+ const GlobalMotion& actual = obu_->frame_header().global_motion[i];
+ OBU_TEST_COMPARE(type) << " i: " << i;
+ for (int j = 0; j < 6; ++j) {
+ OBU_TEST_COMPARE(params[j]) << " i: " << i << " j: " << j;
+ }
+ }
+ }
+
+ void VerifyFilmGrainParameters(const FilmGrainParams& expected) {
+ const FilmGrainParams& actual = obu_->frame_header().film_grain_params;
+ OBU_TEST_COMPARE(apply_grain);
+ OBU_TEST_COMPARE(update_grain);
+ OBU_TEST_COMPARE(chroma_scaling_from_luma);
+ OBU_TEST_COMPARE(overlap_flag);
+ OBU_TEST_COMPARE(clip_to_restricted_range);
+ OBU_TEST_COMPARE(num_y_points);
+ OBU_TEST_COMPARE(num_u_points);
+ OBU_TEST_COMPARE(num_v_points);
+ for (int i = 0; i < 14; ++i) {
+ OBU_TEST_COMPARE(point_y_value[i]);
+ OBU_TEST_COMPARE(point_y_scaling[i]);
+ }
+ for (int i = 0; i < 10; ++i) {
+ OBU_TEST_COMPARE(point_u_value[i]);
+ OBU_TEST_COMPARE(point_u_scaling[i]);
+ }
+ for (int i = 0; i < 10; ++i) {
+ OBU_TEST_COMPARE(point_v_value[i]);
+ OBU_TEST_COMPARE(point_v_scaling[i]);
+ }
+ OBU_TEST_COMPARE(chroma_scaling);
+ OBU_TEST_COMPARE(auto_regression_coeff_lag);
+ for (int i = 0; i < 24; ++i) {
+ OBU_TEST_COMPARE(auto_regression_coeff_y[i]);
+ }
+ for (int i = 0; i < 25; ++i) {
+ OBU_TEST_COMPARE(auto_regression_coeff_u[i]);
+ }
+ for (int i = 0; i < 25; ++i) {
+ OBU_TEST_COMPARE(auto_regression_coeff_v[i]);
+ }
+ OBU_TEST_COMPARE(auto_regression_shift);
+ OBU_TEST_COMPARE(grain_seed);
+ OBU_TEST_COMPARE(reference_index);
+ OBU_TEST_COMPARE(grain_scale_shift);
+ OBU_TEST_COMPARE(u_multiplier);
+ OBU_TEST_COMPARE(u_luma_multiplier);
+ OBU_TEST_COMPARE(u_offset);
+ OBU_TEST_COMPARE(v_multiplier);
+ OBU_TEST_COMPARE(v_luma_multiplier);
+ OBU_TEST_COMPARE(v_offset);
+ }
+
+ void VerifyTileInfoParameters(const TileInfo& expected) {
+ const TileInfo& actual = obu_->frame_header().tile_info;
+ OBU_TEST_COMPARE(uniform_spacing);
+ OBU_TEST_COMPARE(tile_columns_log2);
+ OBU_TEST_COMPARE(tile_columns);
+ for (int i = 0; i < kMaxTileColumns + 1; ++i) {
+ OBU_TEST_COMPARE(tile_column_start[i]) << "tile_column: " << i;
+ OBU_TEST_COMPARE(tile_column_width_in_superblocks[i])
+ << "tile_column: " << i;
+ }
+ OBU_TEST_COMPARE(tile_rows_log2);
+ OBU_TEST_COMPARE(tile_rows);
+ for (int i = 0; i < kMaxTileRows + 1; ++i) {
+ OBU_TEST_COMPARE(tile_row_start[i]) << "tile_row: " << i;
+ OBU_TEST_COMPARE(tile_row_height_in_superblocks[i]) << "tile_row: " << i;
+ }
+ OBU_TEST_COMPARE(tile_count);
+ OBU_TEST_COMPARE(context_update_id);
+ OBU_TEST_COMPARE(tile_size_bytes);
+ }
+
+ void VerifySequenceHeader(const ObuSequenceHeader& expected) {
+ EXPECT_TRUE(obu_->sequence_header_changed());
+ const ObuSequenceHeader& actual = obu_->sequence_header();
+ OBU_TEST_COMPARE(profile);
+ OBU_TEST_COMPARE(still_picture);
+ OBU_TEST_COMPARE(reduced_still_picture_header);
+ OBU_TEST_COMPARE(operating_points);
+ for (int i = 0; i < actual.operating_points; ++i) {
+ OBU_TEST_COMPARE(operating_point_idc[i]) << "i: " << i;
+ OBU_TEST_COMPARE(level[i].major) << "i: " << i;
+ OBU_TEST_COMPARE(level[i].minor) << "i: " << i;
+ OBU_TEST_COMPARE(tier[i]) << "i: " << i;
+ }
+ OBU_TEST_COMPARE(frame_width_bits);
+ OBU_TEST_COMPARE(frame_height_bits);
+ OBU_TEST_COMPARE(max_frame_width);
+ OBU_TEST_COMPARE(max_frame_height);
+ OBU_TEST_COMPARE(frame_id_numbers_present);
+ if (actual.frame_id_numbers_present) {
+ OBU_TEST_COMPARE(frame_id_length_bits);
+ OBU_TEST_COMPARE(delta_frame_id_length_bits);
+ }
+ OBU_TEST_COMPARE(use_128x128_superblock);
+ OBU_TEST_COMPARE(enable_filter_intra);
+ OBU_TEST_COMPARE(enable_intra_edge_filter);
+ OBU_TEST_COMPARE(enable_interintra_compound);
+ OBU_TEST_COMPARE(enable_masked_compound);
+ OBU_TEST_COMPARE(enable_warped_motion);
+ OBU_TEST_COMPARE(enable_dual_filter);
+ OBU_TEST_COMPARE(enable_order_hint);
+ OBU_TEST_COMPARE(enable_jnt_comp);
+ OBU_TEST_COMPARE(enable_ref_frame_mvs);
+ OBU_TEST_COMPARE(choose_screen_content_tools);
+ OBU_TEST_COMPARE(force_screen_content_tools);
+ OBU_TEST_COMPARE(choose_integer_mv);
+ OBU_TEST_COMPARE(force_integer_mv);
+ OBU_TEST_COMPARE(order_hint_bits);
+ OBU_TEST_COMPARE(enable_superres);
+ OBU_TEST_COMPARE(enable_cdef);
+ OBU_TEST_COMPARE(enable_restoration);
+ OBU_TEST_COMPARE(color_config.bitdepth);
+ OBU_TEST_COMPARE(color_config.is_monochrome);
+ OBU_TEST_COMPARE(color_config.color_range);
+ OBU_TEST_COMPARE(color_config.subsampling_x);
+ OBU_TEST_COMPARE(color_config.subsampling_y);
+ OBU_TEST_COMPARE(color_config.chroma_sample_position);
+ OBU_TEST_COMPARE(timing_info_present_flag);
+ OBU_TEST_COMPARE(timing_info.num_units_in_tick);
+ OBU_TEST_COMPARE(timing_info.time_scale);
+ OBU_TEST_COMPARE(timing_info.equal_picture_interval);
+ OBU_TEST_COMPARE(timing_info.num_ticks_per_picture);
+ OBU_TEST_COMPARE(decoder_model_info_present_flag);
+ OBU_TEST_COMPARE(decoder_model_info.encoder_decoder_buffer_delay_length);
+ OBU_TEST_COMPARE(decoder_model_info.num_units_in_decoding_tick);
+ OBU_TEST_COMPARE(decoder_model_info.buffer_removal_time_length);
+ OBU_TEST_COMPARE(decoder_model_info.frame_presentation_time_length);
+ for (int i = 0; i < actual.operating_points; ++i) {
+ SCOPED_TRACE("i: " + std::to_string(i));
+ OBU_TEST_COMPARE(operating_parameters.decoder_buffer_delay[i]);
+ OBU_TEST_COMPARE(operating_parameters.encoder_buffer_delay[i]);
+ OBU_TEST_COMPARE(operating_parameters.low_delay_mode_flag[i]);
+ OBU_TEST_COMPARE(initial_display_delay[i]);
+ }
+ OBU_TEST_COMPARE(film_grain_params_present);
+ }
+
+ void VerifyMetadataHdrCll(const ObuMetadataHdrCll& expected) {
+ EXPECT_TRUE(obu_->current_frame_->hdr_cll_set());
+ const ObuMetadataHdrCll& actual = obu_->current_frame_->hdr_cll();
+ OBU_TEST_COMPARE(max_cll);
+ OBU_TEST_COMPARE(max_fall);
+ }
+
+ void VerifyMetadataHdrMdcv(const ObuMetadataHdrMdcv& expected) {
+ EXPECT_TRUE(obu_->current_frame_->hdr_mdcv_set());
+ const ObuMetadataHdrMdcv& actual = obu_->current_frame_->hdr_mdcv();
+ for (int i = 0; i < 3; ++i) {
+ OBU_TEST_COMPARE(primary_chromaticity_x[i]);
+ OBU_TEST_COMPARE(primary_chromaticity_y[i]);
+ }
+ OBU_TEST_COMPARE(white_point_chromaticity_x);
+ OBU_TEST_COMPARE(white_point_chromaticity_y);
+ OBU_TEST_COMPARE(luminance_max);
+ OBU_TEST_COMPARE(luminance_min);
+ }
+
+ void VerifyMetadataItutT35(const ObuMetadataItutT35& expected) {
+ EXPECT_TRUE(obu_->current_frame_->itut_t35_set());
+ const ObuMetadataItutT35& actual = obu_->current_frame_->itut_t35();
+ OBU_TEST_COMPARE(country_code);
+ if (actual.country_code == 0xFF) {
+ OBU_TEST_COMPARE(country_code_extension_byte);
+ }
+ ASSERT_EQ(expected.payload_size, actual.payload_size);
+ if (actual.payload_size != 0) {
+ EXPECT_EQ(memcmp(expected.payload_bytes, actual.payload_bytes,
+ actual.payload_size),
+ 0);
+ }
+ }
+
+#undef OBU_TEST_COMPARE
+
+ // Wrappers for private member functions of ObuParser. These avoid the need
+ // for a dependency on a googletest header in the main library for
+ // FRIEND_TEST() (or the need to duplicate the implementation).
+ bool ObuParseFrameParameters() { return obu_->ParseFrameParameters(); }
+ bool ObuParseLoopFilterParameters() {
+ return obu_->ParseLoopFilterParameters();
+ }
+ bool ObuParseLoopFilterDeltaParameters() {
+ return obu_->ParseLoopFilterDeltaParameters();
+ }
+ bool ObuParseQuantizerParameters() {
+ return obu_->ParseQuantizerParameters();
+ }
+ bool ObuParseQuantizerIndexDeltaParameters() {
+ return obu_->ParseQuantizerIndexDeltaParameters();
+ }
+ void ObuComputeSegmentLosslessAndQIndex() {
+ obu_->ComputeSegmentLosslessAndQIndex();
+ }
+ bool ObuParseCdefParameters() { return obu_->ParseCdefParameters(); }
+ bool ObuParseLoopRestorationParameters() {
+ return obu_->ParseLoopRestorationParameters();
+ }
+ bool ObuParseTxModeSyntax() { return obu_->ParseTxModeSyntax(); }
+ bool ObuIsSkipModeAllowed() { return obu_->IsSkipModeAllowed(); }
+ bool ObuParseSkipModeParameters() { return obu_->ParseSkipModeParameters(); }
+ bool ObuReadAllowWarpedMotion() { return obu_->ReadAllowWarpedMotion(); }
+ bool ObuSetFrameReferences(int8_t last_frame_idx, int8_t gold_frame_idx) {
+ return obu_->SetFrameReferences(last_frame_idx, gold_frame_idx);
+ }
+
+ std::unique_ptr<BufferPool> buffer_pool_;
+ DecoderState decoder_state_;
+ std::unique_ptr<ObuParser> obu_;
+ // The following members are reset with each Init().
+ Vector<ObuHeader>* obu_headers_;
+ ObuFrameHeader* obu_frame_header_;
+ ObuSequenceHeader* obu_sequence_header_;
+ RefCountedBufferPtr current_frame_;
+};
+
+TEST_F(ObuParserTest, InvalidInputs) {
+ obu_.reset(new (std::nothrow)
+ ObuParser(nullptr, 0, 0, buffer_pool_.get(), &decoder_state_));
+ EXPECT_EQ(obu_->ParseOneFrame(&current_frame_), kStatusInvalidArgument);
+ obu_.reset(new (std::nothrow) ObuParser(nullptr, 10, 0, buffer_pool_.get(),
+ &decoder_state_));
+ EXPECT_EQ(obu_->ParseOneFrame(&current_frame_), kStatusInvalidArgument);
+ obu_.reset(new (std::nothrow)
+ ObuParser(kDefaultTemporalDelimiter.data(), 0, 0,
+ buffer_pool_.get(), &decoder_state_));
+ EXPECT_EQ(obu_->ParseOneFrame(&current_frame_), kStatusInvalidArgument);
+}
+
+TEST_F(ObuParserTest, TemporalDelimiter) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultTemporalDelimiter);
+
+ ASSERT_TRUE(Parse(data.GenerateData()));
+ EXPECT_EQ(obu_->obu_headers().size(), 1);
+ EXPECT_EQ(obu_->obu_headers().back().type, kObuTemporalDelimiter);
+ VerifyObuHeader(false);
+
+ // forbidden_bit is not zero.
+ data.SetBit(0, 1);
+ EXPECT_FALSE(Parse(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, HeaderExtensions) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultTemporalDelimiterWithExtension);
+
+ ASSERT_TRUE(Parse(data.GenerateData()));
+ EXPECT_EQ(obu_->obu_headers().size(), 1);
+ EXPECT_EQ(obu_->obu_headers().back().type, kObuTemporalDelimiter);
+ VerifyObuHeader(true);
+
+ // extension flag is set but no extensions found.
+ data.Clear();
+ data.AppendByte(kDefaultTemporalDelimiterWithExtension[0]);
+ EXPECT_FALSE(Parse(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, HeaderHasSizeFieldNotSet) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultHeaderWithoutSizeField);
+
+ EXPECT_FALSE(Parse(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, SequenceHeader) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultSequenceHeader);
+ ObuSequenceHeader gold;
+ DefaultSequenceHeader(&gold);
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+}
+
+TEST_F(ObuParserTest, SequenceHeaderLevel) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultSequenceHeader);
+ ObuSequenceHeader gold;
+ DefaultSequenceHeader(&gold);
+
+ // Set level to 1.
+ gold.level[0].major = 2;
+ gold.level[0].minor = 1;
+ data.SetLiteral(24, 5, 1); // level.
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+
+ // Set operating_point_idc of operating point 1 to 0x101 (temporal layer 0
+ // and spatial layer 0 should be decoded). Set level of operating point 1 to
+ // 8 (4.0) and tier to 1.
+ gold.operating_points = 2;
+ gold.operating_point_idc[1] = (1 << 0) | (1 << (0 + 8));
+ gold.level[1].major = 4;
+ gold.level[1].minor = 0;
+ gold.tier[1] = 1;
+ data.SetLiteral(7, 5, gold.operating_points - 1);
+ data.InsertLiteral(29, 12, 0x101); // operating_point_idc.
+ data.InsertLiteral(41, 5, 8); // level.
+ data.InsertBit(46, gold.tier[1]);
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+}
+
+TEST_F(ObuParserTest, SequenceHeaderProfile) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultSequenceHeader);
+ ObuSequenceHeader gold;
+ DefaultSequenceHeader(&gold);
+
+ gold.still_picture = true;
+ data.SetBit(3, static_cast<uint8_t>(gold.still_picture));
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+
+ // profile 2; bitdepth 8;
+ gold.profile = kProfile2;
+ gold.color_config.bitdepth = 8;
+ gold.color_config.subsampling_x = 1;
+ gold.color_config.subsampling_y = 0;
+ data.SetLiteral(0, 3, gold.profile);
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+
+ // profile 2; bitdepth 10;
+ gold.color_config.bitdepth = 10;
+ data.SetBit(73, 1); // high_bitdepth.
+ data.InsertBit(74, 0); // twelve_bit.
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+
+ // profile 2; bitdepth 12;
+ gold.color_config.bitdepth = 12;
+ gold.color_config.subsampling_y = 1;
+ data.SetBit(74, 1); // twelve_bit.
+ data.InsertBit(78, 1); // subsampling_x.
+ data.InsertBit(79, 1); // subsampling_y.
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+}
+
+TEST_F(ObuParserTest, SequenceHeaderIdLength) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultSequenceHeader);
+ ObuSequenceHeader gold;
+ DefaultSequenceHeader(&gold);
+
+ gold.frame_id_numbers_present = true;
+ gold.delta_frame_id_length_bits = kDeltaFrameIdLengthBits;
+ gold.frame_id_length_bits = kFrameIdLengthBits;
+ data.SetBit(54, 1); // frame_id_numbers_present.
+ data.InsertLiteral(55, 4, kDeltaFrameIdLengthBits - 2);
+ data.InsertLiteral(59, 3, kFrameIdLengthBits - kDeltaFrameIdLengthBits - 1);
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+}
+
+// An idLen greater than 16 is invalid.
+TEST_F(ObuParserTest, SequenceHeaderIdLengthInvalid) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultSequenceHeader);
+
+ data.SetBit(54, 1); // frame_id_numbers_present.
+ data.InsertLiteral(55, 4, kDeltaFrameIdLengthBits - 2);
+ data.InsertLiteral(59, 3, 17 - kDeltaFrameIdLengthBits - 1); // idLen = 17.
+
+ ASSERT_FALSE(ParseSequenceHeader(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, SequenceHeaderFlags) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultSequenceHeader);
+ ObuSequenceHeader gold;
+ DefaultSequenceHeader(&gold);
+
+ gold.enable_warped_motion = true;
+ gold.enable_superres = true;
+ data.SetBit(60, 1); // enable_warped_motion.
+ data.SetBit(70, 1); // enable_superres.
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+}
+
+TEST_F(ObuParserTest, SequenceHeaderForceScreenContentToolsEqualTo0) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultSequenceHeader);
+ ObuSequenceHeader gold;
+ DefaultSequenceHeader(&gold);
+
+ gold.choose_screen_content_tools = false;
+ gold.force_screen_content_tools = 0;
+ gold.choose_integer_mv = false;
+ gold.force_integer_mv = 2;
+ data.SetBit(65, 0); // choose_screen_content_tools.
+ data.SetBit(66, 0); // force_screen_content_tools.
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+}
+
+TEST_F(ObuParserTest, SequenceHeaderMonochrome) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultSequenceHeader);
+ ObuSequenceHeader gold;
+ DefaultSequenceHeader(&gold);
+
+ gold.color_config.is_monochrome = true;
+ gold.color_config.color_range = kColorRangeFull;
+ data.SetBit(74, 1); // monochrome.
+ data.InsertBit(76, 1); // color_range.
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+}
+
+ // This tests TimingInfo, DecoderModelInfo and OperatingParameters. The test
+ // is long, but exercising all three together is the simplest approach since
+ // they depend on one another.
+TEST_F(ObuParserTest, SequenceHeaderTimingInfo) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultSequenceHeader);
+ ObuSequenceHeader gold;
+ DefaultSequenceHeader(&gold);
+
+ gold.timing_info_present_flag = true;
+ gold.timing_info.num_units_in_tick = 100;
+ gold.timing_info.time_scale = 1000;
+ gold.timing_info.equal_picture_interval = false;
+ gold.decoder_model_info_present_flag = false;
+ data.SetBit(5, static_cast<uint8_t>(gold.timing_info_present_flag));
+ data.InsertLiteral(6, 32, gold.timing_info.num_units_in_tick);
+ data.InsertLiteral(38, 32, gold.timing_info.time_scale);
+ data.InsertBit(70,
+ static_cast<uint8_t>(gold.timing_info.equal_picture_interval));
+ data.InsertBit(71,
+ static_cast<uint8_t>(gold.decoder_model_info_present_flag));
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+
+ gold.timing_info.equal_picture_interval = true;
+ gold.timing_info.num_ticks_per_picture = 7;
+ data.SetBit(70,
+ static_cast<uint8_t>(gold.timing_info.equal_picture_interval));
+ EXPECT_EQ(data.InsertUvlc(71, gold.timing_info.num_ticks_per_picture - 1), 5);
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+
+ gold.decoder_model_info_present_flag = true;
+ gold.decoder_model_info.encoder_decoder_buffer_delay_length = 5;
+ gold.decoder_model_info.num_units_in_decoding_tick = 1000;
+ gold.decoder_model_info.buffer_removal_time_length = 18;
+ gold.decoder_model_info.frame_presentation_time_length = 20;
+
+ data.SetBit(76, static_cast<uint8_t>(gold.decoder_model_info_present_flag));
+ data.InsertLiteral(
+ 77, 5, gold.decoder_model_info.encoder_decoder_buffer_delay_length - 1);
+ data.InsertLiteral(82, 32,
+ gold.decoder_model_info.num_units_in_decoding_tick);
+ data.InsertLiteral(114, 5,
+ gold.decoder_model_info.buffer_removal_time_length - 1);
+ data.InsertLiteral(
+ 119, 5, gold.decoder_model_info.frame_presentation_time_length - 1);
+ data.InsertBit(147, 0); // decoder_model_present_for_this_op.
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+
+ gold.operating_parameters.decoder_buffer_delay[0] = 10;
+ gold.operating_parameters.encoder_buffer_delay[0] = 20;
+ gold.operating_parameters.low_delay_mode_flag[0] = true;
+
+ data.SetBit(147, 1); // decoder_model_present_for_this_op.
+ data.InsertLiteral(
+ 148, gold.decoder_model_info.encoder_decoder_buffer_delay_length,
+ gold.operating_parameters.decoder_buffer_delay[0]);
+ data.InsertLiteral(
+ 153, gold.decoder_model_info.encoder_decoder_buffer_delay_length,
+ gold.operating_parameters.encoder_buffer_delay[0]);
+ data.InsertBit(158, static_cast<uint8_t>(
+ gold.operating_parameters.low_delay_mode_flag[0]));
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+}
+
+TEST_F(ObuParserTest, SequenceHeaderInitialDisplayDelay) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultSequenceHeader);
+ ObuSequenceHeader gold;
+ DefaultSequenceHeader(&gold);
+
+ gold.initial_display_delay[0] = 8;
+
+ data.SetBit(6, 1); // initial_display_delay_present_flag.
+ data.InsertBit(29, 1); // initial_display_delay_present_for_this_op.
+ data.InsertLiteral(30, 4, gold.initial_display_delay[0] - 1);
+
+ ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+ VerifySequenceHeader(gold);
+}
+
+// Parsing of a frame header should fail if no sequence header has been
+// received.
+TEST_F(ObuParserTest, FrameHeaderWithoutSequenceHeader) {
+ // The aom-test-data test vector av1-1-b8-01-size-16x16.ivf has two temporal
+ // units. The first temporal unit has a presentation timestamp of 0 and
+ // consists of three OBUs: a temporal delimiter OBU, a sequence header OBU,
+ // and a frame OBU.
+ const std::vector<uint8_t> kTemporalDelimiter = {0x12, 0x00};
+ const std::vector<uint8_t> kSequenceHeader = {
+ 0x0a, 0x0a, 0x00, 0x00, 0x00, 0x01, 0x9f, 0xfb, 0xff, 0xf3, 0x00, 0x80};
+ const std::vector<uint8_t> kFrame = {
+ 0x32, 0xa6, 0x01, 0x10, 0x00, 0x87, 0x80, 0x00, 0x03, 0x00, 0x00, 0x00,
+ 0x40, 0x00, 0x9e, 0x86, 0x5b, 0xb2, 0x22, 0xb5, 0x58, 0x4d, 0x68, 0xe6,
+ 0x37, 0x54, 0x42, 0x7b, 0x84, 0xce, 0xdf, 0x9f, 0xec, 0xab, 0x07, 0x4d,
+ 0xf6, 0xe1, 0x5e, 0x9e, 0x27, 0xbf, 0x93, 0x2f, 0x47, 0x0d, 0x7b, 0x7c,
+ 0x45, 0x8d, 0xcf, 0x26, 0xf7, 0x6c, 0x06, 0xd7, 0x8c, 0x2e, 0xf5, 0x2c,
+ 0xb0, 0x8a, 0x31, 0xac, 0x69, 0xf5, 0xcd, 0xd8, 0x71, 0x5d, 0xaf, 0xf8,
+ 0x96, 0x43, 0x8c, 0x9c, 0x23, 0x6f, 0xab, 0xd0, 0x35, 0x43, 0xdf, 0x81,
+ 0x12, 0xe3, 0x7d, 0xec, 0x22, 0xb0, 0x30, 0x54, 0x32, 0x9f, 0x90, 0xc0,
+ 0x5d, 0x64, 0x9b, 0x0f, 0x75, 0x31, 0x84, 0x3a, 0x57, 0xd7, 0x5f, 0x03,
+ 0x6e, 0x7f, 0x43, 0x17, 0x6d, 0x08, 0xc3, 0x81, 0x8a, 0xae, 0x73, 0x1c,
+ 0xa8, 0xa7, 0xe4, 0x9c, 0xa9, 0x5b, 0x3f, 0xd1, 0xeb, 0x75, 0x3a, 0x7f,
+ 0x22, 0x77, 0x38, 0x64, 0x1c, 0x77, 0xdb, 0xcd, 0xef, 0xb7, 0x08, 0x45,
+ 0x8e, 0x7f, 0xea, 0xa3, 0xd0, 0x81, 0xc9, 0xc1, 0xbc, 0x93, 0x9b, 0x41,
+ 0xb1, 0xa1, 0x42, 0x17, 0x98, 0x3f, 0x1e, 0x95, 0xdf, 0x68, 0x7c, 0xb7,
+ 0x98};
+
+ BytesAndBits data;
+ data.AppendBytes(kTemporalDelimiter);
+ // Skip the sequence header OBU.
+ data.AppendBytes(kFrame);
+ ASSERT_FALSE(Parse(data.GenerateData()));
+
+ // Now verify that all three OBUs are correct, by adding them to |data|
+ // successively.
+ data.Clear();
+ data.AppendBytes(kTemporalDelimiter);
+ ASSERT_TRUE(Parse(data.GenerateData()));
+ data.Clear();
+ data.AppendBytes(kTemporalDelimiter);
+ data.AppendBytes(kSequenceHeader);
+ ASSERT_TRUE(Parse(data.GenerateData()));
+ data.Clear();
+ data.AppendBytes(kTemporalDelimiter);
+ data.AppendBytes(kSequenceHeader);
+ data.AppendBytes(kFrame);
+ ASSERT_TRUE(Parse(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, FrameParameterShowExistingFrame) {
+ BytesAndBits data;
+ data.AppendBit(1); // show_existing_frame.
+ data.AppendLiteral(3, kFrameToShow); // frame_to_show.
+ ObuFrameHeader gold;
+ DefaultFrameHeader(&gold, kFrameKey);
+ gold.show_existing_frame = true;
+ gold.frame_to_show = kFrameToShow;
+
+ // kFrameToShow'th frame is not yet decoded.
+ ASSERT_FALSE(ParseFrameParameters(data.GenerateData()));
+
+ decoder_state_.reference_frame[kFrameToShow] = buffer_pool_->GetFreeBuffer();
+ // kFrameToShow'th frame is not a showable frame.
+ ASSERT_FALSE(ParseFrameParameters(data.GenerateData()));
+
+ decoder_state_.reference_frame[kFrameToShow]->set_showable_frame(true);
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+ VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParametersShowExistingFrameWithDisplayFrameId) {
+ BytesAndBits data;
+ data.AppendBit(1); // show_existing_frame.
+ data.AppendLiteral(3, kFrameToShow); // frame_to_show.
+ data.AppendLiteral(15, kDisplayFrameId); // display_frame_id.
+ ObuFrameHeader gold;
+ DefaultFrameHeader(&gold, kFrameKey);
+ gold.show_existing_frame = true;
+ gold.frame_to_show = kFrameToShow;
+ gold.display_frame_id = kDisplayFrameId;
+
+ // kFrameToShow'th frame is not yet decoded.
+ ASSERT_FALSE(ParseFrameParameters(data.GenerateData(), true));
+
+ decoder_state_.reference_frame_id[kFrameToShow] = kDisplayFrameId;
+ decoder_state_.reference_frame[kFrameToShow] = buffer_pool_->GetFreeBuffer();
+ // kFrameToShow'th frame is not a showable frame.
+ ASSERT_FALSE(ParseFrameParameters(data.GenerateData(), true));
+
+ decoder_state_.reference_frame[kFrameToShow]->set_showable_frame(true);
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), true));
+ VerifyFrameParameters(gold, true);
+}
+
+TEST_F(ObuParserTest, FrameParameterShowExistingFrameTemporalPointInfo) {
+ BytesAndBits data;
+ data.AppendBit(1); // show_existing_frame.
+ data.AppendLiteral(3, kFrameToShow); // frame_to_show.
+ data.AppendLiteral(20, 38); // frame_presentation_time.
+ ObuFrameHeader gold;
+ DefaultFrameHeader(&gold, kFrameKey);
+ gold.show_existing_frame = true;
+ gold.frame_to_show = kFrameToShow;
+ gold.frame_presentation_time = 38;
+
+ EXPECT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->frame_width_bits = kFrameWidthBits;
+ obu_sequence_header_->frame_height_bits = kFrameHeightBits;
+ obu_sequence_header_->max_frame_width = kWidth;
+ obu_sequence_header_->max_frame_height = kHeight;
+
+ obu_sequence_header_->decoder_model_info_present_flag = true;
+ obu_sequence_header_->decoder_model_info.frame_presentation_time_length = 20;
+
+ decoder_state_.reference_frame[kFrameToShow] = buffer_pool_->GetFreeBuffer();
+ decoder_state_.reference_frame[kFrameToShow]->set_showable_frame(true);
+
+ ASSERT_TRUE(ObuParseFrameParameters());
+ VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterErrorResilientMode) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultFrameHeaderIntraOnlyFrame);
+ ObuFrameHeader gold;
+ DefaultFrameHeader(&gold, kFrameIntraOnly);
+
+ gold.error_resilient_mode = true;
+ data.SetBit(4, static_cast<uint8_t>(gold.error_resilient_mode));
+
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+ VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrame) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+ ObuFrameHeader gold;
+ DefaultFrameHeader(&gold, kFrameKey);
+
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+ VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrameTemporalPointInfo) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+ ObuFrameHeader gold;
+ DefaultFrameHeader(&gold, kFrameKey);
+
+ data.InsertLiteral(4, 20, 38); // frame_presentation_time.
+ gold.frame_presentation_time = 38;
+
+ EXPECT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->frame_width_bits = kFrameWidthBits;
+ obu_sequence_header_->frame_height_bits = kFrameHeightBits;
+ obu_sequence_header_->max_frame_width = kWidth;
+ obu_sequence_header_->max_frame_height = kHeight;
+
+ obu_sequence_header_->decoder_model_info_present_flag = true;
+ obu_sequence_header_->decoder_model_info.frame_presentation_time_length = 20;
+
+ ASSERT_TRUE(ObuParseFrameParameters());
+ VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrameOverrideSize) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+ ObuFrameHeader gold;
+ DefaultFrameHeader(&gold, kFrameKey);
+
+ OverrideFrameSize(&data, &gold, 5, 6);
+
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+ VerifyFrameParameters(gold);
+
+ OverrideRenderSize(&data, &gold, 23);
+
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+ VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrameSuperRes) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+ ObuFrameHeader gold;
+ DefaultFrameHeader(&gold, kFrameKey);
+ gold.use_superres = true;
+ gold.superres_scale_denominator = 15;
+ gold.width = kWidth * 8 / 15;
+ gold.columns4x4 = 58;
+
+ data.SetBit(6, static_cast<uint8_t>(gold.use_superres));
+ data.SetLiteral(7, 3, gold.superres_scale_denominator - 9);
+
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), false, 0, 0, true));
+ VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrameAllowScreenContentTools) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+ ObuFrameHeader gold;
+ DefaultFrameHeader(&gold, kFrameKey);
+
+ data.InsertBit(5, 1); // allow_screen_content_tools.
+ data.InsertBit(8, 1); // allow_intrabc.
+ gold.allow_screen_content_tools = true;
+ gold.allow_intrabc = true;
+
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), false, 2));
+ VerifyFrameParameters(gold);
+
+ data.InsertBit(6, 1); // force_integer_mv.
+ gold.force_integer_mv = 1;
+
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), false, 2, 2));
+ VerifyFrameParameters(gold);
+
+ data.SetBit(6, 0); // force_integer_mv.
+
+ // Gold need not be updated, because force_integer_mv is always 1 for
+ // keyframes.
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), false, 2, 2));
+ VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterIntraOnlyFrame) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultFrameHeaderIntraOnlyFrame);
+ ObuFrameHeader gold;
+ DefaultFrameHeader(&gold, kFrameIntraOnly);
+
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+ VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterIntraOnlyFrameOverrideSize) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultFrameHeaderIntraOnlyFrame);
+ ObuFrameHeader gold;
+ DefaultFrameHeader(&gold, kFrameIntraOnly);
+
+ OverrideFrameSize(&data, &gold, 6, 15);
+
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+ VerifyFrameParameters(gold);
+
+ OverrideRenderSize(&data, &gold, 32);
+
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+ VerifyFrameParameters(gold);
+}
+
+// An INTRA_ONLY_FRAME cannot set refresh_frame_flags to 0xff.
+TEST_F(ObuParserTest, FrameParameterIntraOnlyFrameRefreshAllFrames) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultFrameHeaderIntraOnlyFrame);
+ data.SetLiteral(7, 8, 0xFF); // refresh_frame_flags.
+
+ ASSERT_FALSE(ParseFrameParameters(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, FrameParameterInterFrame) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultFrameHeaderInterFrame);
+ ObuFrameHeader gold;
+ DefaultFrameHeader(&gold, kFrameInter);
+ ObuFrameHeader reference_frame_header;
+ reference_frame_header.width = kWidth;
+ reference_frame_header.height = kHeight;
+ reference_frame_header.render_width = kWidth;
+ reference_frame_header.render_height = kHeight;
+ reference_frame_header.upscaled_width = kWidth;
+ reference_frame_header.rows4x4 = kRows4x4;
+ reference_frame_header.columns4x4 = kColumns4x4;
+ reference_frame_header.refresh_frame_flags = 0;
+ for (auto& reference_frame : decoder_state_.reference_frame) {
+ reference_frame = buffer_pool_->GetFreeBuffer();
+ EXPECT_TRUE(reference_frame->SetFrameDimensions(reference_frame_header));
+ }
+
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+ VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterInterFrameOverrideSize) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultFrameHeaderInterFrame);
+ ObuFrameHeader gold;
+ DefaultFrameHeader(&gold, kFrameInter);
+ ObuFrameHeader reference_frame_header;
+ reference_frame_header.width = kWidth;
+ reference_frame_header.height = kHeight;
+ reference_frame_header.render_width = kWidth;
+ reference_frame_header.render_height = kHeight;
+ reference_frame_header.upscaled_width = kWidth;
+ reference_frame_header.rows4x4 = kRows4x4;
+ reference_frame_header.columns4x4 = kColumns4x4;
+ reference_frame_header.refresh_frame_flags = 0;
+ for (auto& reference_frame : decoder_state_.reference_frame) {
+ reference_frame = buffer_pool_->GetFreeBuffer();
+ EXPECT_TRUE(reference_frame->SetFrameDimensions(reference_frame_header));
+ }
+
+ data.InsertLiteral(39, kNumInterReferenceFrameTypes, 0); // found_ref.
+ OverrideFrameSize(&data, &gold, 6, 46);
+
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+ VerifyFrameParameters(gold);
+
+ OverrideRenderSize(&data, &gold, 63);
+
+ ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+ VerifyFrameParameters(gold);
+}
+
+// This test verifies we check the following requirement at the end of Section
+// 6.8.4:
+// If FrameIsIntra is equal to 0 (indicating that this frame may use inter
+// prediction), the requirements described in the frame size with refs
+// semantics of section 6.8.6 must also be satisfied.
+TEST_F(ObuParserTest, FrameParameterInterFrameInvalidSize) {
+ BytesAndBits data;
+ data.AppendBytes(kDefaultFrameHeaderInterFrame);
+ ObuFrameHeader gold;
+ DefaultFrameHeader(&gold, kFrameInter);
+ ObuFrameHeader reference_frame_header;
+ reference_frame_header.width = kWidth;
+ reference_frame_header.height = 2 * kHeight + 8;
+ reference_frame_header.render_width = kWidth;
+ reference_frame_header.render_height = 2 * kHeight + 8;
+ reference_frame_header.upscaled_width = kWidth;
+ reference_frame_header.rows4x4 = 2 * kRows4x4 + 2;
+ reference_frame_header.columns4x4 = kColumns4x4;
+ reference_frame_header.refresh_frame_flags = 0;
+ for (auto& reference_frame : decoder_state_.reference_frame) {
+ reference_frame = buffer_pool_->GetFreeBuffer();
+ EXPECT_TRUE(reference_frame->SetFrameDimensions(reference_frame_header));
+ }
+
+ EXPECT_FALSE(ParseFrameParameters(data.GenerateData()));
+}
+
+// Tests the ObuParser::SetFrameReferences() method.
+//
+// This method uses the following data members as input:
+// decoder_state_.reference_order_hint
+// sequence_header_.enable_order_hint
+// sequence_header_.order_hint_bits
+// frame_header_.order_hint
+// So we need to set up these data members before calling
+// ObuParser::SetFrameReferences().
+//
+// The output is in frame_header_.reference_frame_index.
+TEST_F(ObuParserTest, SetFrameReferences) {
+ // All reference frames are forward references (because 9 < 17).
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ decoder_state_.reference_order_hint[i] = 9;
+ }
+
+ ASSERT_TRUE(Init());
+ obu_sequence_header_->enable_order_hint = true;
+ obu_sequence_header_->order_hint_bits = 5;
+ obu_sequence_header_->order_hint_shift_bits =
+ Mod32(32 - obu_sequence_header_->order_hint_bits);
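+ // order_hint_shift_bits is used to sign extend differences of order hints
+ // so that comparisons wrap around correctly; with 5 order hint bits the
+ // difference is shifted left and back by 27.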
+ obu_frame_header_->order_hint = 17;
+
+ const int8_t last_frame_idx = 0;
+ const int8_t gold_frame_idx = 1;
+
+ // Since all reference frames are forward references, we set the remaining
+ // five references in reverse chronological order. So Last2, Last3, Backward,
+ // Alternate2, and Alternate are set to 7, 6, 5, 4, and 3, respectively.
+
+ EXPECT_TRUE(ObuSetFrameReferences(last_frame_idx, gold_frame_idx));
+
+ EXPECT_EQ(
+ obu_frame_header_
+ ->reference_frame_index[kReferenceFrameLast - kReferenceFrameLast],
+ 0);
+ EXPECT_EQ(
+ obu_frame_header_
+ ->reference_frame_index[kReferenceFrameLast2 - kReferenceFrameLast],
+ 7);
+ EXPECT_EQ(
+ obu_frame_header_
+ ->reference_frame_index[kReferenceFrameLast3 - kReferenceFrameLast],
+ 6);
+ EXPECT_EQ(
+ obu_frame_header_
+ ->reference_frame_index[kReferenceFrameGolden - kReferenceFrameLast],
+ 1);
+ EXPECT_EQ(obu_frame_header_->reference_frame_index[kReferenceFrameBackward -
+ kReferenceFrameLast],
+ 5);
+ EXPECT_EQ(obu_frame_header_->reference_frame_index[kReferenceFrameAlternate2 -
+ kReferenceFrameLast],
+ 4);
+ EXPECT_EQ(obu_frame_header_->reference_frame_index[kReferenceFrameAlternate -
+ kReferenceFrameLast],
+ 3);
+}
+
+TEST_F(ObuParserTest, LoopFilterParameters) {
+ LoopFilter gold;
+ memset(&gold, 0, sizeof(gold));
+
+ BytesAndBits data;
+ data.AppendBit(0); // dummy.
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->primary_reference_frame = kPrimaryReferenceNone;
+ obu_frame_header_->coded_lossless = true;
+ gold.ref_deltas[kReferenceFrameIntra] = 1;
+ gold.ref_deltas[kReferenceFrameGolden] = -1;
+ gold.ref_deltas[kReferenceFrameAlternate] = -1;
+ gold.ref_deltas[kReferenceFrameAlternate2] = -1;
+ ASSERT_TRUE(ObuParseLoopFilterParameters());
+ VerifyLoopFilterParameters(gold);
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->primary_reference_frame = kPrimaryReferenceNone;
+ obu_frame_header_->allow_intrabc = true;
+ ASSERT_TRUE(ObuParseLoopFilterParameters());
+ VerifyLoopFilterParameters(gold);
+
+ gold.level[0] = 32;
+ gold.level[3] = 48;
+ gold.sharpness = 4;
+ data.Clear();
+ for (const auto& level : gold.level) {
+ data.AppendLiteral(6, level);
+ }
+ data.AppendLiteral(3, gold.sharpness);
+ data.AppendBit(0); // delta_enabled.
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->primary_reference_frame = kPrimaryReferenceNone;
+ ASSERT_TRUE(ObuParseLoopFilterParameters());
+ VerifyLoopFilterParameters(gold);
+
+ gold.delta_enabled = true;
+ gold.delta_update = true;
+ gold.ref_deltas[0] = 20;
+ gold.mode_deltas[0] = -20;
+ data.SetBit(27, 1); // delta_enabled.
+ data.AppendBit(1); // delta_update.
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ if (i == 0) {
+ data.AppendBit(1); // update_ref_delta.
+ data.AppendInverseSignedLiteral(6, gold.ref_deltas[0]); // ref_delta.
+ } else {
+ data.AppendBit(0); // update_ref_delta.
+ }
+ }
+ for (int i = 0; i < kLoopFilterMaxModeDeltas; ++i) {
+ if (i == 0) {
+ data.AppendBit(1); // update_mode_delta.
+ data.AppendInverseSignedLiteral(6, gold.mode_deltas[0]); // mode_delta.
+ } else {
+ data.AppendBit(0); // update_mode_delta.
+ }
+ }
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->primary_reference_frame = kPrimaryReferenceNone;
+ ASSERT_TRUE(ObuParseLoopFilterParameters());
+ VerifyLoopFilterParameters(gold);
+}
+
+TEST_F(ObuParserTest, QuantizerParameters) {
+ QuantizerParameters gold = {};
+ gold.base_index = 48;
+
+ BytesAndBits data;
+ data.AppendLiteral(8, gold.base_index);
+ data.AppendLiteral(3, 0); // delta_coded.
+ data.AppendBit(0); // use_matrix.
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ ASSERT_TRUE(ObuParseQuantizerParameters());
+ VerifyQuantizerParameters(gold);
+}
+
+TEST_F(ObuParserTest, QuantizerParametersMonochrome) {
+ QuantizerParameters gold = {};
+ gold.base_index = 48;
+
+ BytesAndBits data;
+ data.AppendLiteral(8, gold.base_index);
+ data.AppendBit(0); // delta_coded.
+ data.AppendBit(0); // use_matrix.
+ // The quantizer parameters end here. Add a 1 bit. It should not be parsed.
+ data.AppendBit(1); // Would be segmentation_enabled in a bitstream.
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->color_config.is_monochrome = true;
+ ASSERT_TRUE(ObuParseQuantizerParameters());
+ VerifyQuantizerParameters(gold);
+}
+
+TEST_F(ObuParserTest, QuantizerParametersDeltaCoded) {
+ QuantizerParameters gold = {};
+ gold.base_index = 48;
+ gold.delta_dc[kPlaneY] = -30;
+
+ BytesAndBits data;
+ data.AppendLiteral(8, gold.base_index);
+ data.AppendBit(1); // delta_coded.
+ data.AppendInverseSignedLiteral(6, gold.delta_dc[kPlaneY]);
+ data.AppendLiteral(2, 0); // delta_coded u dc/ac.
+ data.AppendBit(0); // use_matrix.
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ ASSERT_TRUE(ObuParseQuantizerParameters());
+ VerifyQuantizerParameters(gold);
+
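+ // Bit offset bookkeeping: base_index occupies bits 0-7, bit 8 is the Y
+ // delta_coded flag, and each AppendInverseSignedLiteral(6, x) call writes
+ // 7 bits (a 6-bit magnitude plus a sign bit), so the U DC delta_coded flag
+ // lands on bit 16 below.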
+ gold.delta_dc[kPlaneU] = -40;
+ gold.delta_dc[kPlaneV] = gold.delta_dc[kPlaneU];
+ data.SetBit(16, 1); // delta_coded.
+ data.InsertInverseSignedLiteral(17, 6, gold.delta_dc[kPlaneU]);
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ ASSERT_TRUE(ObuParseQuantizerParameters());
+ VerifyQuantizerParameters(gold);
+
+ gold.delta_ac[kPlaneU] = 50;
+ gold.delta_ac[kPlaneV] = gold.delta_ac[kPlaneU];
+ data.SetBit(24, 1); // delta_coded.
+ data.InsertInverseSignedLiteral(25, 6, gold.delta_ac[kPlaneU]);
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ ASSERT_TRUE(ObuParseQuantizerParameters());
+ VerifyQuantizerParameters(gold);
+
+ gold.delta_dc[kPlaneV] = 60;
+ gold.delta_ac[kPlaneV] = 0;
+ data.InsertBit(16, 1); // diff_uv_delta.
+ data.InsertBit(33, 1); // delta_coded.
+ data.InsertInverseSignedLiteral(34, 6, gold.delta_dc[kPlaneV]);
+ data.InsertBit(41, 0); // delta_coded.
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->color_config.separate_uv_delta_q = true;
+ ASSERT_TRUE(ObuParseQuantizerParameters());
+ VerifyQuantizerParameters(gold);
+
+ gold.delta_ac[kPlaneV] = -20;
+ data.SetBit(41, 1); // delta_coded.
+ data.InsertInverseSignedLiteral(42, 6, gold.delta_ac[kPlaneV]);
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->color_config.separate_uv_delta_q = true;
+ ASSERT_TRUE(ObuParseQuantizerParameters());
+ VerifyQuantizerParameters(gold);
+}
+
+TEST_F(ObuParserTest, QuantizerParametersUseQmatrix) {
+ QuantizerParameters gold = {};
+ gold.base_index = 48;
+ gold.use_matrix = true;
+ gold.matrix_level[kPlaneY] = 3;
+ gold.matrix_level[kPlaneU] = 6;
+ gold.matrix_level[kPlaneV] = gold.matrix_level[kPlaneU];
+
+ // Test three cases.
+ // 1. separate_uv_delta_q = false (which implies diff_uv_delta = false).
+ BytesAndBits data;
+ data.AppendLiteral(8, gold.base_index);
+ data.AppendLiteral(3, 0); // delta_coded.
+ data.AppendBit(static_cast<uint8_t>(gold.use_matrix));
+ data.AppendLiteral(4, gold.matrix_level[kPlaneY]);
+ data.AppendLiteral(4, gold.matrix_level[kPlaneU]);
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ ASSERT_TRUE(ObuParseQuantizerParameters());
+ VerifyQuantizerParameters(gold);
+
+ // 2. separate_uv_delta_q = true and diff_uv_delta = false.
+ gold.matrix_level[kPlaneV] = 5;
+ data.InsertBit(9, 0); // diff_uv_delta.
+ data.AppendLiteral(4, gold.matrix_level[kPlaneV]);
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->color_config.separate_uv_delta_q = true;
+ ASSERT_TRUE(ObuParseQuantizerParameters());
+ VerifyQuantizerParameters(gold);
+
+ // 3. separate_uv_delta_q = true and diff_uv_delta = true.
+ data.SetBit(9, 1); // diff_uv_delta.
+ data.InsertLiteral(12, 2, 0); // delta_coded.
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->color_config.separate_uv_delta_q = true;
+ ASSERT_TRUE(ObuParseQuantizerParameters());
+ VerifyQuantizerParameters(gold);
+}
+
+TEST_F(ObuParserTest, SegmentationParameters) {
+ const int kPrimaryReferenceNotNone = 1;
+ const int kPrevFrameIndexNotNone = 2;
+
+ // Set up decoder_state_ with a previous frame containing saved segmentation
+ // parameters.
+ decoder_state_.reference_frame[kPrevFrameIndexNotNone] =
+ buffer_pool_->GetFreeBuffer();
+ ASSERT_NE(decoder_state_.reference_frame[kPrevFrameIndexNotNone], nullptr);
+ Segmentation prev_segmentation = {};
+ prev_segmentation.feature_enabled[2][0] = true;
+ prev_segmentation.feature_enabled[5][0] = true;
+ prev_segmentation.last_active_segment_id = 5;
+ decoder_state_.reference_frame[kPrevFrameIndexNotNone]
+ ->SetSegmentationParameters(prev_segmentation);
+
+ Segmentation gold;
+ memset(&gold, 0, sizeof(gold));
+
+ BytesAndBits data;
+ data.AppendBit(0); // segmentation_enabled.
+
+ // Since segmentation_enabled is false, we expect the parameters to be all
+ // zero/false.
+ ASSERT_TRUE(ParseSegmentationParameters(
+ data.GenerateData(), kPrimaryReferenceNotNone, kPrevFrameIndexNotNone));
+ VerifySegmentationParameters(gold);
+
+ gold.enabled = true;
+ gold.update_map = true;
+ gold.temporal_update = true;
+ data.SetBit(0, static_cast<uint8_t>(gold.enabled));
+ data.AppendBit(static_cast<uint8_t>(gold.update_map));
+ data.AppendBit(static_cast<uint8_t>(gold.temporal_update));
+ data.AppendBit(static_cast<uint8_t>(gold.update_data));
+
+ // Since update_data is false, we expect the parameters to be loaded from the
+ // previous frame in |decoder_state_|. So change |gold| accordingly.
+ gold.feature_enabled[2][0] = true;
+ gold.feature_enabled[5][0] = true;
+ gold.last_active_segment_id = 5;
+
+ ASSERT_TRUE(ParseSegmentationParameters(
+ data.GenerateData(), kPrimaryReferenceNotNone, kPrevFrameIndexNotNone));
+ VerifySegmentationParameters(gold);
+
+ OverrideSegmentation(&data, &gold, 3);
+
+ ASSERT_TRUE(ParseSegmentationParameters(
+ data.GenerateData(), kPrimaryReferenceNotNone, kPrevFrameIndexNotNone));
+ VerifySegmentationParameters(gold);
+
+ // If primary_ref_frame is kPrimaryReferenceNone, these three fields are not
+ // coded and take their implied values.
+ data.RemoveBit(1); // segmentation_update_map.
+ data.RemoveBit(1); // segmentation_temporal_update.
+ data.RemoveBit(1); // segmentation_update_data.
+ gold.update_map = true;
+ gold.temporal_update = false;
+ gold.update_data = true;
+
+ // Since update_data is true, we expect the parameters to be read from
+ // |data|.
+ ASSERT_TRUE(ParseSegmentationParameters(data.GenerateData(),
+ kPrimaryReferenceNone, 0));
+ VerifySegmentationParameters(gold);
+}
+
+TEST_F(ObuParserTest, QuantizerIndexDeltaParameters) {
+ BytesAndBits data;
+ data.AppendBit(1); // delta_q_present.
+ data.AppendLiteral(2, 2); // delta_q_res.
+
+ Delta gold;
+ memset(&gold, 0, sizeof(gold));
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ ASSERT_TRUE(ObuParseQuantizerIndexDeltaParameters());
+ VerifyDeltaParameters(gold, obu_->frame_header().delta_q);
+
+ gold.present = true;
+ gold.scale = 2;
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->quantizer.base_index = 40;
+ ASSERT_TRUE(ObuParseQuantizerIndexDeltaParameters());
+ VerifyDeltaParameters(gold, obu_->frame_header().delta_q);
+}
+
+TEST_F(ObuParserTest, LoopFilterDeltaParameters) {
+ BytesAndBits data;
+ data.AppendBit(1); // delta_lf_present.
+ data.AppendLiteral(2, 2); // delta_lf_res.
+ data.AppendBit(1); // delta_lf_multi.
+
+ Delta gold;
+ memset(&gold, 0, sizeof(gold));
+
+ // delta_q_present is false, so loop filter delta will not be read.
+ ASSERT_TRUE(Init(data.GenerateData()));
+ ASSERT_TRUE(ObuParseLoopFilterDeltaParameters());
+ VerifyDeltaParameters(gold, obu_->frame_header().delta_lf);
+
+ // allow_intrabc is true, so loop filter delta will not be read.
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->delta_q.present = true;
+ obu_frame_header_->allow_intrabc = true;
+ ASSERT_TRUE(ObuParseLoopFilterDeltaParameters());
+ VerifyDeltaParameters(gold, obu_->frame_header().delta_lf);
+
+ gold.present = true;
+ gold.scale = 2;
+ gold.multi = true;
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->delta_q.present = true;
+ ASSERT_TRUE(ObuParseLoopFilterDeltaParameters());
+ VerifyDeltaParameters(gold, obu_->frame_header().delta_lf);
+}
+
+TEST_F(ObuParserTest, ComputeSegmentLosslessAndQIndex) {
+ BytesAndBits data;
+ data.AppendBit(0); // dummy.
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+
+ // Segmentation is disabled. All quantizers are 0.
+ ObuComputeSegmentLosslessAndQIndex();
+ EXPECT_TRUE(obu_->frame_header().coded_lossless);
+ EXPECT_TRUE(obu_->frame_header().upscaled_lossless);
+ for (const auto& qindex : obu_->frame_header().segmentation.qindex) {
+ EXPECT_EQ(qindex, 0);
+ }
+
+ // Segmentation is enabled. All quantizers are zero.
+ obu_frame_header_->segmentation.enabled = true;
+ ObuComputeSegmentLosslessAndQIndex();
+ EXPECT_TRUE(obu_->frame_header().coded_lossless);
+ EXPECT_TRUE(obu_->frame_header().upscaled_lossless);
+ for (const auto& qindex : obu_->frame_header().segmentation.qindex) {
+ EXPECT_EQ(qindex, 0);
+ }
+
+ // Segmentation is enabled. All quantizers are zero. upscaled_width != width.
+ obu_frame_header_->segmentation.enabled = true;
+ obu_frame_header_->upscaled_width = 100;
+ ObuComputeSegmentLosslessAndQIndex();
+ EXPECT_TRUE(obu_->frame_header().coded_lossless);
+ EXPECT_FALSE(obu_->frame_header().upscaled_lossless);
+ for (const auto& qindex : obu_->frame_header().segmentation.qindex) {
+ EXPECT_EQ(qindex, 0);
+ }
+
+ // Segmentation is disabled. Some quantizer deltas are nonzero.
+ obu_frame_header_->segmentation.enabled = false;
+ obu_frame_header_->quantizer.delta_dc[kPlaneY] = 40;
+ ObuComputeSegmentLosslessAndQIndex();
+ EXPECT_FALSE(obu_->frame_header().coded_lossless);
+ EXPECT_FALSE(obu_->frame_header().upscaled_lossless);
+ for (const auto& qindex : obu_->frame_header().segmentation.qindex) {
+ EXPECT_EQ(qindex, 0);
+ }
+
+ // Segmentation is enabled. Quantizer base index is nonzero.
+ obu_frame_header_->segmentation.enabled = true;
+ obu_frame_header_->quantizer.delta_dc[kPlaneY] = 0;
+ obu_frame_header_->quantizer.base_index = 40;
+ ObuComputeSegmentLosslessAndQIndex();
+ EXPECT_FALSE(obu_->frame_header().coded_lossless);
+ EXPECT_FALSE(obu_->frame_header().upscaled_lossless);
+ for (const auto& qindex : obu_->frame_header().segmentation.qindex) {
+ EXPECT_EQ(qindex, 40);
+ }
+}
+
+TEST_F(ObuParserTest, CdefParameters) {
+ Cdef gold;
+ memset(&gold, 0, sizeof(gold));
+ const int coeff_shift = 2; // bitdepth - 8.
+ gold.damping = 3 + coeff_shift;
+
+ BytesAndBits data;
+ data.AppendBit(0); // dummy.
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->color_config.bitdepth = 10;
+ ASSERT_TRUE(ObuParseCdefParameters());
+ // Cdef will be {0} except for damping because enable_cdef is false.
+ VerifyCdefParameters(gold);
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->enable_cdef = true;
+ obu_sequence_header_->color_config.bitdepth = 10;
+ obu_frame_header_->coded_lossless = true;
+ ASSERT_TRUE(ObuParseCdefParameters());
+ // Cdef will be {0} except for damping because coded_lossless is true.
+ VerifyCdefParameters(gold);
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->enable_cdef = true;
+ obu_sequence_header_->color_config.bitdepth = 10;
+ obu_frame_header_->allow_intrabc = true;
+ ASSERT_TRUE(ObuParseCdefParameters());
+ // Cdef will be {0} except for damping because allow_intrabc is true.
+ VerifyCdefParameters(gold);
+
+ gold.damping = 5;
+ gold.bits = 1;
+ data.Clear();
+ data.AppendLiteral(2, gold.damping - 3); // cdef_damping_minus3.
+ gold.damping += coeff_shift;
+ data.AppendLiteral(2, gold.bits); // cdef_bits.
+ for (int i = 0; i < 2; ++i) {
+ gold.y_primary_strength[i] = 10;
+ gold.y_secondary_strength[i] = (i == 0) ? 2 : 3;
+ gold.uv_primary_strength[i] = 12;
+ gold.uv_secondary_strength[i] = (i == 1) ? 2 : 3;
+ data.AppendLiteral(4, gold.y_primary_strength[i]);
+ data.AppendLiteral(2, gold.y_secondary_strength[i]);
+ data.AppendLiteral(4, gold.uv_primary_strength[i]);
+ data.AppendLiteral(2, gold.uv_secondary_strength[i]);
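+ // Per the spec, a coded secondary strength of 3 is interpreted as 4, so
+ // bump the gold values to match what the parser produces.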
+ if (gold.y_secondary_strength[i] == 3) ++gold.y_secondary_strength[i];
+ if (gold.uv_secondary_strength[i] == 3) ++gold.uv_secondary_strength[i];
+ gold.y_primary_strength[i] <<= coeff_shift;
+ gold.uv_primary_strength[i] <<= coeff_shift;
+ gold.y_secondary_strength[i] <<= coeff_shift;
+ gold.uv_secondary_strength[i] <<= coeff_shift;
+ }
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->enable_cdef = true;
+ obu_sequence_header_->color_config.bitdepth = 10;
+ ASSERT_TRUE(ObuParseCdefParameters());
+ VerifyCdefParameters(gold);
+}
+
+TEST_F(ObuParserTest, LoopRestorationParameters) {
+ for (bool use_128x128_superblock : testing::Bool()) {
+ SCOPED_TRACE("use_128x128_superblock: " +
+ std::to_string(use_128x128_superblock));
+ LoopRestoration gold;
+ memset(&gold, 0, sizeof(gold));
+
+ BytesAndBits data;
+ data.AppendBit(0); // dummy.
+
+ // enable_restoration is false. Nothing will be read.
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->allow_intrabc = true;
+ obu_frame_header_->coded_lossless = true;
+ ASSERT_TRUE(ObuParseLoopRestorationParameters());
+ VerifyLoopRestorationParameters(gold);
+
+ // allow_intrabc is true. Nothing will be read.
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->allow_intrabc = true;
+ obu_sequence_header_->enable_restoration = true;
+ ASSERT_TRUE(ObuParseLoopRestorationParameters());
+ VerifyLoopRestorationParameters(gold);
+
+ // coded_lossless is true. Nothing will be read.
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->coded_lossless = true;
+ obu_sequence_header_->enable_restoration = true;
+ ASSERT_TRUE(ObuParseLoopRestorationParameters());
+ VerifyLoopRestorationParameters(gold);
+
+ data.Clear();
+ for (int i = 0; i < kMaxPlanes; ++i) {
+ data.AppendLiteral(2, kLoopRestorationTypeNone); // lr_type.
+ }
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->enable_restoration = true;
+ obu_sequence_header_->use_128x128_superblock = use_128x128_superblock;
+ ASSERT_TRUE(ObuParseLoopRestorationParameters());
+ VerifyLoopRestorationParameters(gold);
+
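+ // The restoration unit size is 64 << lr_unit_shift. With 128x128
+ // superblocks the shift gets an implicit +1, so one coded bit yields a
+ // log2 unit size of 8; with 64x64 superblocks lr_unit_shift plus the
+ // optional lr_unit_extra_shift yields a log2 unit size of 7 or 8.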
+ gold.type[0] = gold.type[1] = kLoopRestorationTypeWiener;
+ gold.unit_size_log2[0] = gold.unit_size_log2[1] = gold.unit_size_log2[2] =
+ use_128x128_superblock ? 8 : 7;
+ data.SetLiteral(0, 2, gold.type[0]); // lr_type.
+ data.SetLiteral(2, 2, gold.type[0]); // lr_type.
+ data.AppendBit(1); // lr_unit_shift.
+ if (!use_128x128_superblock) {
+ data.AppendBit(0); // lr_unit_extra_shift.
+ }
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->enable_restoration = true;
+ obu_sequence_header_->use_128x128_superblock = use_128x128_superblock;
+ ASSERT_TRUE(ObuParseLoopRestorationParameters());
+ VerifyLoopRestorationParameters(gold);
+
+ if (!use_128x128_superblock) {
+ gold.unit_size_log2[0] = gold.unit_size_log2[1] = gold.unit_size_log2[2] =
+ 8;
+ data.SetBit(7, 1); // lr_unit_extra_shift.
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->enable_restoration = true;
+ obu_sequence_header_->use_128x128_superblock = use_128x128_superblock;
+ ASSERT_TRUE(ObuParseLoopRestorationParameters());
+ VerifyLoopRestorationParameters(gold);
+ }
+
+ gold.unit_size_log2[1] = gold.unit_size_log2[2] = 7;
+ data.AppendBit(1); // lr_uv_shift.
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_sequence_header_->enable_restoration = true;
+ obu_sequence_header_->use_128x128_superblock = use_128x128_superblock;
+ obu_sequence_header_->color_config.subsampling_x = 1;
+ obu_sequence_header_->color_config.subsampling_y = 1;
+ ASSERT_TRUE(ObuParseLoopRestorationParameters());
+ VerifyLoopRestorationParameters(gold);
+ }
+}
+
+TEST_F(ObuParserTest, TxModeSyntax) {
+ BytesAndBits data;
+ data.AppendBit(1); // tx_mode_select.
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ ASSERT_TRUE(ObuParseTxModeSyntax());
+ EXPECT_EQ(kTxModeSelect, obu_->frame_header().tx_mode);
+
+ data.SetBit(0, 0); // tx_mode_select.
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ ASSERT_TRUE(ObuParseTxModeSyntax());
+ EXPECT_EQ(kTxModeLargest, obu_->frame_header().tx_mode);
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->coded_lossless = true;
+ ASSERT_TRUE(ObuParseTxModeSyntax());
+ EXPECT_EQ(kTxModeOnly4x4, obu_->frame_header().tx_mode);
+}
+
+TEST_F(ObuParserTest, FrameReferenceModeSyntax) {
+ BytesAndBits data;
+ data.AppendBit(0); // dummy.
+
+ ASSERT_TRUE(ParseFrameReferenceModeSyntax(data.GenerateData(), kFrameKey));
+ EXPECT_FALSE(obu_->frame_header().reference_mode_select);
+
+ data.SetBit(0, 1); // reference_mode_select.
+
+ ASSERT_TRUE(ParseFrameReferenceModeSyntax(data.GenerateData(), kFrameInter));
+ EXPECT_TRUE(obu_->frame_header().reference_mode_select);
+}
+
+TEST_F(ObuParserTest, SkipModeParameters) {
+ BytesAndBits data;
+ data.AppendBit(1); // skip_mode_present.
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->frame_type = kFrameKey;
+ ASSERT_FALSE(ObuIsSkipModeAllowed());
+ ASSERT_TRUE(ObuParseSkipModeParameters());
+ EXPECT_FALSE(obu_->frame_header().skip_mode_present);
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->frame_type = kFrameInter;
+ obu_frame_header_->reference_mode_select = true;
+ ASSERT_FALSE(ObuIsSkipModeAllowed());
+ ASSERT_TRUE(ObuParseSkipModeParameters());
+ EXPECT_FALSE(obu_->frame_header().skip_mode_present);
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->frame_type = kFrameInter;
+ obu_frame_header_->reference_mode_select = true;
+ obu_sequence_header_->enable_order_hint = true;
+ obu_sequence_header_->order_hint_bits = 7;
+ obu_sequence_header_->order_hint_shift_bits =
+ Mod32(32 - obu_sequence_header_->order_hint_bits);
+ ASSERT_FALSE(ObuIsSkipModeAllowed());
+ ASSERT_TRUE(ObuParseSkipModeParameters());
+ EXPECT_FALSE(obu_->frame_header().skip_mode_present);
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->frame_type = kFrameInter;
+ obu_frame_header_->reference_mode_select = true;
+ obu_frame_header_->order_hint = 1;
+ decoder_state_.order_hint = 1;
+ obu_sequence_header_->enable_order_hint = true;
+ obu_sequence_header_->order_hint_bits = 7;
+ obu_sequence_header_->order_hint_shift_bits =
+ Mod32(32 - obu_sequence_header_->order_hint_bits);
+ ASSERT_FALSE(ObuIsSkipModeAllowed());
+ ASSERT_TRUE(ObuParseSkipModeParameters());
+ EXPECT_FALSE(obu_->frame_header().skip_mode_present);
+
+ ASSERT_TRUE(Init(data.GenerateData()));
+ for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+ obu_frame_header_->reference_frame_index[i] = i;
+ decoder_state_.reference_order_hint[i] = i;
+ }
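+ // With reference order hints on both sides of the current order hint (1),
+ // skip mode can find the forward and backward references it needs, so it
+ // is allowed.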
+ obu_frame_header_->frame_type = kFrameInter;
+ obu_frame_header_->reference_mode_select = true;
+ obu_frame_header_->order_hint = 1;
+ decoder_state_.order_hint = 1;
+ obu_sequence_header_->enable_order_hint = true;
+ obu_sequence_header_->order_hint_bits = 7;
+ obu_sequence_header_->order_hint_shift_bits =
+ Mod32(32 - obu_sequence_header_->order_hint_bits);
+ ASSERT_TRUE(ObuIsSkipModeAllowed());
+ ASSERT_TRUE(ObuParseSkipModeParameters());
+ EXPECT_TRUE(obu_->frame_header().skip_mode_present);
+}
+
+TEST_F(ObuParserTest, AllowWarpedMotion) {
+ BytesAndBits data;
+ data.AppendBit(0xff); // dummy.
+
+ // IsIntraFrame is true, so nothing will be read.
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->frame_type = kFrameKey;
+ obu_frame_header_->error_resilient_mode = false;
+ obu_sequence_header_->enable_warped_motion = true;
+ ASSERT_TRUE(ObuReadAllowWarpedMotion());
+ EXPECT_FALSE(obu_->frame_header().allow_warped_motion);
+
+ // error_resilient_mode is true, so nothing will be read.
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->frame_type = kFrameInter;
+ obu_frame_header_->error_resilient_mode = true;
+ obu_sequence_header_->enable_warped_motion = true;
+ ASSERT_TRUE(ObuReadAllowWarpedMotion());
+ EXPECT_FALSE(obu_->frame_header().allow_warped_motion);
+
+ // enable_warped_motion is false, so nothing will be read.
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->frame_type = kFrameInter;
+ obu_frame_header_->error_resilient_mode = false;
+ obu_sequence_header_->enable_warped_motion = false;
+ ASSERT_TRUE(ObuReadAllowWarpedMotion());
+ EXPECT_FALSE(obu_->frame_header().allow_warped_motion);
+
+ // allow_warped_motion will be read and equal to true.
+ ASSERT_TRUE(Init(data.GenerateData()));
+ obu_frame_header_->frame_type = kFrameInter;
+ obu_frame_header_->error_resilient_mode = false;
+ obu_sequence_header_->enable_warped_motion = true;
+ ASSERT_TRUE(ObuReadAllowWarpedMotion());
+ EXPECT_TRUE(obu_->frame_header().allow_warped_motion);
+}
+
+TEST_F(ObuParserTest, GlobalMotionParameters) {
+ BytesAndBits data;
+ data.AppendBit(0); // dummy.
+ std::array<GlobalMotion, kNumReferenceFrameTypes> gold;
+ for (int i = kReferenceFrameLast; i <= kReferenceFrameAlternate; ++i) {
+ gold[i].type = kGlobalMotionTransformationTypeIdentity;
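+ // params[2] and params[5] hold the diagonal of the warp matrix; for the
+ // identity model they are 1.0 in kWarpedModelPrecisionBits fixed point and
+ // every other parameter is 0.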
+ for (int j = 0; j < 6; ++j) {
+ gold[i].params[j] = (j % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0;
+ }
+ }
+
+ ASSERT_TRUE(ParseGlobalMotionParameters(data.GenerateData(), kFrameKey));
+ VerifyGlobalMotionParameters(gold);
+
+ data.Clear();
+ for (int i = kReferenceFrameLast; i <= kReferenceFrameAlternate; ++i) {
+ // is_global=1; is_rot_zoom=1; parameter_values;
+ data.AppendBytes(kDefaultGlobalMotionParametersRotZoom);
+
+ // Magic numbers based on kDefaultGlobalMotionParametersRotZoom.
+ gold[i].type = kGlobalMotionTransformationTypeRotZoom;
+ gold[i].params[0] = -73728;
+ gold[i].params[1] = -23552;
+ gold[i].params[2] = 65952;
+ gold[i].params[3] = -62;
+ gold[i].params[4] = 62;
+ gold[i].params[5] = 65952;
+ }
+
+ ASSERT_TRUE(ParseGlobalMotionParameters(data.GenerateData(), kFrameInter));
+ VerifyGlobalMotionParameters(gold);
+
+ data.Clear();
+ for (int i = kReferenceFrameLast; i <= kReferenceFrameAlternate; ++i) {
+ // This bit is not part of the hex string because it would make the whole
+ // string not align to 8 bits. Append it separately so that the rest can
+ // remain a magic hex string.
+ data.AppendBit(1); // is_global.
+ // is_rot_zoom=0; is_translation=0; parameter_values;
+ data.AppendBytes(kDefaultGlobalMotionParametersAffine);
+
+ // Magic numbers based on kDefaultGlobalMotionParametersAffine.
+ gold[i].type = kGlobalMotionTransformationTypeAffine;
+ gold[i].params[4] = -62;
+ }
+
+ ASSERT_TRUE(ParseGlobalMotionParameters(data.GenerateData(), kFrameInter));
+ VerifyGlobalMotionParameters(gold);
+}
+
+TEST_F(ObuParserTest, FilmGrainParameters) {
+ BytesAndBits data;
+ data.AppendBit(0); // dummy.
+
+ // Test film grain not present.
+ FilmGrainParams gold = {};
+ ObuSequenceHeader sequence_header = {};
+ sequence_header.film_grain_params_present = false;
+ ObuFrameHeader frame_header = {};
+ ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header,
+ frame_header));
+ VerifyFilmGrainParameters(gold);
+
+ // Test if show_frame = false and showable_frame = false.
+ data.Clear();
+ gold = {};
+ sequence_header.film_grain_params_present = true;
+ frame_header.show_frame = false;
+ frame_header.showable_frame = false;
+ ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header,
+ frame_header));
+ VerifyFilmGrainParameters(gold);
+
+ // Test if apply_grain = false.
+ data.Clear();
+ gold = {};
+ sequence_header.film_grain_params_present = true;
+ frame_header.show_frame = true;
+ frame_header.showable_frame = true;
+ data.AppendBit(0);
+ ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header,
+ frame_header));
+ VerifyFilmGrainParameters(gold);
+
+ // Test if update_grain = false.
+ data.Clear();
+ gold = {};
+ sequence_header.film_grain_params_present = true;
+ frame_header.show_frame = true;
+ frame_header.showable_frame = true;
+ frame_header.frame_type = kFrameInter;
+ for (auto& index : frame_header.reference_frame_index) {
+ index = 1;
+ }
+ data.AppendBit(1);
+ gold.apply_grain = true;
+ data.AppendLiteral(16, 8);
+ gold.grain_seed = 8;
+ data.AppendBit(0);
+ gold.update_grain = false;
+ data.AppendLiteral(3, 1);
+ gold.reference_index = 1;
+ // Set up decoder_state_ with a previous frame containing saved film grain
+ // parameters.
+ decoder_state_.reference_frame[1] = buffer_pool_->GetFreeBuffer();
+ EXPECT_NE(decoder_state_.reference_frame[1], nullptr);
+ FilmGrainParams prev_grain_params = {};
+ prev_grain_params.apply_grain = true;
+ prev_grain_params.grain_seed = 11;
+ prev_grain_params.update_grain = true;
+ decoder_state_.reference_frame[1]->set_film_grain_params(prev_grain_params);
+ ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header,
+ frame_header));
+ VerifyFilmGrainParameters(gold);
+
+ // Test if update_grain = true and is_monochrome = true.
+ data.Clear();
+ gold = {};
+ frame_header.frame_type = kFrameKey;
+ for (auto& index : frame_header.reference_frame_index) {
+ index = 0;
+ }
+ data.AppendBit(1);
+ gold.apply_grain = true;
+ data.AppendLiteral(16, 8);
+ gold.grain_seed = 8;
+ gold.update_grain = true;
+ data.AppendLiteral(4, 10);
+ gold.num_y_points = 10;
+ for (int i = 0; i < gold.num_y_points; ++i) {
+ data.AppendLiteral(8, 2 * i);
+ gold.point_y_value[i] = 2 * i;
+ data.AppendLiteral(8, i);
+ gold.point_y_scaling[i] = i;
+ }
+ sequence_header.color_config.is_monochrome = true;
+ gold.chroma_scaling_from_luma = false;
+ gold.num_u_points = 0;
+ gold.num_v_points = 0;
+ data.AppendLiteral(2, 3);
+ gold.chroma_scaling = 11;
+ data.AppendLiteral(2, 1);
+ gold.auto_regression_coeff_lag = 1;
+ const int num_pos_luma =
+ 2 * gold.auto_regression_coeff_lag * (gold.auto_regression_coeff_lag + 1);
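+ // 2 * lag * (lag + 1) is the number of luma autoregression coefficients
+ // (numPosLuma in the spec); with lag = 1 this is 4.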
+ for (int i = 0; i < num_pos_luma; ++i) {
+ data.AppendLiteral(8, i + 128);
+ gold.auto_regression_coeff_y[i] = i;
+ }
+ data.AppendLiteral(2, 0);
+ gold.auto_regression_shift = 6;
+ data.AppendLiteral(2, 1);
+ gold.grain_scale_shift = 1;
+ data.AppendBit(1);
+ gold.overlap_flag = true;
+ data.AppendBit(0);
+ gold.clip_to_restricted_range = false;
+ ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header,
+ frame_header));
+ ASSERT_TRUE(
+ obu_->frame_header().frame_type == kFrameInter ||
+ obu_->frame_header().film_grain_params.update_grain); // a implies b.
+ VerifyFilmGrainParameters(gold);
+
+ // Test if update_grain = true and is_monochrome = false.
+ data.Clear();
+ gold = {};
+ frame_header.frame_type = kFrameKey;
+ data.AppendBit(1);
+ gold.apply_grain = true;
+ data.AppendLiteral(16, 8);
+ gold.grain_seed = 8;
+ gold.update_grain = true;
+ data.AppendLiteral(4, 10);
+ gold.num_y_points = 10;
+ for (int i = 0; i < gold.num_y_points; ++i) {
+ data.AppendLiteral(8, 2 * i);
+ gold.point_y_value[i] = 2 * i;
+ data.AppendLiteral(8, i);
+ gold.point_y_scaling[i] = i;
+ }
+ sequence_header.color_config.is_monochrome = false;
+ data.AppendBit(0);
+ gold.chroma_scaling_from_luma = false;
+ data.AppendLiteral(4, 5);
+ gold.num_u_points = 5;
+ for (int i = 0; i < gold.num_u_points; ++i) {
+ data.AppendLiteral(8, 2 * i + 1);
+ gold.point_u_value[i] = 2 * i + 1;
+ data.AppendLiteral(8, i);
+ gold.point_u_scaling[i] = i;
+ }
+ data.AppendLiteral(4, 3);
+ gold.num_v_points = 3;
+ for (int i = 0; i < gold.num_v_points; ++i) {
+ data.AppendLiteral(8, i);
+ gold.point_v_value[i] = i;
+ data.AppendLiteral(8, i + 1);
+ gold.point_v_scaling[i] = i + 1;
+ }
+ data.AppendLiteral(2, 3);
+ gold.chroma_scaling = 11;
+ data.AppendLiteral(2, 1);
+ gold.auto_regression_coeff_lag = 1;
+ const int num_pos_luma2 =
+ 2 * gold.auto_regression_coeff_lag * (gold.auto_regression_coeff_lag + 1);
+ for (int i = 0; i < num_pos_luma2; ++i) {
+ data.AppendLiteral(8, i + 128);
+ gold.auto_regression_coeff_y[i] = i;
+ }
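+ // When num_y_points > 0, chroma uses one extra coefficient
+ // (numPosChroma = numPosLuma + 1), hence the "+ 1" bound below.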
+ for (int i = 0; i < num_pos_luma2 + 1; ++i) {
+ data.AppendLiteral(8, i);
+ gold.auto_regression_coeff_u[i] = i - 128;
+ }
+ for (int i = 0; i < num_pos_luma2 + 1; ++i) {
+ data.AppendLiteral(8, i);
+ gold.auto_regression_coeff_v[i] = i - 128;
+ }
+ data.AppendLiteral(2, 0);
+ gold.auto_regression_shift = 6;
+ data.AppendLiteral(2, 1);
+ gold.grain_scale_shift = 1;
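+ // The multipliers and offsets below are coded unsigned with a bias: the
+ // 8-bit fields subtract 128 and the 9-bit offsets subtract 256, so coding
+ // 2 yields 2 - 128 = -126 and coding 3 in 9 bits yields 3 - 256 = -253.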
+ data.AppendLiteral(8, 2);
+ gold.u_multiplier = -126;
+ data.AppendLiteral(8, 1);
+ gold.u_luma_multiplier = -127;
+ data.AppendLiteral(9, 3);
+ gold.u_offset = -253;
+ data.AppendLiteral(8, 3);
+ gold.v_multiplier = -125;
+ data.AppendLiteral(8, 2);
+ gold.v_luma_multiplier = -126;
+ data.AppendLiteral(9, 1);
+ gold.v_offset = -255;
+ data.AppendBit(1);
+ gold.overlap_flag = true;
+ data.AppendBit(0);
+ gold.clip_to_restricted_range = false;
+ ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header,
+ frame_header));
+ ASSERT_TRUE(
+ obu_->frame_header().frame_type == kFrameInter ||
+ obu_->frame_header().film_grain_params.update_grain); // a implies b.
+ VerifyFilmGrainParameters(gold);
+}
+
+TEST_F(ObuParserTest, TileInfoSyntax) {
+ BytesAndBits data;
+ TileInfo gold;
+ memset(&gold, 0, sizeof(gold));
+
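+ // The frame is 88x72 in 4x4 units with 128x128 superblocks (32x32 in 4x4
+ // units), so the two uniform tile columns split at 4x4 column 64 and the
+ // two tile rows split at 4x4 row 64.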
+ gold.uniform_spacing = true;
+ gold.tile_columns_log2 = 1;
+ gold.tile_columns = 2;
+ gold.tile_rows_log2 = 1;
+ gold.tile_rows = 2;
+ gold.tile_count = 4;
+ gold.tile_column_start[1] = 64;
+ gold.tile_column_start[2] = 88;
+ gold.tile_row_start[1] = 64;
+ gold.tile_row_start[2] = 72;
+ gold.context_update_id = 3;
+ gold.tile_size_bytes = 4;
+ data.AppendBit(static_cast<uint8_t>(gold.uniform_spacing));
+ data.AppendBit(1); // increment_tile_cols_log2.
+ data.AppendBit(0); // increment_tile_cols_log2.
+ data.AppendBit(1); // increment_tile_rows_log2.
+ data.AppendBit(0); // increment_tile_rows_log2.
+ data.AppendBit(1); // context_update_id, tile_columns_log2 + tile_rows_log2 bits.
+ data.AppendBit(1);
+ data.AppendLiteral(2, gold.tile_size_bytes - 1);
+
+ ASSERT_TRUE(ParseTileInfoSyntax(data.GenerateData(), 88, 72, true));
+ VerifyTileInfoParameters(gold);
+
+ gold.uniform_spacing = false;
+ gold.tile_column_width_in_superblocks[0] = 2;
+ gold.tile_column_width_in_superblocks[1] = 1;
+ gold.tile_row_height_in_superblocks[0] = 2;
+ gold.tile_row_height_in_superblocks[1] = 1;
+
+ data.SetBit(0, static_cast<uint8_t>(gold.uniform_spacing));
+ // The next 4 bits remain the same except now they represent f(w - 1) and
+ // extra_bit in DecodeUniform. All the subsequent bits are unchanged and
+ // represent the same thing as above.
+
+ ASSERT_TRUE(ParseTileInfoSyntax(data.GenerateData(), 88, 72, true));
+ VerifyTileInfoParameters(gold);
+
+ // No tiles.
+ memset(&gold, 0, sizeof(gold));
+ gold.uniform_spacing = true;
+ gold.tile_columns = 1;
+ gold.tile_rows = 1;
+ gold.tile_count = 1;
+ gold.tile_column_start[1] = 88;
+ gold.tile_row_start[1] = 72;
+ data.Clear();
+ data.AppendBit(static_cast<uint8_t>(gold.uniform_spacing));
+ data.AppendBit(0); // tile_cols_log2.
+ data.AppendBit(0); // tile_rows_log2.
+
+ ASSERT_TRUE(ParseTileInfoSyntax(data.GenerateData(), 88, 72, true));
+ VerifyTileInfoParameters(gold);
+
+ // 64x64 superblocks. No tiles.
+ gold.tile_column_start[1] = 640;
+ gold.tile_row_start[1] = 360;
+
+ ASSERT_TRUE(ParseTileInfoSyntax(data.GenerateData(), 640, 360, false));
+ VerifyTileInfoParameters(gold);
+}
+
+TEST_F(ObuParserTest, MetadataUnknownType) {
+ BytesAndBits data;
+ // The metadata_type 10 is a user private value (6-31).
+ data.AppendLiteral(8, 10); // metadata_type.
+ // The Note in Section 5.8.1 says "Decoders should ignore the entire OBU if
+ // they do not understand the metadata_type."
+ ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, MetadataHdrCll) {
+ BytesAndBits data;
+ ObuMetadataHdrCll gold;
+ gold.max_cll = 25;
+ gold.max_fall = 100;
+
+ data.AppendLiteral(8, kMetadataTypeHdrContentLightLevel);
+ data.AppendLiteral(16, gold.max_cll);
+ data.AppendLiteral(16, gold.max_fall);
+
+ ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+ VerifyMetadataHdrCll(gold);
+}
+
+TEST_F(ObuParserTest, MetadataHdrMdcv) {
+ BytesAndBits data;
+ ObuMetadataHdrMdcv gold;
+ for (int i = 0; i < 3; ++i) {
+ gold.primary_chromaticity_x[i] = 0;
+ gold.primary_chromaticity_y[i] = 0;
+ }
+ gold.white_point_chromaticity_x = 250;
+ gold.white_point_chromaticity_y = 2500;
+ gold.luminance_max = 6000;
+ gold.luminance_min = 3000;
+
+ data.AppendLiteral(8, kMetadataTypeHdrMasteringDisplayColorVolume);
+ for (int i = 0; i < 3; ++i) {
+ data.AppendLiteral(16, gold.primary_chromaticity_x[i]);
+ data.AppendLiteral(16, gold.primary_chromaticity_y[i]);
+ }
+ data.AppendLiteral(16, gold.white_point_chromaticity_x);
+ data.AppendLiteral(16, gold.white_point_chromaticity_y);
+ data.AppendLiteral(32, gold.luminance_max);
+ data.AppendLiteral(32, gold.luminance_min);
+
+ ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+ VerifyMetadataHdrMdcv(gold);
+}
+
+TEST_F(ObuParserTest, MetadataScalability) {
+ BytesAndBits data;
+
+ data.AppendLiteral(8, kMetadataTypeScalability);
+ data.AppendLiteral(8, 0); // scalability_mode_idc
+
+ ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, MetadataItutT35) {
+ BytesAndBits data;
+ ObuMetadataItutT35 gold;
+ gold.country_code = 0xA6; // 1 0 1 0 0 1 1 0 Switzerland
+ DynamicBuffer<uint8_t> payload_bytes;
+ ASSERT_TRUE(payload_bytes.Resize(10));
+ gold.payload_bytes = payload_bytes.get();
+ for (int i = 0; i < 10; ++i) {
+ gold.payload_bytes[i] = 9 - i;
+ }
+ gold.payload_size = 10;
+
+ data.AppendLiteral(8, kMetadataTypeItutT35);
+ data.AppendLiteral(8, gold.country_code);
+ for (int i = 0; i < 10; ++i) {
+ data.AppendLiteral(8, 9 - i);
+ }
+ // For the kMetadataTypeItutT35 metadata type, we must include the trailing
+ // bit so that the end of the itu_t_t35_payload_bytes can be identified.
+ data.AppendLiteral(8, 0x80);
+ data.AppendLiteral(8, 0x00);
+ data.AppendLiteral(8, 0x00);
+
+ ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+ VerifyMetadataItutT35(gold);
+
+ gold.country_code = 0xFF;
+ gold.country_code_extension_byte = 10;
+
+ data.SetLiteral(8, 8, gold.country_code);
+ data.InsertLiteral(16, 8, gold.country_code_extension_byte);
+
+ ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+ VerifyMetadataItutT35(gold);
+}
+
+TEST_F(ObuParserTest, MetadataTimecode) {
+ BytesAndBits data;
+
+ data.AppendLiteral(8, kMetadataTypeTimecode);
+ data.AppendLiteral(5, 0); // counting_type
+ data.AppendBit(1); // full_timestamp_flag
+ data.AppendBit(0); // discontinuity_flag
+ data.AppendBit(0); // cnt_dropped_flag
+ data.AppendLiteral(9, 8); // n_frames
+ data.AppendLiteral(6, 59); // seconds_value
+ data.AppendLiteral(6, 59); // minutes_value
+ data.AppendLiteral(5, 23); // hours_value
+ data.AppendLiteral(5, 0); // time_offset_length
+
+ ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, MetadataTimecodeInvalidSecondsValue) {
+ BytesAndBits data;
+
+ data.AppendLiteral(8, kMetadataTypeTimecode);
+ data.AppendLiteral(5, 0); // counting_type
+ data.AppendBit(1); // full_timestamp_flag
+ data.AppendBit(0); // discontinuity_flag
+ data.AppendBit(0); // cnt_dropped_flag
+ data.AppendLiteral(9, 8); // n_frames
+ data.AppendLiteral(6, 60); // seconds_value
+ data.AppendLiteral(6, 59); // minutes_value
+ data.AppendLiteral(5, 23); // hours_value
+ data.AppendLiteral(5, 0); // time_offset_length
+
+ EXPECT_FALSE(ParseMetadata(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, MetadataTimecodeInvalidMinutesValue) {
+ BytesAndBits data;
+
+ data.AppendLiteral(8, kMetadataTypeTimecode);
+ data.AppendLiteral(5, 0); // counting_type
+ data.AppendBit(1); // full_timestamp_flag
+ data.AppendBit(0); // discontinuity_flag
+ data.AppendBit(0); // cnt_dropped_flag
+ data.AppendLiteral(9, 8); // n_frames
+ data.AppendLiteral(6, 59); // seconds_value
+ data.AppendLiteral(6, 60); // minutes_value
+ data.AppendLiteral(5, 23); // hours_value
+ data.AppendLiteral(5, 0); // time_offset_length
+
+ EXPECT_FALSE(ParseMetadata(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, MetadataTimecodeInvalidHoursValue) {
+ BytesAndBits data;
+
+ data.AppendLiteral(8, kMetadataTypeTimecode);
+ data.AppendLiteral(5, 0); // counting_type
+ data.AppendBit(1); // full_timestamp_flag
+ data.AppendBit(0); // discontinuity_flag
+ data.AppendBit(0); // cnt_dropped_flag
+ data.AppendLiteral(9, 8); // n_frames
+ data.AppendLiteral(6, 59); // seconds_value
+ data.AppendLiteral(6, 59); // minutes_value
+ data.AppendLiteral(5, 24); // hours_value
+ data.AppendLiteral(5, 0); // time_offset_length
+
+ EXPECT_FALSE(ParseMetadata(data.GenerateData()));
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_POST_FILTER_H_
+#define LIBGAV1_SRC_POST_FILTER_H_
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/threadpool.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// This class applies in-loop filtering for each frame after it is
+// reconstructed. The in-loop filtering contains all post processing filtering
+// for the reconstructed frame, including deblock filter, CDEF, superres,
+// and loop restoration.
+// Historically, for example in libaom, loop filter refers to deblock filter.
+// To avoid name conflicts, we call this class PostFilter (post processing).
+// In-loop post filtering order is:
+ // deblock --> CDEF --> super resolution --> loop restoration.
+ // When CDEF and super resolution are not used, we can combine deblock
+ // and restoration to filter the frame buffer only once.
+class PostFilter {
+ public:
+ // This class does not take ownership of the masks/restoration_info, but it
+ // may change their values.
+ //
+ // The overall flow of data in this class (for both single and multi-threaded
+ // cases) is as follows:
+ // -> Input: |frame_buffer_|.
+ // -> Initialize |source_buffer_|, |cdef_buffer_|, |superres_buffer_| and
+ // |loop_restoration_buffer_|.
+ // -> Deblocking:
+ // * Input: |source_buffer_|
+ // * Output: |source_buffer_|
+ // -> CDEF:
+ // * Input: |source_buffer_|
+ // * Output: |cdef_buffer_|
+ // -> SuperRes:
+ // * Input: |cdef_buffer_|
+ // * Output: |superres_buffer_|
+ // -> Loop Restoration:
+ // * Input: |superres_buffer_|
+ // * Output: |loop_restoration_buffer_|.
+ // -> Now |frame_buffer_| contains the filtered frame.
+ PostFilter(const ObuFrameHeader& frame_header,
+ const ObuSequenceHeader& sequence_header,
+ FrameScratchBuffer* frame_scratch_buffer, YuvBuffer* frame_buffer,
+ const dsp::Dsp* dsp, int do_post_filter_mask);
+
+ // Non-copyable and non-movable.
+ PostFilter(const PostFilter&) = delete;
+ PostFilter& operator=(const PostFilter&) = delete;
+ PostFilter(PostFilter&&) = delete;
+ PostFilter& operator=(PostFilter&&) = delete;
+
+ // The overall function that applies all post processing filtering with
+ // multiple threads.
+ // * The filtering order is:
+ // deblock --> CDEF --> super resolution --> loop restoration.
+ // * The output of each filter is the input for the following filter. A
+ // special case is that loop restoration needs a few rows of the deblocked
+ // frame and the entire cdef filtered frame:
+ // deblock --> CDEF --> super resolution --> loop restoration.
+ //    |                                      ^
+ //    |                                      |
+ //    ----------> super resolution ----------
+ // * Any of these filters could be present or absent.
+ // * |frame_buffer_| points to the decoded frame buffer. When
+ // ApplyFilteringThreaded() is called, |frame_buffer_| is modified by each
+ // of the filters as described below.
+ // Filter behavior (multi-threaded):
+ // * Deblock: In-place filtering. The output is written to |source_buffer_|.
+ // If cdef and loop restoration are both on, then 4 rows (as
+ // specified by |kLoopRestorationBorderRows|) in every 64x64 block
+ // are copied into |loop_restoration_border_|.
+ // * Cdef: In-place filtering. Uses the |source_buffer_| and |cdef_border_| as
+ // the input and the output is written into |cdef_buffer_| (which is
+ // the same as |source_buffer_|).
+ // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| and
+ // |superres_line_buffer_| as the input and the output is written
+ // into |superres_buffer_| (which is just |cdef_buffer_| with a
+ // shift to the top).
+ // * Restoration: Near in-place filtering.
+ // Uses the |superres_buffer_| and |loop_restoration_border_|
+ // as the input and the output is written into
+ // |loop_restoration_buffer_| (which is just |superres_buffer_|
+ // with a shift to the left).
+ void ApplyFilteringThreaded();
+
+ // Does the overall post processing filter for one superblock row starting at
+ // |row4x4| with height 4*|sb4x4|. If |do_deblock| is false, deblocking filter
+ // will not be applied.
+ //
+ // Filter behavior (single-threaded):
+ // * Deblock: In-place filtering. The output is written to |source_buffer_|.
+ // If cdef and loop restoration are both on, then 4 rows (as
+ // specified by |kLoopRestorationBorderRows|) in every 64x64 block
+ // are copied into |loop_restoration_border_|.
+ // * Cdef: In-place filtering. The output is written into |cdef_buffer_|
+ // (which is just |source_buffer_| with a shift to the top-left).
+ // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| as the input
+ // and the output is written into |superres_buffer_| (which is
+ // just |cdef_buffer_| with a shift to the top).
+ // * Restoration: Near in-place filtering.
+ // Uses the |superres_buffer_| and |loop_restoration_border_|
+ // as the input and the output is written into
+ // |loop_restoration_buffer_| (which is just |superres_buffer_|
+ // with a shift to the left or top-left).
+ // Returns the index of the last row whose post processing is complete and can
+ // be used for referencing.
+ int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row,
+ bool do_deblock);
+
+ // Apply deblocking filter in one direction (specified by |loop_filter_type|)
+ // for the superblock row starting at |row4x4_start| for columns starting from
+ // |column4x4_start| in increments of 16 (or 8 for chroma with subsampling)
+ // until the smallest multiple of 16 that is >= |column4x4_end| or until
+ // |frame_header_.columns4x4|, whichever is lower. This function must be
+ // called only if |DoDeblock()| returns true.
+ void ApplyDeblockFilter(LoopFilterType loop_filter_type, int row4x4_start,
+ int column4x4_start, int column4x4_end, int sb4x4);
+
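+ // |do_post_filter_mask| bit assignments, as used by the Do*() helpers
+ // below: bit 0 (0x01) deblocking, bit 1 (0x02) CDEF, bit 2 (0x04)
+ // SuperRes and bit 3 (0x08) loop restoration.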
+ static bool DoCdef(const ObuFrameHeader& frame_header,
+ int do_post_filter_mask) {
+ return (frame_header.cdef.bits > 0 ||
+ frame_header.cdef.y_primary_strength[0] > 0 ||
+ frame_header.cdef.y_secondary_strength[0] > 0 ||
+ frame_header.cdef.uv_primary_strength[0] > 0 ||
+ frame_header.cdef.uv_secondary_strength[0] > 0) &&
+ (do_post_filter_mask & 0x02) != 0;
+ }
+ bool DoCdef() const { return do_cdef_; }
+ // If the filter levels for the Y plane (0 for vertical, 1 for horizontal)
+ // are all zero, the deblock filter will not be applied.
+ static bool DoDeblock(const ObuFrameHeader& frame_header,
+ uint8_t do_post_filter_mask) {
+ return (frame_header.loop_filter.level[0] > 0 ||
+ frame_header.loop_filter.level[1] > 0) &&
+ (do_post_filter_mask & 0x01) != 0;
+ }
+ bool DoDeblock() const { return do_deblock_; }
+
+ uint8_t GetZeroDeltaDeblockFilterLevel(int segment_id, int level_index,
+ ReferenceFrameType type,
+ int mode_id) const {
+ return deblock_filter_levels_[segment_id][level_index][type][mode_id];
+ }
+ // Computes the deblock filter levels using |delta_lf| and stores them in
+ // |deblock_filter_levels|.
+ void ComputeDeblockFilterLevels(
+ const int8_t delta_lf[kFrameLfCount],
+ uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount]
+ [kNumReferenceFrameTypes][2]) const;
+ // Returns true if loop restoration will be performed for the given parameters
+ // and mask.
+ static bool DoRestoration(const LoopRestoration& loop_restoration,
+ uint8_t do_post_filter_mask, int num_planes) {
+ if (num_planes == kMaxPlanesMonochrome) {
+ return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone &&
+ (do_post_filter_mask & 0x08) != 0;
+ }
+ return (loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone ||
+ loop_restoration.type[kPlaneU] != kLoopRestorationTypeNone ||
+ loop_restoration.type[kPlaneV] != kLoopRestorationTypeNone) &&
+ (do_post_filter_mask & 0x08) != 0;
+ }
+ bool DoRestoration() const { return do_restoration_; }
+
+ // Returns a pointer to the unfiltered buffer. This is used by the Tile class
+ // to determine where to write the output of the tile decoding process taking
+ // in-place filtering offsets into consideration.
+ uint8_t* GetUnfilteredBuffer(int plane) { return source_buffer_[plane]; }
+ const YuvBuffer& frame_buffer() const { return frame_buffer_; }
+
+ // Returns true if SuperRes will be performed for the given frame header and
+ // mask.
+ static bool DoSuperRes(const ObuFrameHeader& frame_header,
+ uint8_t do_post_filter_mask) {
+ return frame_header.width != frame_header.upscaled_width &&
+ (do_post_filter_mask & 0x04) != 0;
+ }
+ bool DoSuperRes() const { return do_superres_; }
+ LoopRestorationInfo* restoration_info() const { return restoration_info_; }
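+ // Converts a position (|row|, |column|) in luma pixel units into a byte
+ // offset into |base_buffer| for |plane|, accounting for chroma subsampling
+ // and the pixel size (|pixel_size_log2_| is 0 for 8-bit buffers and 1 for
+ // 16-bit buffers).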
+ uint8_t* GetBufferOffset(uint8_t* base_buffer, int stride, Plane plane,
+ int row, int column) const {
+ return base_buffer + (row >> subsampling_y_[plane]) * stride +
+ ((column >> subsampling_x_[plane]) << pixel_size_log2_);
+ }
+ uint8_t* GetSourceBuffer(Plane plane, int row4x4, int column4x4) const {
+ return GetBufferOffset(source_buffer_[plane], frame_buffer_.stride(plane),
+ plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+ }
+ uint8_t* GetCdefBuffer(Plane plane, int row4x4, int column4x4) const {
+ return GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+ plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+ }
+ uint8_t* GetSuperResBuffer(Plane plane, int row4x4, int column4x4) const {
+ return GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane),
+ plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+ }
+
+ template <typename Pixel>
+ static void ExtendFrame(Pixel* frame_start, int width, int height,
+ ptrdiff_t stride, int left, int right, int top,
+ int bottom);
+
+ private:
+ // The type of the HorizontalDeblockFilter and VerticalDeblockFilter member
+ // functions.
+ using DeblockFilter = void (PostFilter::*)(int row4x4_start, int row4x4_end,
+ int column4x4_start,
+ int column4x4_end);
+ // Functions common to all post filters.
+
+ // Extends the frame by setting each border pixel to the value of its
+ // closest pixel on the frame boundary.
+ void ExtendFrameBoundary(uint8_t* frame_start, int width, int height,
+ ptrdiff_t stride, int left, int right, int top,
+ int bottom) const;
+ // Extends the frame boundary for referencing if the frame will be saved as
+ // a reference frame.
+ void ExtendBordersForReferenceFrame();
+ // Copies the deblocked pixels needed for loop restoration.
+ void CopyDeblockedPixels(Plane plane, int row4x4);
+ // Copies the border for one superblock row. If |for_loop_restoration| is
+ // true, then it assumes that the border extension is being performed for the
+ // input of the loop restoration process. If |for_loop_restoration| is false,
+ // then it assumes that the border extension is being performed for using the
+ // current frame as a reference frame. In this case, |progress_row_| is also
+ // updated.
+ void CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4,
+ bool for_loop_restoration);
+ // Sets up the |loop_restoration_border_| for loop restoration.
+ // This is called when there is no CDEF filter. We copy rows from
+ // |superres_buffer_| and do the line extension.
+ void SetupLoopRestorationBorder(int row4x4_start);
+ // This is called when there is CDEF filter. We copy rows from
+ // |source_buffer_|, apply superres and do the line extension.
+ void SetupLoopRestorationBorder(int row4x4_start, int sb4x4);
+ // Returns true if we can perform border extension in loop (i.e., without
+ // waiting until the entire frame is decoded). If intra_block_copy is true,
+ // we do in-loop border extension only if the upscaled_width is the same as
+ // 4 * columns4x4. Otherwise, we cannot do in-loop border extension since
+ // those pixels may be used by intra block copy.
+ bool DoBorderExtensionInLoop() const {
+ return !frame_header_.allow_intrabc ||
+ frame_header_.upscaled_width ==
+ MultiplyBy4(frame_header_.columns4x4);
+ }
+ template <typename Pixel>
+ void CopyPlane(const Pixel* src, ptrdiff_t src_stride, int width, int height,
+ Pixel* dst, ptrdiff_t dst_stride) {
+ assert(height > 0);
+ do {
+ memcpy(dst, src, width * sizeof(Pixel));
+ src += src_stride;
+ dst += dst_stride;
+ } while (--height != 0);
+ }
+
+ // Worker function used for multi-threaded implementation of Deblocking, CDEF
+ // and Loop Restoration.
+ using WorkerFunction = void (PostFilter::*)(std::atomic<int>* row4x4_atomic);
+ // Schedules |worker| jobs to the |thread_pool_|, runs them in the calling
+ // thread and returns once all the jobs are completed.
+ void RunJobs(WorkerFunction worker);
+
+ // Functions for the Deblocking filter.
+
+ bool GetHorizontalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+ uint8_t* level, int* step,
+ int* filter_length) const;
+ void GetHorizontalDeblockFilterEdgeInfoUV(int row4x4, int column4x4,
+ uint8_t* level_u, uint8_t* level_v,
+ int* step,
+ int* filter_length) const;
+ bool GetVerticalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+ BlockParameters* const* bp_ptr,
+ uint8_t* level, int* step,
+ int* filter_length) const;
+ void GetVerticalDeblockFilterEdgeInfoUV(int column4x4,
+ BlockParameters* const* bp_ptr,
+ uint8_t* level_u, uint8_t* level_v,
+ int* step, int* filter_length) const;
+ void HorizontalDeblockFilter(int row4x4_start, int row4x4_end,
+ int column4x4_start, int column4x4_end);
+ void VerticalDeblockFilter(int row4x4_start, int row4x4_end,
+ int column4x4_start, int column4x4_end);
+ // HorizontalDeblockFilter and VerticalDeblockFilter must have the correct
+ // signature.
+ static_assert(std::is_same<decltype(&PostFilter::HorizontalDeblockFilter),
+ DeblockFilter>::value,
+ "");
+ static_assert(std::is_same<decltype(&PostFilter::VerticalDeblockFilter),
+ DeblockFilter>::value,
+ "");
+ // Worker function used for multi-threaded deblocking.
+ template <LoopFilterType loop_filter_type>
+ void DeblockFilterWorker(std::atomic<int>* row4x4_atomic);
+ static_assert(
+ std::is_same<
+ decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>),
+ WorkerFunction>::value,
+ "");
+ static_assert(
+ std::is_same<
+ decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>),
+ WorkerFunction>::value,
+ "");
+
+ // Functions for the cdef filter.
+
+ // Copies the deblocked pixels necessary for use by the multi-threaded cdef
+ // implementation into |cdef_border_|.
+ void SetupCdefBorder(int row4x4);
+ // This function prepares the input source block for cdef filtering. The input
+ // source block contains a 12x12 block, with the inner 8x8 as the desired
+ // filter region. If the 12x12 block includes out-of-frame pixels, they are
+ // padded with a large value. This achieves the required behavior defined in
+ // section 5.11.52 of the spec.
+ template <typename Pixel>
+ void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row4x4,
+ int column4x4, uint16_t* cdef_source,
+ ptrdiff_t cdef_stride, bool y_plane,
+ const uint8_t border_columns[kMaxPlanes][256],
+ bool use_border_columns);
+ // Applies cdef for one 64x64 block.
+ template <typename Pixel>
+ void ApplyCdefForOneUnit(uint16_t* cdef_block, int index, int block_width4x4,
+ int block_height4x4, int row4x4_start,
+ int column4x4_start,
+ uint8_t border_columns[2][kMaxPlanes][256],
+ bool use_border_columns[2][2]);
+ // Helper function used by ApplyCdefForOneSuperBlockRow to avoid some code
+ // duplication.
+ void ApplyCdefForOneSuperBlockRowHelper(
+ uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+ int row4x4, int block_height4x4);
+ // Applies CDEF filtering for the superblock row starting at |row4x4| with a
+ // height of 4*|sb4x4|.
+ void ApplyCdefForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row);
+ // Worker function used for multi-threaded CDEF.
+ void ApplyCdefWorker(std::atomic<int>* row4x4_atomic);
+ static_assert(std::is_same<decltype(&PostFilter::ApplyCdefWorker),
+ WorkerFunction>::value,
+ "");
+
+ // Functions for the SuperRes filter.
+
+ // Applies super resolution to |src| for |rows[plane]| rows of each plane.
+ // If |line_buffer_row| is larger than or equal to 0, one more row is
+ // processed, using the line buffer indicated by |line_buffer_row| as the
+ // source. If |dst_is_loop_restoration_border| is true, the |dst| pointers
+ // come from |loop_restoration_border_| and the strides are populated from
+ // that buffer.
+ void ApplySuperRes(
+ const std::array<uint8_t*, kMaxPlanes>& src,
+ const std::array<int, kMaxPlanes>& rows, int line_buffer_row,
+ const std::array<uint8_t*, kMaxPlanes>& dst,
+ bool dst_is_loop_restoration_border = false); // Section 7.16.
+ // Applies SuperRes for the superblock row starting at |row4x4| with a height
+ // of 4*|sb4x4|.
+ void ApplySuperResForOneSuperBlockRow(int row4x4, int sb4x4,
+ bool is_last_row);
+ void ApplySuperResThreaded();
+
+ // Functions for the Loop Restoration filter.
+
+ // Notes about Loop Restoration:
+ // (1). The loop restoration processing unit size defaults to 64x64. Only
+ // when the remaining filtering area is smaller than 64x64 is the
+ // processing unit size the actual area size.
+ // For the U/V planes, it is (64 >> subsampling_x) x (64 >> subsampling_y).
+ // (2). The loop restoration unit size can be 64x64, 128x128 or 256x256 for
+ // the Y plane. The unit size for chroma can be the same or half, depending
+ // on subsampling: if either subsampling_x or subsampling_y is one, the
+ // unit size is halved on both the x and y sides.
+ // All loop restoration units have the same size for one plane.
+ // One loop restoration unit may contain multiple processing units, but
+ // they share the same set of loop restoration parameters.
+ // (3). Loop restoration has a row offset, kRestorationUnitOffset = 8. The
+ // height of the first row of loop restoration units and processing units
+ // is shrunk by the offset.
+ // (4). Loop restoration units wrap around the bottom and the right of the
+ // frame if the remaining area is small. The criterion is whether the
+ // number of remaining rows/columns is smaller than half of the loop
+ // restoration unit size (see the sketch below).
+ // For example, if the frame size is 140x140 and the loop restoration unit
+ // size is 128x128, the size of the first loop restoration unit is
+ // 128x(128-8) = 128 columns x 120 rows.
+ // Since 140 - 120 = 20 < 128/2, the remaining 20 rows are folded into the
+ // loop restoration unit. Similarly, the remaining 12 columns are also
+ // folded into the current loop restoration unit. So even though the frame
+ // size is 140x140, there is only one loop restoration unit. If the
+ // processing unit is 64x64, the sizes of the first row of processing units
+ // are 64x56, 64x56 and 12x56, respectively. The second row is 64x64,
+ // 64x64, 12x64. The third row is 64x20, 64x20, 12x20.
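+ // A minimal sketch of the wrap rule in (4), assuming a hypothetical helper
+ // (the real unit count computation lives elsewhere in the decoder):
+ //   int NumRestorationUnits(int size, int unit_size) {
+ //     // A trailing remainder smaller than unit_size / 2 folds into the
+ //     // previous unit; there is always at least one unit.
+ //     return std::max(1, (size + DivideBy2(unit_size)) / unit_size);
+ //   }
+ //   // NumRestorationUnits(140, 128) == 1, matching the example above.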
+
+ // |stride| is shared by |src_buffer| and |dst_buffer|.
+ template <typename Pixel>
+ void ApplyLoopRestorationForOneRow(const Pixel* src_buffer, ptrdiff_t stride,
+ Plane plane, int plane_height,
+ int plane_width, int y, int unit_row,
+ int current_process_unit_height,
+ int plane_unit_size, Pixel* dst_buffer);
+ // Applies loop restoration for the superblock row starting at |row4x4_start|
+ // with a height of 4*|sb4x4|.
+ template <typename Pixel>
+ void ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start, int sb4x4);
+ // Helper function that calls the right variant of
+ // ApplyLoopRestorationForOneSuperBlockRow based on the bitdepth.
+ void ApplyLoopRestoration(int row4x4_start, int sb4x4);
+ // Worker function used for multithreaded Loop Restoration.
+ void ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic);
+ static_assert(std::is_same<decltype(&PostFilter::ApplyLoopRestorationWorker),
+ WorkerFunction>::value,
+ "");
+
+ // The lookup table for picking the deblock filter function, indexed by
+ // deblock filter type.
+ const DeblockFilter deblock_filter_func_[2] = {
+ &PostFilter::VerticalDeblockFilter, &PostFilter::HorizontalDeblockFilter};
+ const ObuFrameHeader& frame_header_;
+ const LoopRestoration& loop_restoration_;
+ const dsp::Dsp& dsp_;
+ const int8_t bitdepth_;
+ const int8_t subsampling_x_[kMaxPlanes];
+ const int8_t subsampling_y_[kMaxPlanes];
+ const int8_t planes_;
+ const int pixel_size_log2_;
+ const uint8_t* const inner_thresh_;
+ const uint8_t* const outer_thresh_;
+ const bool needs_chroma_deblock_;
+ const bool do_cdef_;
+ const bool do_deblock_;
+ const bool do_restoration_;
+ const bool do_superres_;
+ // This stores the deblocking filter levels assuming that the delta is zero.
+ // This will be used by all superblocks whose delta is zero (without having to
+ // recompute them). The dimensions (in order) are: segment_id, level_index
+ // (based on plane and direction), reference_frame and mode_id.
+ uint8_t deblock_filter_levels_[kMaxSegments][kFrameLfCount]
+ [kNumReferenceFrameTypes][2];
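+ // Example access (illustrative): the level for the U plane's vertical
+ // deblock filter would be read as
+ //   deblock_filter_levels_[segment_id]
+ //       [kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeVertical]]
+ //       [reference_frame][mode_id];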
+ // Stores the SuperRes info for the frame.
+ struct {
+ int upscaled_width;
+ int initial_subpixel_x;
+ int step;
+ } super_res_info_[kMaxPlanes];
+ const Array2D<int8_t>& cdef_index_;
+ const Array2D<uint8_t>& cdef_skip_;
+ const Array2D<TransformSize>& inter_transform_sizes_;
+ LoopRestorationInfo* const restoration_info_;
+ uint8_t* const superres_coefficients_[kNumPlaneTypes];
+ // Line buffer used by multi-threaded ApplySuperRes().
+ // In the multi-threaded case, this buffer will store the last downscaled row
+ // input of each thread to avoid overwrites by the first upscaled row output
+ // of the thread below it.
+ YuvBuffer& superres_line_buffer_;
+ const BlockParametersHolder& block_parameters_;
+ // Frame buffer to hold cdef filtered frame.
+ YuvBuffer cdef_filtered_buffer_;
+ // Input frame buffer.
+ YuvBuffer& frame_buffer_;
+ // A view into |frame_buffer_| that points to the input and output of the
+ // deblocking process.
+ uint8_t* source_buffer_[kMaxPlanes];
+ // A view into |frame_buffer_| that points to the output of the CDEF filtered
+ // planes (to facilitate in-place CDEF filtering).
+ uint8_t* cdef_buffer_[kMaxPlanes];
+ // A view into |frame_buffer_| that points to the planes after the SuperRes
+ // filter is applied (to facilitate in-place SuperRes).
+ uint8_t* superres_buffer_[kMaxPlanes];
+ // A view into |frame_buffer_| that points to the output of the Loop Restored
+ // planes (to facilitate in-place Loop Restoration).
+ uint8_t* loop_restoration_buffer_[kMaxPlanes];
+ YuvBuffer& cdef_border_;
+ // Buffer used to store the border pixels that are necessary for loop
+ // restoration. This buffer will store 4 rows for every 64x64 block (4 rows
+ // for every 32x32 for chroma with subsampling). The indices of the rows that
+ // are stored are specified in |kLoopRestorationBorderRows|. The first 4
+ // rows of this buffer are never populated and never used.
+ // This buffer is used only when both of the following conditions are true:
+ // (1). Loop Restoration is on.
+ // (2). Cdef is on, or multi-threading is enabled for post filter.
+ YuvBuffer& loop_restoration_border_;
+ ThreadPool* const thread_pool_;
+
+ // Tracks the progress of the post filters.
+ int progress_row_ = -1;
+
+ // A block buffer to hold the input that is converted to uint16_t before
+ // cdef filtering. Only used in the single-threaded case. The Y plane is
+ // processed separately; the U and V planes are processed together, so it is
+ // sufficient for this buffer to accommodate 2 planes at a time.
+ uint16_t cdef_block_[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
+
+ template <int bitdepth, typename Pixel>
+ friend class PostFilterSuperResTest;
+
+ template <int bitdepth, typename Pixel>
+ friend class PostFilterHelperFuncTest;
+};
+
+extern template void PostFilter::ExtendFrame<uint8_t>(uint8_t* frame_start,
+ int width, int height,
+ ptrdiff_t stride,
+ int left, int right,
+ int top, int bottom);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+extern template void PostFilter::ExtendFrame<uint16_t>(uint16_t* frame_start,
+ int width, int height,
+ ptrdiff_t stride,
+ int left, int right,
+ int top, int bottom);
+#endif
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_POST_FILTER_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <cassert>
+
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kStep64x64 = 16; // =64/4.
+constexpr int kCdefSkip = 8;
+
+constexpr uint8_t kCdefUvDirection[2][2][8] = {
+ {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}},
+ {{7, 0, 2, 4, 5, 6, 6, 6}, {0, 1, 2, 3, 4, 5, 6, 7}}};
+
+constexpr int kCdefBorderRows[2][4] = {{0, 1, 62, 63}, {0, 1, 30, 31}};
+
+template <typename Pixel>
+void CopyRowForCdef(const Pixel* src, int block_width, int unit_width,
+ bool is_frame_left, bool is_frame_right,
+ uint16_t* const dst, const Pixel* left_border = nullptr) {
+ if (sizeof(src[0]) == sizeof(dst[0])) {
+ if (is_frame_left) {
+ Memset(dst - kCdefBorder, kCdefLargeValue, kCdefBorder);
+ } else if (left_border == nullptr) {
+ memcpy(dst - kCdefBorder, src - kCdefBorder,
+ kCdefBorder * sizeof(dst[0]));
+ } else {
+ memcpy(dst - kCdefBorder, left_border, kCdefBorder * sizeof(dst[0]));
+ }
+ memcpy(dst, src, block_width * sizeof(dst[0]));
+ if (is_frame_right) {
+ Memset(dst + block_width, kCdefLargeValue,
+ unit_width + kCdefBorder - block_width);
+ } else {
+ memcpy(dst + block_width, src + block_width,
+ (unit_width + kCdefBorder - block_width) * sizeof(dst[0]));
+ }
+ return;
+ }
+ if (is_frame_left) {
+ for (int x = -kCdefBorder; x < 0; ++x) {
+ dst[x] = static_cast<uint16_t>(kCdefLargeValue);
+ }
+ } else if (left_border == nullptr) {
+ for (int x = -kCdefBorder; x < 0; ++x) {
+ dst[x] = src[x];
+ }
+ } else {
+ for (int x = -kCdefBorder; x < 0; ++x) {
+ dst[x] = left_border[x + kCdefBorder];
+ }
+ }
+ for (int x = 0; x < block_width; ++x) {
+ dst[x] = src[x];
+ }
+ for (int x = block_width; x < unit_width + kCdefBorder; ++x) {
+ dst[x] = is_frame_right ? static_cast<uint16_t>(kCdefLargeValue) : src[x];
+ }
+}
+
+// GCC 13.x will report a false positive from the call to
+// ApplyCdefForOneSuperBlockRowHelper() with a nullptr in
+// ApplyCdefForOneSuperBlockRow(). The call to CopyPixels() in
+// ApplyCdefForOneUnit() is only made when thread_pool_ != nullptr and
+// border_columns[][] is a valid pointer.
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
+// For |height| rows, copy |width| pixels of size |pixel_size| from |src| to
+// |dst|.
+void CopyPixels(const uint8_t* src, int src_stride, uint8_t* dst,
+ int dst_stride, int width, int height, size_t pixel_size) {
+ assert(src != nullptr);
+ assert(dst != nullptr);
+ assert(height > 0);
+ int y = height;
+ do {
+ memcpy(dst, src, width * pixel_size);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--y != 0);
+}
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+
+} // namespace
+
+void PostFilter::SetupCdefBorder(int row4x4) {
+ assert(row4x4 >= 0);
+ assert(DoCdef());
+ int plane = kPlaneY;
+ do {
+ const ptrdiff_t src_stride = frame_buffer_.stride(plane);
+ const ptrdiff_t dst_stride = cdef_border_.stride(plane);
+ const int row_offset = DivideBy4(row4x4);
+ const int num_pixels = SubsampledValue(
+ MultiplyBy4(frame_header_.columns4x4), subsampling_x_[plane]);
+ const int row_width = num_pixels << pixel_size_log2_;
+ const int plane_height = SubsampledValue(MultiplyBy4(frame_header_.rows4x4),
+ subsampling_y_[plane]);
+ for (int i = 0; i < 4; ++i) {
+ const int row = kCdefBorderRows[subsampling_y_[plane]][i];
+ const int absolute_row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ if (absolute_row >= plane_height) break;
+ const uint8_t* src =
+ GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+ row * src_stride;
+ uint8_t* dst = cdef_border_.data(plane) + dst_stride * (row_offset + i);
+ memcpy(dst, src, row_width);
+ }
+ } while (++plane < planes_);
+}
+
+template <typename Pixel>
+void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4,
+ int row4x4, int column4x4,
+ uint16_t* cdef_source, ptrdiff_t cdef_stride,
+ const bool y_plane,
+ const uint8_t border_columns[kMaxPlanes][256],
+ bool use_border_columns) {
+ assert(y_plane || planes_ == kMaxPlanes);
+ const int max_planes = y_plane ? 1 : kMaxPlanes;
+ const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU];
+ const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU];
+ const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
+ const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
+ const int plane_width = SubsampledValue(frame_header_.width, subsampling_x);
+ const int plane_height = SubsampledValue(frame_header_.height, subsampling_y);
+ const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
+ const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
+ // unit_width and unit_height are the same as block_width and block_height
+ // unless the unit reaches the frame boundary, where block_width < 64 or
+ // block_height < 64. unit_width and unit_height guarantee that we build
+ // blocks on a multiple of 8.
+ const int unit_width = Align(block_width, 8 >> subsampling_x);
+ const int unit_height = Align(block_height, 8 >> subsampling_y);
+ const bool is_frame_left = column4x4 == 0;
+ const bool is_frame_right = start_x + block_width >= plane_width;
+ const bool is_frame_top = row4x4 == 0;
+ const bool is_frame_bottom = start_y + block_height >= plane_height;
+ const int y_offset = is_frame_top ? 0 : kCdefBorder;
+ const int cdef_border_row_offset = DivideBy4(row4x4) - (is_frame_top ? 0 : 2);
+
+ for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) {
+ uint16_t* cdef_src = cdef_source + static_cast<int>(plane == kPlaneV) *
+ kCdefUnitSizeWithBorders *
+ kCdefUnitSizeWithBorders;
+ const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
+ const Pixel* src_buffer =
+ reinterpret_cast<const Pixel*>(source_buffer_[plane]) +
+ (start_y - y_offset) * src_stride + start_x;
+ const int cdef_border_stride = cdef_border_.stride(plane) / sizeof(Pixel);
+ const Pixel* cdef_border =
+ (thread_pool_ == nullptr)
+ ? nullptr
+ : reinterpret_cast<const Pixel*>(cdef_border_.data(plane)) +
+ cdef_border_row_offset * cdef_border_stride + start_x;
+
+ // All the copying code below uses negative indices to populate the left
+ // border, so the starting point is offset by kCdefBorder.
+ cdef_src += kCdefBorder;
+
+ // Copy the top 2 rows as follows:
+ // If is_frame_top is true, both rows are set to kCdefLargeValue.
+ // Otherwise:
+ // If multi-threaded filtering is off, the rows are copied from
+ // |src_buffer|.
+ // Otherwise, the rows are copied from |cdef_border|.
+ if (is_frame_top) {
+ for (int y = 0; y < kCdefBorder; ++y) {
+ Memset(cdef_src - kCdefBorder, kCdefLargeValue,
+ unit_width + 2 * kCdefBorder);
+ cdef_src += cdef_stride;
+ }
+ } else {
+ const Pixel* top_border =
+ (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+ const int top_border_stride =
+ (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
+ for (int y = 0; y < kCdefBorder; ++y) {
+ CopyRowForCdef(top_border, block_width, unit_width, is_frame_left,
+ is_frame_right, cdef_src);
+ top_border += top_border_stride;
+ cdef_src += cdef_stride;
+ // We need to increment |src_buffer| and |cdef_border| in this loop to
+ // set them up for the subsequent loops below.
+ src_buffer += src_stride;
+ cdef_border += cdef_border_stride;
+ }
+ }
+
+ // Copy the body as follows:
+ // If multi-threaded filtering is off or if is_frame_bottom is true, all
+ // the rows are copied from |src_buffer|.
+ // Otherwise, the first |block_height| - kCdefBorder rows are copied from
+ // |src_buffer| and the last kCdefBorder rows are copied from |cdef_border|.
+ int y = block_height;
+ const int y_threshold =
+ (thread_pool_ == nullptr || is_frame_bottom) ? 0 : kCdefBorder;
+ const Pixel* left_border =
+ (thread_pool_ == nullptr || !use_border_columns)
+ ? nullptr
+ : reinterpret_cast<const Pixel*>(border_columns[plane]);
+ do {
+ CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
+ is_frame_right, cdef_src, left_border);
+ cdef_src += cdef_stride;
+ src_buffer += src_stride;
+ if (left_border != nullptr) left_border += kCdefBorder;
+ } while (--y != y_threshold);
+
+ if (y > 0) {
+ assert(y == kCdefBorder);
+ // |cdef_border| now points to the top 2 rows of the current block. For
+ // the next loop, we need it to point to the bottom 2 rows of the
+ // current block. So increment it by 2 rows.
+ cdef_border += MultiplyBy2(cdef_border_stride);
+ for (int i = 0; i < kCdefBorder; ++i) {
+ CopyRowForCdef(cdef_border, block_width, unit_width, is_frame_left,
+ is_frame_right, cdef_src);
+ cdef_src += cdef_stride;
+ cdef_border += cdef_border_stride;
+ }
+ }
+
+ // Copy the bottom 2 rows as follows:
+ // If is_frame_bottom is true, both rows are set to kCdefLargeValue.
+ // Otherwise:
+ // If multi-threaded filtering is off, the rows are copied from
+ // |src_buffer|.
+ // Otherwise, the rows are copied from |cdef_border|.
+ y = 0;
+ if (is_frame_bottom) {
+ do {
+ Memset(cdef_src - kCdefBorder, kCdefLargeValue,
+ unit_width + 2 * kCdefBorder);
+ cdef_src += cdef_stride;
+ } while (++y < kCdefBorder + unit_height - block_height);
+ } else {
+ const Pixel* bottom_border =
+ (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+ const int bottom_border_stride =
+ (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
+ do {
+ CopyRowForCdef(bottom_border, block_width, unit_width, is_frame_left,
+ is_frame_right, cdef_src);
+ bottom_border += bottom_border_stride;
+ cdef_src += cdef_stride;
+ } while (++y < kCdefBorder + unit_height - block_height);
+ }
+ }
+}
+
+template <typename Pixel>
+void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index,
+ const int block_width4x4,
+ const int block_height4x4,
+ const int row4x4_start,
+ const int column4x4_start,
+ uint8_t border_columns[2][kMaxPlanes][256],
+ bool use_border_columns[2][2]) {
+ // Cdef operates in 8x8 blocks (4x4 for chroma with subsampling).
+ static constexpr int kStep = 8;
+ static constexpr int kStep4x4 = 2;
+
+ int cdef_buffer_row_base_stride[kMaxPlanes];
+ uint8_t* cdef_buffer_row_base[kMaxPlanes];
+ int src_buffer_row_base_stride[kMaxPlanes];
+ const uint8_t* src_buffer_row_base[kMaxPlanes];
+ const uint16_t* cdef_src_row_base[kMaxPlanes];
+ int cdef_src_row_base_stride[kMaxPlanes];
+ int column_step[kMaxPlanes];
+ assert(planes_ == kMaxPlanesMonochrome || planes_ == kMaxPlanes);
+ int plane = kPlaneY;
+ do {
+ cdef_buffer_row_base[plane] =
+ GetCdefBuffer(static_cast<Plane>(plane), row4x4_start, column4x4_start);
+ cdef_buffer_row_base_stride[plane] =
+ frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+ src_buffer_row_base[plane] = GetSourceBuffer(static_cast<Plane>(plane),
+ row4x4_start, column4x4_start);
+ src_buffer_row_base_stride[plane] =
+ frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+ cdef_src_row_base[plane] =
+ cdef_block +
+ static_cast<int>(plane == kPlaneV) * kCdefUnitSizeWithBorders *
+ kCdefUnitSizeWithBorders +
+ kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
+ cdef_src_row_base_stride[plane] =
+ kCdefUnitSizeWithBorders * (kStep >> subsampling_y_[plane]);
+ column_step[plane] = (kStep >> subsampling_x_[plane]) * sizeof(Pixel);
+ } while (++plane < planes_);
+
+ // |border_columns| contains two buffers. In each call to this function, one
+ // of them is used as the "destination" of the current call and the other as
+ // the "source" (which was the "destination" of the previous call). We use
+ // the src_index to read the borders that were backed up in the previous
+ // call, and the dst_index to back up the borders to be used in the next
+ // call.
+ const int border_columns_src_index = DivideBy16(column4x4_start) & 1;
+ const int border_columns_dst_index = border_columns_src_index ^ 1;
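+ // For example (illustrative): a 64x64 cdef unit spans 16 4x4 columns, so
+ // DivideBy16(column4x4_start) is the unit index along the row. Unit 0
+ // reads border_columns[0] and writes border_columns[1], unit 1 reads
+ // border_columns[1] and writes border_columns[0], and so on.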
+
+ if (index == -1) {
+ if (thread_pool_ == nullptr) {
+ int plane = kPlaneY;
+ do {
+ CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+ cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+ MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+ MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+ sizeof(Pixel));
+ } while (++plane < planes_);
+ }
+ use_border_columns[border_columns_dst_index][0] = false;
+ use_border_columns[border_columns_dst_index][1] = false;
+ return;
+ }
+
+ const bool is_frame_right =
+ MultiplyBy4(column4x4_start + block_width4x4) >= frame_header_.width;
+ if (!is_frame_right && thread_pool_ != nullptr) {
+ // Backup the last 2 columns for use in the next iteration.
+ use_border_columns[border_columns_dst_index][0] = true;
+ const uint8_t* src_line =
+ GetSourceBuffer(kPlaneY, row4x4_start,
+ column4x4_start + block_width4x4) -
+ kCdefBorder * sizeof(Pixel);
+ assert(border_columns != nullptr);
+ CopyPixels(src_line, frame_buffer_.stride(kPlaneY),
+ border_columns[border_columns_dst_index][kPlaneY],
+ kCdefBorder * sizeof(Pixel), kCdefBorder,
+ MultiplyBy4(block_height4x4), sizeof(Pixel));
+ }
+
+ PrepareCdefBlock<Pixel>(
+ block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+ cdef_block, kCdefUnitSizeWithBorders, true,
+ (border_columns != nullptr) ? border_columns[border_columns_src_index]
+ : nullptr,
+ use_border_columns[border_columns_src_index][0]);
+
+ // Stored direction used during the u/v pass. If bit 3 is set, then the
+ // block is a skip.
+ uint8_t direction_y[8 * 8];
+ int y_index = 0;
+
+ const uint8_t y_primary_strength =
+ frame_header_.cdef.y_primary_strength[index];
+ const uint8_t y_secondary_strength =
+ frame_header_.cdef.y_secondary_strength[index];
+ // y_strength_index is 0 when both the primary and secondary strengths are
+ // non-zero, 1 for primary only, and 2 for secondary only. It is updated
+ // below once the variance-adjusted primary strength is known.
+ int y_strength_index = static_cast<int>(y_secondary_strength == 0);
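+ // The final index computed below is
+ //   strength_index = y_strength_index | (primary_strength == 0) << 1,
+ // which yields (illustrative truth table):
+ //   primary != 0, secondary != 0 -> 0
+ //   primary != 0, secondary == 0 -> 1
+ //   primary == 0, secondary != 0 -> 2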
+
+ const bool compute_direction_and_variance =
+ (y_primary_strength | frame_header_.cdef.uv_primary_strength[index]) != 0;
+ const uint8_t* skip_row =
+ &cdef_skip_[row4x4_start >> 1][column4x4_start >> 4];
+ const int skip_stride = cdef_skip_.columns();
+ int row4x4 = row4x4_start;
+ do {
+ uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY];
+ const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY];
+ const uint16_t* cdef_src_base = cdef_src_row_base[kPlaneY];
+ int column4x4 = column4x4_start;
+
+ if (*skip_row == 0) {
+ for (int i = 0; i < DivideBy2(block_width4x4); ++i, ++y_index) {
+ direction_y[y_index] = kCdefSkip;
+ }
+ if (thread_pool_ == nullptr) {
+ CopyPixels(src_buffer_base, frame_buffer_.stride(kPlaneY),
+ cdef_buffer_base, frame_buffer_.stride(kPlaneY), 64, kStep,
+ sizeof(Pixel));
+ }
+ } else {
+ do {
+ const int block_width = kStep;
+ const int block_height = kStep;
+ const int cdef_stride = frame_buffer_.stride(kPlaneY);
+ uint8_t* const cdef_buffer = cdef_buffer_base;
+ const uint16_t* const cdef_src = cdef_src_base;
+ const int src_stride = frame_buffer_.stride(kPlaneY);
+ const uint8_t* const src_buffer = src_buffer_base;
+
+ const uint8_t skip_shift = (column4x4 >> 1) & 0x7;
+ const bool skip = ((*skip_row >> skip_shift) & 1) == 0;
+ if (skip) { // No cdef filtering.
+ direction_y[y_index] = kCdefSkip;
+ if (thread_pool_ == nullptr) {
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, sizeof(Pixel));
+ }
+ } else {
+ // Zero out residual skip flag.
+ direction_y[y_index] = 0;
+
+ int variance = 0;
+ if (compute_direction_and_variance) {
+ if (thread_pool_ == nullptr ||
+ row4x4 + kStep4x4 < row4x4_start + block_height4x4) {
+ dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
+ &variance);
+ } else if (sizeof(Pixel) == 2) {
+ dsp_.cdef_direction(cdef_src, kCdefUnitSizeWithBorders * 2,
+ &direction_y[y_index], &variance);
+ } else {
+ // If we are in the last row4x4 for this unit, then the last two
+ // input rows have to come from |cdef_border_|. Since we already
+ // have |cdef_src| populated correctly, use that as the input
+ // for the direction process.
+ uint8_t direction_src[8][8];
+ const uint16_t* cdef_src_line = cdef_src;
+ for (auto& direction_src_line : direction_src) {
+ for (int i = 0; i < 8; ++i) {
+ direction_src_line[i] = cdef_src_line[i];
+ }
+ cdef_src_line += kCdefUnitSizeWithBorders;
+ }
+ dsp_.cdef_direction(direction_src, 8, &direction_y[y_index],
+ &variance);
+ }
+ }
+ const int direction =
+ (y_primary_strength == 0) ? 0 : direction_y[y_index];
+ const int variance_strength =
+ ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12)
+ : 0;
+ const uint8_t primary_strength =
+ (variance != 0)
+ ? (y_primary_strength * (4 + variance_strength) + 8) >> 4
+ : 0;
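+ // Worked example (illustrative): with y_primary_strength = 15 and
+ // variance_strength = 0, primary_strength = (15 * 4 + 8) >> 4 = 4; with
+ // variance_strength = 12, primary_strength = (15 * 16 + 8) >> 4 = 15, so
+ // high-variance blocks keep (nearly) the full primary strength.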
+ if ((primary_strength | y_secondary_strength) == 0) {
+ if (thread_pool_ == nullptr) {
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, sizeof(Pixel));
+ }
+ } else {
+ const int strength_index =
+ y_strength_index |
+ (static_cast<int>(primary_strength == 0) << 1);
+ dsp_.cdef_filters[1][strength_index](
+ cdef_src, kCdefUnitSizeWithBorders, block_height,
+ primary_strength, y_secondary_strength,
+ frame_header_.cdef.damping, direction, cdef_buffer,
+ cdef_stride);
+ }
+ }
+ cdef_buffer_base += column_step[kPlaneY];
+ src_buffer_base += column_step[kPlaneY];
+ cdef_src_base += column_step[kPlaneY] / sizeof(Pixel);
+
+ column4x4 += kStep4x4;
+ y_index++;
+ } while (column4x4 < column4x4_start + block_width4x4);
+ }
+
+ cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY];
+ src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY];
+ cdef_src_row_base[kPlaneY] += cdef_src_row_base_stride[kPlaneY];
+ skip_row += skip_stride;
+ row4x4 += kStep4x4;
+ } while (row4x4 < row4x4_start + block_height4x4);
+
+ if (planes_ == kMaxPlanesMonochrome) {
+ return;
+ }
+
+ const uint8_t uv_primary_strength =
+ frame_header_.cdef.uv_primary_strength[index];
+ const uint8_t uv_secondary_strength =
+ frame_header_.cdef.uv_secondary_strength[index];
+
+ if ((uv_primary_strength | uv_secondary_strength) == 0) {
+ if (thread_pool_ == nullptr) {
+ for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+ CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+ cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+ MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+ MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+ sizeof(Pixel));
+ }
+ }
+ use_border_columns[border_columns_dst_index][1] = false;
+ return;
+ }
+
+ if (!is_frame_right && thread_pool_ != nullptr) {
+ use_border_columns[border_columns_dst_index][1] = true;
+ for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+ // Backup the last 2 columns for use in the next iteration.
+ const uint8_t* src_line =
+ GetSourceBuffer(static_cast<Plane>(plane), row4x4_start,
+ column4x4_start + block_width4x4) -
+ kCdefBorder * sizeof(Pixel);
+ CopyPixels(src_line, frame_buffer_.stride(plane),
+ border_columns[border_columns_dst_index][plane],
+ kCdefBorder * sizeof(Pixel), kCdefBorder,
+ MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+ sizeof(Pixel));
+ }
+ }
+
+ PrepareCdefBlock<Pixel>(
+ block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+ cdef_block, kCdefUnitSizeWithBorders, false,
+ (border_columns != nullptr) ? border_columns[border_columns_src_index]
+ : nullptr,
+ use_border_columns[border_columns_src_index][1]);
+
+ // uv_strength_index is 0 for both primary and secondary strengths being
+ // non-zero, 1 for primary only, 2 for secondary only.
+ const int uv_strength_index =
+ (static_cast<int>(uv_primary_strength == 0) << 1) |
+ static_cast<int>(uv_secondary_strength == 0);
+ for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+ const int8_t subsampling_x = subsampling_x_[plane];
+ const int8_t subsampling_y = subsampling_y_[plane];
+ const int block_width = kStep >> subsampling_x;
+ const int block_height = kStep >> subsampling_y;
+ int row4x4 = row4x4_start;
+
+ y_index = 0;
+ do {
+ uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane];
+ const uint8_t* src_buffer_base = src_buffer_row_base[plane];
+ const uint16_t* cdef_src_base = cdef_src_row_base[plane];
+ int column4x4 = column4x4_start;
+ do {
+ const int cdef_stride = frame_buffer_.stride(plane);
+ uint8_t* const cdef_buffer = cdef_buffer_base;
+ const int src_stride = frame_buffer_.stride(plane);
+ const uint8_t* const src_buffer = src_buffer_base;
+ const uint16_t* const cdef_src = cdef_src_base;
+ const bool skip = (direction_y[y_index] & kCdefSkip) != 0;
+ int dual_cdef = 0;
+
+ if (skip) { // No cdef filtering.
+ if (thread_pool_ == nullptr) {
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, sizeof(Pixel));
+ }
+ } else {
+ // Make sure block pair is not out of bounds.
+ if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) {
+ // Enable dual processing if subsampling_x is 1.
+ dual_cdef = subsampling_x;
+ }
+
+ int direction = (uv_primary_strength == 0)
+ ? 0
+ : kCdefUvDirection[subsampling_x][subsampling_y]
+ [direction_y[y_index]];
+
+ if (dual_cdef != 0) {
+ if (uv_primary_strength &&
+ direction_y[y_index] != direction_y[y_index + 1]) {
+ // Disable dual processing if the second block of the pair does
+ // not have the same direction.
+ dual_cdef = 0;
+ }
+
+ // Disable dual processing if the second block of the pair is a
+ // skip.
+ if (direction_y[y_index + 1] == kCdefSkip) {
+ dual_cdef = 0;
+ }
+ }
+
+ // Block width is 8 if either dual_cdef is true or subsampling_x == 0.
+ const int width_index = dual_cdef | (subsampling_x ^ 1);
+ dsp_.cdef_filters[width_index][uv_strength_index](
+ cdef_src, kCdefUnitSizeWithBorders, block_height,
+ uv_primary_strength, uv_secondary_strength,
+ frame_header_.cdef.damping - 1, direction, cdef_buffer,
+ cdef_stride);
+ }
+ // When dual_cdef is set, the above cdef_filter() will process 2 blocks,
+ // so adjust the pointers and indexes for 2 blocks.
+ cdef_buffer_base += column_step[plane] << dual_cdef;
+ src_buffer_base += column_step[plane] << dual_cdef;
+ cdef_src_base += (column_step[plane] / sizeof(Pixel)) << dual_cdef;
+ column4x4 += kStep4x4 << dual_cdef;
+ y_index += 1 << dual_cdef;
+ } while (column4x4 < column4x4_start + block_width4x4);
+
+ cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane];
+ src_buffer_row_base[plane] += src_buffer_row_base_stride[plane];
+ cdef_src_row_base[plane] += cdef_src_row_base_stride[plane];
+ row4x4 += kStep4x4;
+ } while (row4x4 < row4x4_start + block_height4x4);
+ }
+}
+
+void PostFilter::ApplyCdefForOneSuperBlockRowHelper(
+ uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+ int row4x4, int block_height4x4) {
+ bool use_border_columns[2][2] = {};
+ const bool non_zero_index = frame_header_.cdef.bits > 0;
+ const int8_t* cdef_index =
+ non_zero_index ? cdef_index_[DivideBy16(row4x4)] : nullptr;
+ int column4x4 = 0;
+ do {
+ const int index = non_zero_index ? *cdef_index++ : 0;
+ const int block_width4x4 =
+ std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ApplyCdefForOneUnit<uint16_t>(cdef_block, index, block_width4x4,
+ block_height4x4, row4x4, column4x4,
+ border_columns, use_border_columns);
+ } else // NOLINT
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ {
+ ApplyCdefForOneUnit<uint8_t>(cdef_block, index, block_width4x4,
+ block_height4x4, row4x4, column4x4,
+ border_columns, use_border_columns);
+ }
+ column4x4 += kStep64x64;
+ } while (column4x4 < frame_header_.columns4x4);
+}
+
+void PostFilter::ApplyCdefForOneSuperBlockRow(int row4x4_start, int sb4x4,
+ bool is_last_row) {
+ assert(row4x4_start >= 0);
+ assert(DoCdef());
+ int row4x4 = row4x4_start;
+ const int row4x4_limit = row4x4_start + sb4x4;
+ do {
+ if (row4x4 >= frame_header_.rows4x4) return;
+
+ // Apply cdef for the last 8 rows of the previous superblock row.
+ // One exception: If the superblock size is 128x128 and is_last_row is true,
+ // then we simply apply cdef for the entire superblock row without any lag.
+ // In that case, apply cdef for the previous superblock row only during the
+ // first iteration (row4x4 == row4x4_start).
+ if (row4x4 > 0 && (!is_last_row || row4x4 == row4x4_start)) {
+ assert(row4x4 >= 16);
+ ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4 - 2, 2);
+ }
+
+ // Apply cdef for the current superblock row. If this is the last
+ // superblock row, we apply cdef for all the rows; otherwise we leave out
+ // the last 8 rows.
+ const int block_height4x4 =
+ std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
+ const int height4x4 = block_height4x4 - (is_last_row ? 0 : 2);
+ if (height4x4 > 0) {
+ ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4,
+ height4x4);
+ }
+ row4x4 += kStep64x64;
+ } while (row4x4 < row4x4_limit);
+}
+
+void PostFilter::ApplyCdefWorker(std::atomic<int>* row4x4_atomic) {
+ int row4x4;
+ uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
+ // Each |border_columns| buffer has to store 64 rows and 2 columns for each
+ // plane. For 10-bit, that is 64 * 2 * 2 = 256 bytes.
+ alignas(kMaxAlignment) uint8_t border_columns[2][kMaxPlanes][256];
+ while ((row4x4 = row4x4_atomic->fetch_add(
+ kStep64x64, std::memory_order_relaxed)) < frame_header_.rows4x4) {
+ const int block_height4x4 =
+ std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
+ ApplyCdefForOneSuperBlockRowHelper(cdef_block, border_columns, row4x4,
+ block_height4x4);
+ }
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <atomic>
+
+#include "src/post_filter.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr uint8_t HevThresh(int level) { return DivideBy16(level); }
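+ // (For reference: HevThresh() is the "high edge variance" threshold of the
+ // deblocking filter, i.e. level >> 4.)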
+
+// GetLoopFilterSize* functions depend on this exact ordering of the
+// LoopFilterSize enums.
+static_assert(dsp::kLoopFilterSize4 == 0, "");
+static_assert(dsp::kLoopFilterSize6 == 1, "");
+static_assert(dsp::kLoopFilterSize8 == 2, "");
+static_assert(dsp::kLoopFilterSize14 == 3, "");
+
+dsp::LoopFilterSize GetLoopFilterSizeY(int filter_length) {
+ // |filter_length| must be a power of 2.
+ assert((filter_length & (filter_length - 1)) == 0);
+ // This code is the branch-free equivalent of:
+ // if (filter_length == 4) return kLoopFilterSize4;
+ // if (filter_length == 8) return kLoopFilterSize8;
+ // return kLoopFilterSize14;
+ return static_cast<dsp::LoopFilterSize>(
+ MultiplyBy2(static_cast<int>(filter_length > 4)) +
+ static_cast<int>(filter_length > 8));
+}
+
+constexpr dsp::LoopFilterSize GetLoopFilterSizeUV(int filter_length) {
+ // For U & V planes, size is kLoopFilterSize4 if |filter_length| is 4,
+ // otherwise size is kLoopFilterSize6.
+ return static_cast<dsp::LoopFilterSize>(filter_length != 4);
+}
+
+bool NonBlockBorderNeedsFilter(const BlockParameters& bp, int filter_id,
+ uint8_t* const level) {
+ if (bp.deblock_filter_level[filter_id] == 0 || (bp.skip && bp.is_inter)) {
+ return false;
+ }
+ *level = bp.deblock_filter_level[filter_id];
+ return true;
+}
+
+// 7.14.5.
+void ComputeDeblockFilterLevelsHelper(
+ const ObuFrameHeader& frame_header, int segment_id, int level_index,
+ const int8_t delta_lf[kFrameLfCount],
+ uint8_t deblock_filter_levels[kNumReferenceFrameTypes][2]) {
+ const int delta = delta_lf[frame_header.delta_lf.multi ? level_index : 0];
+ uint8_t level = Clip3(frame_header.loop_filter.level[level_index] + delta, 0,
+ kMaxLoopFilterValue);
+ const auto feature = static_cast<SegmentFeature>(
+ kSegmentFeatureLoopFilterYVertical + level_index);
+ level =
+ Clip3(level + frame_header.segmentation.feature_data[segment_id][feature],
+ 0, kMaxLoopFilterValue);
+ if (!frame_header.loop_filter.delta_enabled) {
+ static_assert(sizeof(deblock_filter_levels[0][0]) == 1, "");
+ memset(deblock_filter_levels, level, kNumReferenceFrameTypes * 2);
+ return;
+ }
+ assert(frame_header.loop_filter.delta_enabled);
+ const int shift = level >> 5;
+ deblock_filter_levels[kReferenceFrameIntra][0] = Clip3(
+ level +
+ LeftShift(frame_header.loop_filter.ref_deltas[kReferenceFrameIntra],
+ shift),
+ 0, kMaxLoopFilterValue);
+ // deblock_filter_levels[kReferenceFrameIntra][1] is never used, so it does
+ // not have to be populated.
+ for (int reference_frame = kReferenceFrameIntra + 1;
+ reference_frame < kNumReferenceFrameTypes; ++reference_frame) {
+ for (int mode_id = 0; mode_id < 2; ++mode_id) {
+ deblock_filter_levels[reference_frame][mode_id] = Clip3(
+ level +
+ LeftShift(frame_header.loop_filter.ref_deltas[reference_frame] +
+ frame_header.loop_filter.mode_deltas[mode_id],
+ shift),
+ 0, kMaxLoopFilterValue);
+ }
+ }
+}
+
+} // namespace
+
+void PostFilter::ComputeDeblockFilterLevels(
+ const int8_t delta_lf[kFrameLfCount],
+ uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount]
+ [kNumReferenceFrameTypes][2]) const {
+ if (!DoDeblock()) return;
+ const int num_segments =
+ frame_header_.segmentation.enabled ? kMaxSegments : 1;
+ for (int segment_id = 0; segment_id < num_segments; ++segment_id) {
+ int level_index = 0;
+ for (; level_index < 2; ++level_index) {
+ ComputeDeblockFilterLevelsHelper(
+ frame_header_, segment_id, level_index, delta_lf,
+ deblock_filter_levels[segment_id][level_index]);
+ }
+ for (; level_index < kFrameLfCount; ++level_index) {
+ if (frame_header_.loop_filter.level[level_index] != 0) {
+ ComputeDeblockFilterLevelsHelper(
+ frame_header_, segment_id, level_index, delta_lf,
+ deblock_filter_levels[segment_id][level_index]);
+ }
+ }
+ }
+}
+
+bool PostFilter::GetHorizontalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+ uint8_t* level, int* step,
+ int* filter_length) const {
+ *step = kTransformHeight[inter_transform_sizes_[row4x4][column4x4]];
+ if (row4x4 == 0) return false;
+
+ const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4);
+ const int row4x4_prev = row4x4 - 1;
+ assert(row4x4_prev >= 0);
+ const BlockParameters* bp_prev =
+ block_parameters_.Find(row4x4_prev, column4x4);
+
+ if (bp == bp_prev) {
+ // Not a border.
+ if (!NonBlockBorderNeedsFilter(*bp, 1, level)) return false;
+ } else {
+ const uint8_t level_this = bp->deblock_filter_level[1];
+ *level = level_this;
+ if (level_this == 0) {
+ const uint8_t level_prev = bp_prev->deblock_filter_level[1];
+ if (level_prev == 0) return false;
+ *level = level_prev;
+ }
+ }
+ const int step_prev =
+ kTransformHeight[inter_transform_sizes_[row4x4_prev][column4x4]];
+ *filter_length = std::min(*step, step_prev);
+ return true;
+}
+
+void PostFilter::GetHorizontalDeblockFilterEdgeInfoUV(
+ int row4x4, int column4x4, uint8_t* level_u, uint8_t* level_v, int* step,
+ int* filter_length) const {
+ const int subsampling_x = subsampling_x_[kPlaneU];
+ const int subsampling_y = subsampling_y_[kPlaneU];
+ row4x4 = GetDeblockPosition(row4x4, subsampling_y);
+ column4x4 = GetDeblockPosition(column4x4, subsampling_x);
+ const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4);
+ *level_u = 0;
+ *level_v = 0;
+ *step = kTransformHeight[bp->uv_transform_size];
+ if (row4x4 == subsampling_y) {
+ return;
+ }
+
+ bool need_filter_u = frame_header_.loop_filter.level[kPlaneU + 1] != 0;
+ bool need_filter_v = frame_header_.loop_filter.level[kPlaneV + 1] != 0;
+ assert(need_filter_u || need_filter_v);
+ const int filter_id_u =
+ kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeHorizontal];
+ const int filter_id_v =
+ kDeblockFilterLevelIndex[kPlaneV][kLoopFilterTypeHorizontal];
+ const int row4x4_prev = row4x4 - (1 << subsampling_y);
+ assert(row4x4_prev >= 0);
+ const BlockParameters* bp_prev =
+ block_parameters_.Find(row4x4_prev, column4x4);
+
+ if (bp == bp_prev) {
+ // Not a border.
+ const bool skip = bp->skip && bp->is_inter;
+ need_filter_u =
+ need_filter_u && bp->deblock_filter_level[filter_id_u] != 0 && !skip;
+ need_filter_v =
+ need_filter_v && bp->deblock_filter_level[filter_id_v] != 0 && !skip;
+ if (!need_filter_u && !need_filter_v) return;
+ if (need_filter_u) *level_u = bp->deblock_filter_level[filter_id_u];
+ if (need_filter_v) *level_v = bp->deblock_filter_level[filter_id_v];
+ *filter_length = *step;
+ return;
+ }
+
+ // It is a border.
+ if (need_filter_u) {
+ const uint8_t level_u_this = bp->deblock_filter_level[filter_id_u];
+ *level_u = level_u_this;
+ if (level_u_this == 0) {
+ *level_u = bp_prev->deblock_filter_level[filter_id_u];
+ }
+ }
+ if (need_filter_v) {
+ const uint8_t level_v_this = bp->deblock_filter_level[filter_id_v];
+ *level_v = level_v_this;
+ if (level_v_this == 0) {
+ *level_v = bp_prev->deblock_filter_level[filter_id_v];
+ }
+ }
+ const int step_prev = kTransformHeight[bp_prev->uv_transform_size];
+ *filter_length = std::min(*step, step_prev);
+}
+
+bool PostFilter::GetVerticalDeblockFilterEdgeInfo(
+ int row4x4, int column4x4, BlockParameters* const* bp_ptr, uint8_t* level,
+ int* step, int* filter_length) const {
+ const BlockParameters* bp = *bp_ptr;
+ *step = kTransformWidth[inter_transform_sizes_[row4x4][column4x4]];
+ if (column4x4 == 0) return false;
+
+ const int filter_id = 0;
+ const int column4x4_prev = column4x4 - 1;
+ assert(column4x4_prev >= 0);
+ const BlockParameters* bp_prev = *(bp_ptr - 1);
+ if (bp == bp_prev) {
+ // Not a border.
+ if (!NonBlockBorderNeedsFilter(*bp, filter_id, level)) return false;
+ } else {
+ // It is a border.
+ const uint8_t level_this = bp->deblock_filter_level[filter_id];
+ *level = level_this;
+ if (level_this == 0) {
+ const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id];
+ if (level_prev == 0) return false;
+ *level = level_prev;
+ }
+ }
+ const int step_prev =
+ kTransformWidth[inter_transform_sizes_[row4x4][column4x4_prev]];
+ *filter_length = std::min(*step, step_prev);
+ return true;
+}
+
+void PostFilter::GetVerticalDeblockFilterEdgeInfoUV(
+ int column4x4, BlockParameters* const* bp_ptr, uint8_t* level_u,
+ uint8_t* level_v, int* step, int* filter_length) const {
+ const int subsampling_x = subsampling_x_[kPlaneU];
+ column4x4 = GetDeblockPosition(column4x4, subsampling_x);
+ const BlockParameters* bp = *bp_ptr;
+ *level_u = 0;
+ *level_v = 0;
+ *step = kTransformWidth[bp->uv_transform_size];
+ if (column4x4 == subsampling_x) {
+ return;
+ }
+
+ bool need_filter_u = frame_header_.loop_filter.level[kPlaneU + 1] != 0;
+ bool need_filter_v = frame_header_.loop_filter.level[kPlaneV + 1] != 0;
+ assert(need_filter_u || need_filter_v);
+ const int filter_id_u =
+ kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeVertical];
+ const int filter_id_v =
+ kDeblockFilterLevelIndex[kPlaneV][kLoopFilterTypeVertical];
+ const BlockParameters* bp_prev = *(bp_ptr - (ptrdiff_t{1} << subsampling_x));
+
+ if (bp == bp_prev) {
+ // Not a border.
+ const bool skip = bp->skip && bp->is_inter;
+ need_filter_u =
+ need_filter_u && bp->deblock_filter_level[filter_id_u] != 0 && !skip;
+ need_filter_v =
+ need_filter_v && bp->deblock_filter_level[filter_id_v] != 0 && !skip;
+ if (!need_filter_u && !need_filter_v) return;
+ if (need_filter_u) *level_u = bp->deblock_filter_level[filter_id_u];
+ if (need_filter_v) *level_v = bp->deblock_filter_level[filter_id_v];
+ *filter_length = *step;
+ return;
+ }
+
+ // It is a border.
+ if (need_filter_u) {
+ const uint8_t level_u_this = bp->deblock_filter_level[filter_id_u];
+ *level_u = level_u_this;
+ if (level_u_this == 0) {
+ *level_u = bp_prev->deblock_filter_level[filter_id_u];
+ }
+ }
+ if (need_filter_v) {
+ const uint8_t level_v_this = bp->deblock_filter_level[filter_id_v];
+ *level_v = level_v_this;
+ if (level_v_this == 0) {
+ *level_v = bp_prev->deblock_filter_level[filter_id_v];
+ }
+ }
+ const int step_prev = kTransformWidth[bp_prev->uv_transform_size];
+ *filter_length = std::min(*step, step_prev);
+}
+
+void PostFilter::HorizontalDeblockFilter(int row4x4_start, int row4x4_end,
+ int column4x4_start,
+ int column4x4_end) {
+ const int height4x4 = row4x4_end - row4x4_start;
+ const int width4x4 = column4x4_end - column4x4_start;
+ if (height4x4 <= 0 || width4x4 <= 0) return;
+
+ const int column_step = 1;
+ const int src_step = 4 << pixel_size_log2_;
+ const ptrdiff_t src_stride = frame_buffer_.stride(kPlaneY);
+ uint8_t* src = GetSourceBuffer(kPlaneY, row4x4_start, column4x4_start);
+ int row_step;
+ uint8_t level;
+ int filter_length;
+
+ const int width = frame_header_.width;
+ const int height = frame_header_.height;
+ for (int column4x4 = 0;
+ column4x4 < width4x4 && MultiplyBy4(column4x4_start + column4x4) < width;
+ column4x4 += column_step, src += src_step) {
+ uint8_t* src_row = src;
+ for (int row4x4 = 0;
+ row4x4 < height4x4 && MultiplyBy4(row4x4_start + row4x4) < height;
+ row4x4 += row_step) {
+ const bool need_filter = GetHorizontalDeblockFilterEdgeInfo(
+ row4x4_start + row4x4, column4x4_start + column4x4, &level, &row_step,
+ &filter_length);
+ if (need_filter) {
+ assert(level > 0 && level <= kMaxLoopFilterValue);
+ const dsp::LoopFilterSize size = GetLoopFilterSizeY(filter_length);
+ dsp_.loop_filters[size][kLoopFilterTypeHorizontal](
+ src_row, src_stride, outer_thresh_[level], inner_thresh_[level],
+ HevThresh(level));
+ }
+ src_row += row_step * src_stride;
+ row_step = DivideBy4(row_step);
+ }
+ }
+
+ if (needs_chroma_deblock_) {
+ const int8_t subsampling_x = subsampling_x_[kPlaneU];
+ const int8_t subsampling_y = subsampling_y_[kPlaneU];
+ const int column_step = 1 << subsampling_x;
+ const ptrdiff_t src_stride_u = frame_buffer_.stride(kPlaneU);
+ const ptrdiff_t src_stride_v = frame_buffer_.stride(kPlaneV);
+ uint8_t* src_u = GetSourceBuffer(kPlaneU, row4x4_start, column4x4_start);
+ uint8_t* src_v = GetSourceBuffer(kPlaneV, row4x4_start, column4x4_start);
+ int row_step;
+ uint8_t level_u;
+ uint8_t level_v;
+ int filter_length;
+
+ for (int column4x4 = 0; column4x4 < width4x4 &&
+ MultiplyBy4(column4x4_start + column4x4) < width;
+ column4x4 += column_step, src_u += src_step, src_v += src_step) {
+ uint8_t* src_row_u = src_u;
+ uint8_t* src_row_v = src_v;
+ for (int row4x4 = 0;
+ row4x4 < height4x4 && MultiplyBy4(row4x4_start + row4x4) < height;
+ row4x4 += row_step) {
+ GetHorizontalDeblockFilterEdgeInfoUV(
+ row4x4_start + row4x4, column4x4_start + column4x4, &level_u,
+ &level_v, &row_step, &filter_length);
+ if (level_u != 0) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+ dsp_.loop_filters[size][kLoopFilterTypeHorizontal](
+ src_row_u, src_stride_u, outer_thresh_[level_u],
+ inner_thresh_[level_u], HevThresh(level_u));
+ }
+ if (level_v != 0) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+ dsp_.loop_filters[size][kLoopFilterTypeHorizontal](
+ src_row_v, src_stride_v, outer_thresh_[level_v],
+ inner_thresh_[level_v], HevThresh(level_v));
+ }
+ src_row_u += row_step * src_stride_u;
+ src_row_v += row_step * src_stride_v;
+ row_step = DivideBy4(row_step << subsampling_y);
+ }
+ }
+ }
+}
+
+void PostFilter::VerticalDeblockFilter(int row4x4_start, int row4x4_end,
+ int column4x4_start, int column4x4_end) {
+ const int height4x4 = row4x4_end - row4x4_start;
+ const int width4x4 = column4x4_end - column4x4_start;
+ if (height4x4 <= 0 || width4x4 <= 0) return;
+
+ const ptrdiff_t row_stride = MultiplyBy4(frame_buffer_.stride(kPlaneY));
+ const ptrdiff_t src_stride = frame_buffer_.stride(kPlaneY);
+ uint8_t* src = GetSourceBuffer(kPlaneY, row4x4_start, column4x4_start);
+ int column_step;
+ uint8_t level;
+ int filter_length;
+
+ BlockParameters* const* bp_row_base =
+ block_parameters_.Address(row4x4_start, column4x4_start);
+ const int bp_stride = block_parameters_.columns4x4();
+ const int column_step_shift = pixel_size_log2_;
+ const int width = frame_header_.width;
+ const int height = frame_header_.height;
+ for (int row4x4 = 0;
+ row4x4 < height4x4 && MultiplyBy4(row4x4_start + row4x4) < height;
+ ++row4x4, src += row_stride, bp_row_base += bp_stride) {
+ uint8_t* src_row = src;
+ BlockParameters* const* bp = bp_row_base;
+ for (int column4x4 = 0; column4x4 < width4x4 &&
+ MultiplyBy4(column4x4_start + column4x4) < width;
+ column4x4 += column_step, bp += column_step) {
+ const bool need_filter = GetVerticalDeblockFilterEdgeInfo(
+ row4x4_start + row4x4, column4x4_start + column4x4, bp, &level,
+ &column_step, &filter_length);
+ if (need_filter) {
+ assert(level > 0 && level <= kMaxLoopFilterValue);
+ const dsp::LoopFilterSize size = GetLoopFilterSizeY(filter_length);
+ dsp_.loop_filters[size][kLoopFilterTypeVertical](
+ src_row, src_stride, outer_thresh_[level], inner_thresh_[level],
+ HevThresh(level));
+ }
+ src_row += column_step << column_step_shift;
+ column_step = DivideBy4(column_step);
+ }
+ }
+
+ if (needs_chroma_deblock_) {
+ const int8_t subsampling_x = subsampling_x_[kPlaneU];
+ const int8_t subsampling_y = subsampling_y_[kPlaneU];
+ const int row_step = 1 << subsampling_y;
+ uint8_t* src_u = GetSourceBuffer(kPlaneU, row4x4_start, column4x4_start);
+ uint8_t* src_v = GetSourceBuffer(kPlaneV, row4x4_start, column4x4_start);
+ const ptrdiff_t src_stride_u = frame_buffer_.stride(kPlaneU);
+ const ptrdiff_t src_stride_v = frame_buffer_.stride(kPlaneV);
+ const ptrdiff_t row_stride_u = MultiplyBy4(frame_buffer_.stride(kPlaneU));
+ const ptrdiff_t row_stride_v = MultiplyBy4(frame_buffer_.stride(kPlaneV));
+ const LoopFilterType type = kLoopFilterTypeVertical;
+ int column_step;
+ uint8_t level_u;
+ uint8_t level_v;
+ int filter_length;
+
+ BlockParameters* const* bp_row_base = block_parameters_.Address(
+ GetDeblockPosition(row4x4_start, subsampling_y),
+ GetDeblockPosition(column4x4_start, subsampling_x));
+ const int bp_stride = block_parameters_.columns4x4() << subsampling_y;
+ for (int row4x4 = 0;
+ row4x4 < height4x4 && MultiplyBy4(row4x4_start + row4x4) < height;
+ row4x4 += row_step, src_u += row_stride_u, src_v += row_stride_v,
+ bp_row_base += bp_stride) {
+ uint8_t* src_row_u = src_u;
+ uint8_t* src_row_v = src_v;
+ BlockParameters* const* bp = bp_row_base;
+ for (int column4x4 = 0; column4x4 < width4x4 &&
+ MultiplyBy4(column4x4_start + column4x4) < width;
+ column4x4 += column_step, bp += column_step) {
+ GetVerticalDeblockFilterEdgeInfoUV(column4x4_start + column4x4, bp,
+ &level_u, &level_v, &column_step,
+ &filter_length);
+ if (level_u != 0) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+ dsp_.loop_filters[size][type](
+ src_row_u, src_stride_u, outer_thresh_[level_u],
+ inner_thresh_[level_u], HevThresh(level_u));
+ }
+ if (level_v != 0) {
+ const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+ dsp_.loop_filters[size][type](
+ src_row_v, src_stride_v, outer_thresh_[level_v],
+ inner_thresh_[level_v], HevThresh(level_v));
+ }
+ src_row_u += column_step << column_step_shift;
+ src_row_v += column_step << column_step_shift;
+ column_step = DivideBy4(column_step << subsampling_x);
+ }
+ }
+ }
+}
+
+template <LoopFilterType loop_filter_type>
+void PostFilter::DeblockFilterWorker(std::atomic<int>* row4x4_atomic) {
+ const int rows4x4 = frame_header_.rows4x4;
+ const int columns4x4 = frame_header_.columns4x4;
+ int row4x4;
+ while ((row4x4 = row4x4_atomic->fetch_add(
+ kNum4x4InLoopFilterUnit, std::memory_order_relaxed)) < rows4x4) {
+ (this->*deblock_filter_func_[loop_filter_type])(
+ row4x4, row4x4 + kNum4x4InLoopFilterUnit, 0, columns4x4);
+ }
+}
+
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>(
+ std::atomic<int>* row4x4_atomic);
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>(
+ std::atomic<int>* row4x4_atomic);
+
+void PostFilter::ApplyDeblockFilter(LoopFilterType loop_filter_type,
+ int row4x4_start, int column4x4_start,
+ int column4x4_end, int sb4x4) {
+ assert(row4x4_start >= 0);
+ assert(DoDeblock());
+ column4x4_end =
+ std::min(Align(column4x4_end, static_cast<int>(kNum4x4InLoopFilterUnit)),
+ frame_header_.columns4x4);
+ if (column4x4_start >= column4x4_end) return;
+ (this->*deblock_filter_func_[loop_filter_type])(
+ row4x4_start, row4x4_start + sb4x4, column4x4_start, column4x4_end);
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Thresholds for the deblocking filter. Precomputed values of part of Section
+// 7.14.4 for all possible values of sharpness.
+
+constexpr uint8_t kInnerThresh[8][kMaxLoopFilterValue + 1] = {
+ {1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6},
+ {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5},
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4},
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3},
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}};
+
+constexpr uint8_t kOuterThresh[8][kMaxLoopFilterValue + 1] = {
+ {5, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40,
+ 43, 46, 49, 52, 55, 58, 61, 64, 67, 70, 73, 76, 79,
+ 82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118,
+ 121, 124, 127, 130, 133, 136, 139, 142, 145, 148, 151, 154, 157,
+ 160, 163, 166, 169, 172, 175, 178, 181, 184, 187, 190, 193},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34,
+ 36, 39, 41, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88,
+ 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114,
+ 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34,
+ 36, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61,
+ 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87,
+ 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113,
+ 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34,
+ 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60,
+ 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86,
+ 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112,
+ 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136},
+ {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 33,
+ 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59,
+ 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85,
+ 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111,
+ 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135},
+ {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31,
+ 33, 35, 37, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58,
+ 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84,
+ 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110,
+ 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134},
+ {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57,
+ 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83,
+ 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109,
+ 111, 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133},
+ {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56,
+ 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82,
+ 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108,
+ 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132}};
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+
+namespace libgav1 {
+
+template <typename Pixel>
+void PostFilter::ApplyLoopRestorationForOneRow(
+ const Pixel* src_buffer, const ptrdiff_t stride, const Plane plane,
+ const int plane_height, const int plane_width, const int unit_y,
+ const int unit_row, const int current_process_unit_height,
+ const int plane_unit_size, Pixel* dst_buffer) {
+ const int num_horizontal_units =
+ restoration_info_->num_horizontal_units(static_cast<Plane>(plane));
+ const RestorationUnitInfo* const restoration_info =
+ restoration_info_->loop_restoration_info(static_cast<Plane>(plane),
+ unit_row * num_horizontal_units);
+ const bool in_place = DoCdef() || thread_pool_ != nullptr;
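+ // When filtering in place, the rows previously saved in
+ // |loop_restoration_border_| supply the top and bottom context that the
+ // frame buffer no longer holds.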
+ const Pixel* border = nullptr;
+ ptrdiff_t border_stride = 0;
+ src_buffer += unit_y * stride;
+ if (in_place) {
+ const int border_unit_y = std::max(
+ RightShiftWithCeiling(unit_y, 4 - subsampling_y_[plane]) - 4, 0);
+ border_stride = loop_restoration_border_.stride(plane) / sizeof(Pixel);
+ border =
+ reinterpret_cast<const Pixel*>(loop_restoration_border_.data(plane)) +
+ border_unit_y * border_stride;
+ }
+ int unit_column = 0;
+ int column = 0;
+ do {
+ const int current_process_unit_width =
+ std::min(plane_unit_size, plane_width - column);
+ const Pixel* src = src_buffer + column;
+ unit_column = std::min(unit_column, num_horizontal_units - 1);
+ if (restoration_info[unit_column].type == kLoopRestorationTypeNone) {
+ Pixel* dst = dst_buffer + column;
+ if (in_place) {
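+ // Source and destination may overlap when filtering in place, so use
+ // memmove() rather than memcpy().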
+ int k = current_process_unit_height;
+ do {
+ memmove(dst, src, current_process_unit_width * sizeof(Pixel));
+ src += stride;
+ dst += stride;
+ } while (--k != 0);
+ } else {
+ CopyPlane(src, stride, current_process_unit_width,
+ current_process_unit_height, dst, stride);
+ }
+ } else {
+ const Pixel* top_border = src - kRestorationVerticalBorder * stride;
+ ptrdiff_t top_border_stride = stride;
+ const Pixel* bottom_border = src + current_process_unit_height * stride;
+ ptrdiff_t bottom_border_stride = stride;
+ const bool frame_bottom_border =
+ (unit_y + current_process_unit_height >= plane_height);
+ if (in_place && (unit_y != 0 || !frame_bottom_border)) {
+ const Pixel* loop_restoration_border = border + column;
+ if (unit_y != 0) {
+ top_border = loop_restoration_border;
+ top_border_stride = border_stride;
+ loop_restoration_border += 4 * border_stride;
+ }
+ if (!frame_bottom_border) {
+ bottom_border = loop_restoration_border +
+ kRestorationVerticalBorder * border_stride;
+ bottom_border_stride = border_stride;
+ }
+ }
+#if LIBGAV1_MSAN
+ // The optimized loop restoration code may read past the initialized
+ // portion of the buffer, so zero-initialize it to keep msan quiet.
+ RestorationBuffer restoration_buffer = {};
+#else
+ RestorationBuffer restoration_buffer;
+#endif
+ const LoopRestorationType type = restoration_info[unit_column].type;
+ assert(type == kLoopRestorationTypeSgrProj ||
+ type == kLoopRestorationTypeWiener);
+ const dsp::LoopRestorationFunc restoration_func =
+ dsp_.loop_restorations[type - 2];
+ restoration_func(restoration_info[unit_column], src, stride, top_border,
+ top_border_stride, bottom_border, bottom_border_stride,
+ current_process_unit_width, current_process_unit_height,
+ &restoration_buffer, dst_buffer + column);
+ }
+ ++unit_column;
+ column += plane_unit_size;
+ } while (column < plane_width);
+}
+
+template <typename Pixel>
+void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(const int row4x4_start,
+ const int sb4x4) {
+ assert(row4x4_start >= 0);
+ assert(DoRestoration());
+ int plane = kPlaneY;
+ const int upscaled_width = frame_header_.upscaled_width;
+ const int height = frame_header_.height;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ continue;
+ }
+ const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(Pixel);
+ const int unit_height_offset =
+ kRestorationUnitOffset >> subsampling_y_[plane];
+ const int plane_height = SubsampledValue(height, subsampling_y_[plane]);
+ const int plane_width =
+ SubsampledValue(upscaled_width, subsampling_x_[plane]);
+ const int plane_unit_size = 1 << loop_restoration_.unit_size_log2[plane];
+ const int plane_process_unit_height =
+ kRestorationUnitHeight >> subsampling_y_[plane];
+ int y = (row4x4_start == 0)
+ ? 0
+ : (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) -
+ unit_height_offset;
+ int expected_height = plane_process_unit_height -
+ ((row4x4_start == 0) ? unit_height_offset : 0);
+ int current_process_unit_height;
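+ // Each iteration covers one 64 pixel superblock (16 4x4 units) vertically.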
+ for (int sb_y = 0; sb_y < sb4x4;
+ sb_y += 16, y += current_process_unit_height) {
+ if (y >= plane_height) break;
+ const int unit_row = std::min(
+ (y + unit_height_offset) >> loop_restoration_.unit_size_log2[plane],
+ restoration_info_->num_vertical_units(static_cast<Plane>(plane)) - 1);
+ current_process_unit_height = std::min(expected_height, plane_height - y);
+ expected_height = plane_process_unit_height;
+ ApplyLoopRestorationForOneRow<Pixel>(
+ reinterpret_cast<Pixel*>(superres_buffer_[plane]), stride,
+ static_cast<Plane>(plane), plane_height, plane_width, y, unit_row,
+ current_process_unit_height, plane_unit_size,
+ reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) +
+ y * stride);
+ }
+ } while (++plane < planes_);
+}
+
+void PostFilter::ApplyLoopRestoration(const int row4x4_start, const int sb4x4) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(row4x4_start, sb4x4);
+ return;
+ }
+#endif
+ ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(row4x4_start, sb4x4);
+}
+
+void PostFilter::ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic) {
+ int row4x4;
+ // Loop restoration operates with a lag of 8 rows (4 for chroma with
+ // subsampling) and hence we need to make sure to cover the last 8 rows of the
+ // last superblock row. So we run this loop for an extra iteration to
+ // accomplish that.
+ const int row4x4_end = frame_header_.rows4x4 + kNum4x4InLoopRestorationUnit;
+ while ((row4x4 = row4x4_atomic->fetch_add(kNum4x4InLoopRestorationUnit,
+ std::memory_order_relaxed)) <
+ row4x4_end) {
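+ // fetch_add() hands each worker a distinct superblock row, so every row is
+ // processed exactly once without additional locking.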
+ CopyBordersForOneSuperBlockRow(row4x4, kNum4x4InLoopRestorationUnit,
+ /*for_loop_restoration=*/true);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(
+ row4x4, kNum4x4InLoopRestorationUnit);
+ continue;
+ }
+#endif
+ ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(
+ row4x4, kNum4x4InLoopRestorationUnit);
+ }
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/post_filter.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants into the anonymous namespace.
+#include "src/post_filter/deblock_thresholds.inc"
+
+// Row indices of loop restoration border. This is used to populate the
+// |loop_restoration_border_| when either cdef is on or multithreading is
+// enabled. The dimension is subsampling_y.
+constexpr int kLoopRestorationBorderRows[2] = {54, 26};
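+// (These values work out to 54 = 64 - 8 - 2 and 26 = 32 - 4 - 2; one reading
+// is the superblock height minus the deblock lag minus two rows of border.)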
+
+} // namespace
+
+PostFilter::PostFilter(const ObuFrameHeader& frame_header,
+ const ObuSequenceHeader& sequence_header,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ YuvBuffer* const frame_buffer, const dsp::Dsp* dsp,
+ int do_post_filter_mask)
+ : frame_header_(frame_header),
+ loop_restoration_(frame_header.loop_restoration),
+ dsp_(*dsp),
+ bitdepth_(sequence_header.color_config.bitdepth),
+ subsampling_x_{0, sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_x},
+ subsampling_y_{0, sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.subsampling_y},
+ planes_(sequence_header.color_config.is_monochrome ? kMaxPlanesMonochrome
+ : kMaxPlanes),
+ pixel_size_log2_(static_cast<int>((bitdepth_ == 8) ? sizeof(uint8_t)
+ : sizeof(uint16_t)) -
+ 1),
+ inner_thresh_(kInnerThresh[frame_header.loop_filter.sharpness]),
+ outer_thresh_(kOuterThresh[frame_header.loop_filter.sharpness]),
+ needs_chroma_deblock_(frame_header.loop_filter.level[kPlaneU + 1] != 0 ||
+ frame_header.loop_filter.level[kPlaneV + 1] != 0),
+ do_cdef_(DoCdef(frame_header, do_post_filter_mask)),
+ do_deblock_(DoDeblock(frame_header, do_post_filter_mask)),
+ do_restoration_(
+ DoRestoration(loop_restoration_, do_post_filter_mask, planes_)),
+ do_superres_(DoSuperRes(frame_header, do_post_filter_mask)),
+ cdef_index_(frame_scratch_buffer->cdef_index),
+ cdef_skip_(frame_scratch_buffer->cdef_skip),
+ inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
+ restoration_info_(&frame_scratch_buffer->loop_restoration_info),
+ superres_coefficients_{
+ frame_scratch_buffer->superres_coefficients[kPlaneTypeY].get(),
+ frame_scratch_buffer
+ ->superres_coefficients
+ [(sequence_header.color_config.is_monochrome ||
+ sequence_header.color_config.subsampling_x == 0)
+ ? kPlaneTypeY
+ : kPlaneTypeUV]
+ .get()},
+ superres_line_buffer_(frame_scratch_buffer->superres_line_buffer),
+ block_parameters_(frame_scratch_buffer->block_parameters_holder),
+ frame_buffer_(*frame_buffer),
+ cdef_border_(frame_scratch_buffer->cdef_border),
+ loop_restoration_border_(frame_scratch_buffer->loop_restoration_border),
+ thread_pool_(
+ frame_scratch_buffer->threading_strategy.post_filter_thread_pool()) {
+ const int8_t zero_delta_lf[kFrameLfCount] = {};
+ ComputeDeblockFilterLevels(zero_delta_lf, deblock_filter_levels_);
+ if (DoSuperRes()) {
+ int plane = kPlaneY;
+ const int width = frame_header_.width;
+ const int upscaled_width_fh = frame_header_.upscaled_width;
+ do {
+ const int downscaled_width =
+ SubsampledValue(width, subsampling_x_[plane]);
+ const int upscaled_width =
+ SubsampledValue(upscaled_width_fh, subsampling_x_[plane]);
+ const int superres_width = downscaled_width << kSuperResScaleBits;
+ super_res_info_[plane].step =
+ (superres_width + upscaled_width / 2) / upscaled_width;
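+ // |step| is the horizontal input increment per output pixel in units of
+ // (1 << kSuperResScaleBits). For example, a 2x upscale from 960 to 1920
+ // pixels gives step = ((960 << kSuperResScaleBits) + 960) / 1920, which is
+ // exactly half of (1 << kSuperResScaleBits).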
+ const int error =
+ super_res_info_[plane].step * upscaled_width - superres_width;
+ super_res_info_[plane].initial_subpixel_x =
+ ((-((upscaled_width - downscaled_width) << (kSuperResScaleBits - 1)) +
+ DivideBy2(upscaled_width)) /
+ upscaled_width +
+ (1 << (kSuperResExtraBits - 1)) - error / 2) &
+ kSuperResScaleMask;
+ super_res_info_[plane].upscaled_width = upscaled_width;
+ } while (++plane < planes_);
+ if (dsp->super_res_coefficients != nullptr) {
+ int plane = kPlaneY;
+ const int number_loops = (superres_coefficients_[kPlaneTypeY] ==
+ superres_coefficients_[kPlaneTypeUV])
+ ? kMaxPlanesMonochrome
+ : static_cast<int>(kNumPlaneTypes);
+ do {
+ dsp->super_res_coefficients(super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step,
+ superres_coefficients_[plane]);
+ } while (++plane < number_loops);
+ }
+ }
+ int plane = kPlaneY;
+ do {
+ loop_restoration_buffer_[plane] = frame_buffer_.data(plane);
+ cdef_buffer_[plane] = frame_buffer_.data(plane);
+ superres_buffer_[plane] = frame_buffer_.data(plane);
+ source_buffer_[plane] = frame_buffer_.data(plane);
+ } while (++plane < planes_);
+ if (DoCdef() || DoRestoration() || DoSuperRes()) {
+ plane = kPlaneY;
+ const int pixel_size_log2 = pixel_size_log2_;
+ do {
+ int horizontal_shift = 0;
+ int vertical_shift = 0;
+ if (DoRestoration() &&
+ loop_restoration_.type[plane] != kLoopRestorationTypeNone) {
+ horizontal_shift += frame_buffer_.alignment();
+ if (!DoCdef() && thread_pool_ == nullptr) {
+ vertical_shift += kRestorationVerticalBorder;
+ }
+ superres_buffer_[plane] +=
+ vertical_shift * frame_buffer_.stride(plane) +
+ (horizontal_shift << pixel_size_log2);
+ }
+ if (DoSuperRes()) {
+ vertical_shift += kSuperResVerticalBorder;
+ }
+ cdef_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) +
+ (horizontal_shift << pixel_size_log2);
+ if (DoCdef() && thread_pool_ == nullptr) {
+ horizontal_shift += frame_buffer_.alignment();
+ vertical_shift += kCdefBorder;
+ }
+ assert(horizontal_shift <= frame_buffer_.right_border(plane));
+ assert(vertical_shift <= frame_buffer_.bottom_border(plane));
+ source_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) +
+ (horizontal_shift << pixel_size_log2);
+ } while (++plane < planes_);
+ }
+}
+
+// The following example illustrates how ExtendFrame() extends a frame.
+// Suppose the frame width is 8 and height is 4, and left, right, top, and
+// bottom are all equal to 3.
+//
+// Before:
+//
+// ABCDEFGH
+// IJKLMNOP
+// QRSTUVWX
+// YZabcdef
+//
+// After:
+//
+// AAA|ABCDEFGH|HHH [3]
+// AAA|ABCDEFGH|HHH
+// AAA|ABCDEFGH|HHH
+// ---+--------+---
+// AAA|ABCDEFGH|HHH [1]
+// III|IJKLMNOP|PPP
+// QQQ|QRSTUVWX|XXX
+// YYY|YZabcdef|fff
+// ---+--------+---
+// YYY|YZabcdef|fff [2]
+// YYY|YZabcdef|fff
+// YYY|YZabcdef|fff
+//
+// ExtendFrame() first extends the rows to the left and to the right[1]. Then
+// it copies the extended last row to the bottom borders[2]. Finally it copies
+// the extended first row to the top borders[3].
+// static
+template <typename Pixel>
+void PostFilter::ExtendFrame(Pixel* const frame_start, const int width,
+ const int height, const ptrdiff_t stride,
+ const int left, const int right, const int top,
+ const int bottom) {
+ Pixel* src = frame_start;
+ // Copy to left and right borders.
+ int y = height;
+ do {
+ ExtendLine<Pixel>(src, width, left, right);
+ src += stride;
+ } while (--y != 0);
+ // Copy to bottom borders. For performance we copy |stride| pixels
+ // (including some padding pixels potentially) in each row, ending at the
+ // bottom right border pixel. In the diagram the asterisks indicate padding
+ // pixels.
+ //
+ // |<--- stride --->|
+ // **YYY|YZabcdef|fff <-- Copy from the extended last row.
+ // -----+--------+---
+ // **YYY|YZabcdef|fff
+ // **YYY|YZabcdef|fff
+ // **YYY|YZabcdef|fff <-- bottom right border pixel
+ assert(src == frame_start + height * stride);
+ Pixel* dst = src - left;
+ src = dst - stride;
+ for (int y = 0; y < bottom; ++y) {
+ memcpy(dst, src, sizeof(Pixel) * stride);
+ dst += stride;
+ }
+ // Copy to top borders. For performance we copy |stride| pixels (including
+ // some padding pixels potentially) in each row, starting from the top left
+ // border pixel. In the diagram the asterisks indicate padding pixels.
+ //
+ // +-- top left border pixel
+ // |
+ // v
+ // AAA|ABCDEFGH|HHH**
+ // AAA|ABCDEFGH|HHH**
+ // AAA|ABCDEFGH|HHH**
+ // ---+--------+-----
+ // AAA|ABCDEFGH|HHH** <-- Copy from the extended first row.
+ // |<--- stride --->|
+ src = frame_start - left;
+ dst = frame_start - left - top * stride;
+ for (int y = 0; y < top; ++y) {
+ memcpy(dst, src, sizeof(Pixel) * stride);
+ dst += stride;
+ }
+}
+
+template void PostFilter::ExtendFrame<uint8_t>(uint8_t* const frame_start,
+ const int width,
+ const int height,
+ const ptrdiff_t stride,
+ const int left, const int right,
+ const int top, const int bottom);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void PostFilter::ExtendFrame<uint16_t>(
+ uint16_t* const frame_start, const int width, const int height,
+ const ptrdiff_t stride, const int left, const int right, const int top,
+ const int bottom);
+#endif
+
+void PostFilter::ExtendFrameBoundary(uint8_t* const frame_start,
+ const int width, const int height,
+ const ptrdiff_t stride, const int left,
+ const int right, const int top,
+ const int bottom) const {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
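+ // |stride| is in bytes; convert it to uint16_t units for the high
+ // bitdepth path.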
+ ExtendFrame<uint16_t>(reinterpret_cast<uint16_t*>(frame_start), width,
+ height, stride >> 1, left, right, top, bottom);
+ return;
+ }
+#endif
+ ExtendFrame<uint8_t>(frame_start, width, height, stride, left, right, top,
+ bottom);
+}
+
+void PostFilter::ExtendBordersForReferenceFrame() {
+ if (frame_header_.refresh_frame_flags == 0) return;
+ const int upscaled_width = frame_header_.upscaled_width;
+ const int height = frame_header_.height;
+ int plane = kPlaneY;
+ do {
+ const int plane_width =
+ SubsampledValue(upscaled_width, subsampling_x_[plane]);
+ const int plane_height = SubsampledValue(height, subsampling_y_[plane]);
+ assert(frame_buffer_.left_border(plane) >= kMinLeftBorderPixels &&
+ frame_buffer_.right_border(plane) >= kMinRightBorderPixels &&
+ frame_buffer_.top_border(plane) >= kMinTopBorderPixels &&
+ frame_buffer_.bottom_border(plane) >= kMinBottomBorderPixels);
+ // plane  subsampling_x_  left_border
+ // Y      N/A             64, 48
+ // U,V    0               64, 48
+ // U,V    1               32, 16
+ assert(frame_buffer_.left_border(plane) >= 16);
+ // The |left| argument to ExtendFrameBoundary() must be at least
+ // kMinLeftBorderPixels (13) for warp.
+ static_assert(16 >= kMinLeftBorderPixels, "");
+ ExtendFrameBoundary(
+ frame_buffer_.data(plane), plane_width, plane_height,
+ frame_buffer_.stride(plane), frame_buffer_.left_border(plane),
+ frame_buffer_.right_border(plane), frame_buffer_.top_border(plane),
+ frame_buffer_.bottom_border(plane));
+ } while (++plane < planes_);
+}
+
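+// Copies four deblocked rows into |loop_restoration_border_| so that loop
+// restoration can later use them as vertical context, replicating the last
+// valid row if the frame ends first.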
+void PostFilter::CopyDeblockedPixels(Plane plane, int row4x4) {
+ const ptrdiff_t src_stride = frame_buffer_.stride(plane);
+ const uint8_t* const src = GetSourceBuffer(plane, row4x4, 0);
+ const int row_offset = DivideBy4(row4x4);
+ const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane);
+ uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * dst_stride;
+ const int num_pixels = SubsampledValue(MultiplyBy4(frame_header_.columns4x4),
+ subsampling_x_[plane]);
+ const int row_width = num_pixels << pixel_size_log2_;
+ int last_valid_row = -1;
+ const int plane_height =
+ SubsampledValue(frame_header_.height, subsampling_y_[plane]);
+ int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+ const int absolute_row = (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ for (int i = 0; i < 4; ++i, ++row) {
+ if (absolute_row + i >= plane_height) {
+ if (last_valid_row == -1) break;
+ // If we run out of rows, copy the last valid row (mimics the bottom
+ // border extension).
+ row = last_valid_row;
+ }
+ memcpy(dst, src + row * src_stride, row_width);
+ last_valid_row = row;
+ dst += dst_stride;
+ }
+}
+
+void PostFilter::CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4,
+ bool for_loop_restoration) {
+ // Number of rows to be subtracted from the start position described by
+ // row4x4. We always lag by 8 rows (to account for in-loop post filters).
+ const int row_offset = (row4x4 == 0) ? 0 : 8;
+ // Number of rows to be subtracted from the height described by sb4x4.
+ const int height_offset = (row4x4 == 0) ? 8 : 0;
+ // If cdef is off and post filter multithreading is off, then loop restoration
+ // needs 2 extra rows for the bottom border in each plane.
+ const int extra_rows =
+ (for_loop_restoration && thread_pool_ == nullptr && !DoCdef()) ? 2 : 0;
+ const int upscaled_width = frame_header_.upscaled_width;
+ const int height = frame_header_.height;
+ int plane = kPlaneY;
+ do {
+ const int plane_width =
+ SubsampledValue(upscaled_width, subsampling_x_[plane]);
+ const int plane_height = SubsampledValue(height, subsampling_y_[plane]);
+ const int row = (MultiplyBy4(row4x4) - row_offset) >> subsampling_y_[plane];
+ assert(row >= 0);
+ if (row >= plane_height) break;
+ const int num_rows =
+ std::min(SubsampledValue(MultiplyBy4(sb4x4) - height_offset,
+ subsampling_y_[plane]) +
+ extra_rows,
+ plane_height - row);
+ // We only need to track the progress of the Y plane since the progress of
+ // the U and V planes will be inferred from the progress of the Y plane.
+ if (!for_loop_restoration && plane == kPlaneY) {
+ progress_row_ = row + num_rows;
+ }
+ const bool copy_bottom = row + num_rows == plane_height;
+ const ptrdiff_t stride = frame_buffer_.stride(plane);
+ uint8_t* const start = (for_loop_restoration ? superres_buffer_[plane]
+ : frame_buffer_.data(plane)) +
+ row * stride;
+#if LIBGAV1_MSAN
+ const int right_padding =
+ (frame_buffer_.stride(plane) >> static_cast<int>(bitdepth_ > 8)) -
+ ((frame_buffer_.left_border(plane) + frame_buffer_.width(plane) +
+ frame_buffer_.right_border(plane)));
+ const int padded_right_border_size =
+ frame_buffer_.right_border(plane) + right_padding;
+ // The optimized loop restoration code may read into the next row's left
+ // border depending on the start of the last superblock and the size of the
+ // right border. This is safe as the post filter is applied after
+ // reconstruction is complete and the threaded implementations do not read
+ // from the left border.
+ const int left_border_overread =
+ (for_loop_restoration && padded_right_border_size < 64)
+ ? 63 - padded_right_border_size
+ : 0;
+ assert(!for_loop_restoration || left_border_overread == 0 ||
+ (frame_buffer_.bottom_border(plane) > 0 &&
+ left_border_overread <= frame_buffer_.left_border(plane)));
+ const int left_border = (for_loop_restoration && left_border_overread == 0)
+ ? kRestorationHorizontalBorder
+ : frame_buffer_.left_border(plane);
+ // The optimized loop restoration code will overread the visible frame
+ // buffer into the right border. Extend the right boundary further to
+ // prevent msan warnings.
+ const int right_border = for_loop_restoration
+ ? std::min(padded_right_border_size, 63)
+ : frame_buffer_.right_border(plane);
+#else
+ const int left_border = for_loop_restoration
+ ? kRestorationHorizontalBorder
+ : frame_buffer_.left_border(plane);
+ const int right_border = for_loop_restoration
+ ? kRestorationHorizontalBorder
+ : frame_buffer_.right_border(plane);
+#endif
+ const int top_border =
+ (row == 0) ? (for_loop_restoration ? kRestorationVerticalBorder
+ : frame_buffer_.top_border(plane))
+ : 0;
+ const int bottom_border =
+ copy_bottom
+ ? (for_loop_restoration ? kRestorationVerticalBorder
+ : frame_buffer_.bottom_border(plane))
+ : 0;
+ ExtendFrameBoundary(start, plane_width, num_rows, stride, left_border,
+ right_border, top_border, bottom_border);
+ } while (++plane < planes_);
+}
+
+void PostFilter::SetupLoopRestorationBorder(const int row4x4) {
+ assert(row4x4 >= 0);
+ assert(!DoCdef());
+ assert(DoRestoration());
+ const int upscaled_width = frame_header_.upscaled_width;
+ const int height = frame_header_.height;
+ int plane = kPlaneY;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ continue;
+ }
+ const int row_offset = DivideBy4(row4x4);
+ const int num_pixels =
+ SubsampledValue(upscaled_width, subsampling_x_[plane]);
+ const int row_width = num_pixels << pixel_size_log2_;
+ const int plane_height = SubsampledValue(height, subsampling_y_[plane]);
+ const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+ const int absolute_row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ const ptrdiff_t src_stride = frame_buffer_.stride(plane);
+ const uint8_t* src =
+ GetSuperResBuffer(static_cast<Plane>(plane), row4x4, 0) +
+ row * src_stride;
+ const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane);
+ uint8_t* dst =
+ loop_restoration_border_.data(plane) + row_offset * dst_stride;
+ for (int i = 0; i < 4; ++i) {
+ memcpy(dst, src, row_width);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ExtendLine<uint16_t>(dst, num_pixels, kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ } else // NOLINT.
+#endif
+ ExtendLine<uint8_t>(dst, num_pixels, kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ // If we run out of rows, copy the last valid row (mimics the bottom
+ // border extension).
+ if (absolute_row + i < plane_height - 1) src += src_stride;
+ dst += dst_stride;
+ }
+ } while (++plane < planes_);
+}
+
+void PostFilter::SetupLoopRestorationBorder(int row4x4_start, int sb4x4) {
+ assert(row4x4_start >= 0);
+ assert(DoCdef());
+ assert(DoRestoration());
+ for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) {
+ const int row4x4 = row4x4_start + sb_y;
+ const int row_offset_start = DivideBy4(row4x4);
+ const std::array<uint8_t*, kMaxPlanes> dst = {
+ loop_restoration_border_.data(kPlaneY) +
+ row_offset_start * static_cast<ptrdiff_t>(
+ loop_restoration_border_.stride(kPlaneY)),
+ loop_restoration_border_.data(kPlaneU) +
+ row_offset_start * static_cast<ptrdiff_t>(
+ loop_restoration_border_.stride(kPlaneU)),
+ loop_restoration_border_.data(kPlaneV) +
+ row_offset_start * static_cast<ptrdiff_t>(
+ loop_restoration_border_.stride(kPlaneV))};
+ // If SuperRes is enabled, apply SuperRes to the rows to be copied,
+ // writing directly into |loop_restoration_border_|. Otherwise, simply
+ // copy the rows.
+ if (DoSuperRes()) {
+ std::array<uint8_t*, kMaxPlanes> src;
+ std::array<int, kMaxPlanes> rows;
+ const int height = frame_header_.height;
+ int plane = kPlaneY;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ rows[plane] = 0;
+ continue;
+ }
+ const int plane_height = SubsampledValue(height, subsampling_y_[plane]);
+ const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+ const int absolute_row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ src[plane] = GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+ row * static_cast<ptrdiff_t>(frame_buffer_.stride(plane));
+ rows[plane] = Clip3(plane_height - absolute_row, 0, 4);
+ } while (++plane < planes_);
+ ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst,
+ /*dst_is_loop_restoration_border=*/true);
+ // If we run out of rows, copy the last valid row (mimics the bottom
+ // border extension).
+ plane = kPlaneY;
+ do {
+ if (rows[plane] == 0 || rows[plane] >= 4) continue;
+ const ptrdiff_t stride = loop_restoration_border_.stride(plane);
+ uint8_t* dst_line = dst[plane] + rows[plane] * stride;
+ const uint8_t* const src_line = dst_line - stride;
+ const int upscaled_width = super_res_info_[plane].upscaled_width
+ << pixel_size_log2_;
+ for (int i = rows[plane]; i < 4; ++i) {
+ memcpy(dst_line, src_line, upscaled_width);
+ dst_line += stride;
+ }
+ } while (++plane < planes_);
+ } else {
+ int plane = kPlaneY;
+ do {
+ CopyDeblockedPixels(static_cast<Plane>(plane), row4x4);
+ } while (++plane < planes_);
+ }
+ // Extend the left and right boundaries needed for loop restoration.
+ const int upscaled_width = frame_header_.upscaled_width;
+ int plane = kPlaneY;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ continue;
+ }
+ uint8_t* dst_line = dst[plane];
+ const int plane_width =
+ SubsampledValue(upscaled_width, subsampling_x_[plane]);
+ for (int i = 0; i < 4; ++i) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ExtendLine<uint16_t>(dst_line, plane_width,
+ kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ } else // NOLINT.
+#endif
+ {
+ ExtendLine<uint8_t>(dst_line, plane_width,
+ kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ }
+ dst_line += loop_restoration_border_.stride(plane);
+ }
+ } while (++plane < planes_);
+ }
+}
+
+void PostFilter::RunJobs(WorkerFunction worker) {
+ std::atomic<int> row4x4(0);
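+ // |row4x4| is the shared work counter from which each worker (including
+ // the current thread) atomically claims rows.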
+ const int num_workers = thread_pool_->num_threads();
+ BlockingCounter pending_workers(num_workers);
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool_->Schedule([this, &row4x4, &pending_workers, worker]() {
+ (this->*worker)(&row4x4);
+ pending_workers.Decrement();
+ });
+ }
+ // Run the jobs on the current thread.
+ (this->*worker)(&row4x4);
+ // Wait for the threadpool jobs to finish.
+ pending_workers.Wait();
+}
+
+void PostFilter::ApplyFilteringThreaded() {
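+ // The filters run in decode order: deblocking, cdef, SuperRes, then loop
+ // restoration, with the required borders set up between stages.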
+ if (DoDeblock()) {
+ RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>);
+ RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>);
+ }
+ if (DoCdef() && DoRestoration()) {
+ for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
+ row4x4 += kNum4x4InLoopFilterUnit) {
+ SetupLoopRestorationBorder(row4x4, kNum4x4InLoopFilterUnit);
+ }
+ }
+ if (DoCdef()) {
+ for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
+ row4x4 += kNum4x4InLoopFilterUnit) {
+ SetupCdefBorder(row4x4);
+ }
+ RunJobs(&PostFilter::ApplyCdefWorker);
+ }
+ if (DoSuperRes()) ApplySuperResThreaded();
+ if (DoRestoration()) {
+ if (!DoCdef()) {
+ int row4x4 = 0;
+ do {
+ SetupLoopRestorationBorder(row4x4);
+ row4x4 += kNum4x4InLoopFilterUnit;
+ } while (row4x4 < frame_header_.rows4x4);
+ }
+ RunJobs(&PostFilter::ApplyLoopRestorationWorker);
+ }
+ ExtendBordersForReferenceFrame();
+}
+
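+// Applies every enabled filter to one superblock row and returns the Y row
+// (in pixels) up to which the frame is fully filtered, or -1 if |row4x4| is
+// invalid.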
+int PostFilter::ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4,
+ bool is_last_row,
+ bool do_deblock) {
+ if (row4x4 < 0) return -1;
+ if (DoDeblock() && do_deblock) {
+ VerticalDeblockFilter(row4x4, row4x4 + sb4x4, 0, frame_header_.columns4x4);
+ HorizontalDeblockFilter(row4x4, row4x4 + sb4x4, 0,
+ frame_header_.columns4x4);
+ }
+ if (DoRestoration() && DoCdef()) {
+ SetupLoopRestorationBorder(row4x4, sb4x4);
+ }
+ if (DoCdef()) {
+ ApplyCdefForOneSuperBlockRow(row4x4, sb4x4, is_last_row);
+ }
+ if (DoSuperRes()) {
+ ApplySuperResForOneSuperBlockRow(row4x4, sb4x4, is_last_row);
+ }
+ if (DoRestoration()) {
+ CopyBordersForOneSuperBlockRow(row4x4, sb4x4, true);
+ ApplyLoopRestoration(row4x4, sb4x4);
+ if (is_last_row) {
+ // Loop restoration operates with a lag of 8 rows. So make sure to cover
+ // all the rows of the last superblock row.
+ CopyBordersForOneSuperBlockRow(row4x4 + sb4x4, 16, true);
+ ApplyLoopRestoration(row4x4 + sb4x4, 16);
+ }
+ }
+ if (frame_header_.refresh_frame_flags != 0 && DoBorderExtensionInLoop()) {
+ CopyBordersForOneSuperBlockRow(row4x4, sb4x4, false);
+ if (is_last_row) {
+ CopyBordersForOneSuperBlockRow(row4x4 + sb4x4, 16, false);
+ }
+ }
+ if (is_last_row && !DoBorderExtensionInLoop()) {
+ ExtendBordersForReferenceFrame();
+ }
+ return is_last_row ? frame_header_.height : progress_row_;
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+
+namespace libgav1 {
+
+void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src,
+ const std::array<int, kMaxPlanes>& rows,
+ const int line_buffer_row,
+ const std::array<uint8_t*, kMaxPlanes>& dst,
+ bool dst_is_loop_restoration_border /*=false*/) {
+ int plane = kPlaneY;
+ do {
+ const int plane_width =
+ MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ auto* input = reinterpret_cast<uint16_t*>(src[plane]);
+ auto* output = reinterpret_cast<uint16_t*>(dst[plane]);
+ const ptrdiff_t input_stride =
+ frame_buffer_.stride(plane) / sizeof(uint16_t);
+ const ptrdiff_t output_stride =
+ (dst_is_loop_restoration_border
+ ? loop_restoration_border_.stride(plane)
+ : frame_buffer_.stride(plane)) /
+ sizeof(uint16_t);
+ if (rows[plane] > 0) {
+ dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+ input, input_stride, rows[plane], plane_width,
+ super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, output, output_stride);
+ }
+ // In the multi-threaded case, the |superres_line_buffer_| holds the last
+ // input row. Apply SuperRes for that row.
+ if (line_buffer_row >= 0) {
+ auto* const line_buffer_start =
+ reinterpret_cast<uint16_t*>(superres_line_buffer_.data(plane)) +
+ line_buffer_row * superres_line_buffer_.stride(plane) /
+ sizeof(uint16_t) +
+ kSuperResHorizontalBorder;
+ dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+ line_buffer_start, /*source_stride=*/0,
+ /*height=*/1, plane_width,
+ super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step,
+ output + rows[plane] * output_stride, /*dest_stride=*/0);
+ }
+ continue;
+ }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ uint8_t* input = src[plane];
+ uint8_t* output = dst[plane];
+ const ptrdiff_t input_stride = frame_buffer_.stride(plane);
+ const ptrdiff_t output_stride = dst_is_loop_restoration_border
+ ? loop_restoration_border_.stride(plane)
+ : frame_buffer_.stride(plane);
+ if (rows[plane] > 0) {
+ dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+ input, input_stride, rows[plane], plane_width,
+ super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, output, output_stride);
+ }
+ // In the multi-threaded case, the |superres_line_buffer_| holds the last
+ // input row. Apply SuperRes for that row.
+ if (line_buffer_row >= 0) {
+ uint8_t* const line_buffer_start =
+ superres_line_buffer_.data(plane) +
+ line_buffer_row * superres_line_buffer_.stride(plane) +
+ kSuperResHorizontalBorder;
+ dsp_.super_res(
+ superres_coefficients_[static_cast<int>(plane != 0)],
+ line_buffer_start, /*source_stride=*/0,
+ /*height=*/1, plane_width, super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, output + rows[plane] * output_stride,
+ /*dest_stride=*/0);
+ }
+ } while (++plane < planes_);
+}
+
+void PostFilter::ApplySuperResForOneSuperBlockRow(int row4x4_start, int sb4x4,
+ bool is_last_row) {
+ assert(row4x4_start >= 0);
+ assert(DoSuperRes());
+ // If not doing cdef, then LR needs two rows of border with superres applied.
+ const int num_rows_extra = (DoCdef() || !DoRestoration()) ? 0 : 2;
+ std::array<uint8_t*, kMaxPlanes> src;
+ std::array<uint8_t*, kMaxPlanes> dst;
+ std::array<int, kMaxPlanes> rows;
+ const int num_rows4x4 =
+ std::min(sb4x4, frame_header_.rows4x4 - row4x4_start) -
+ (is_last_row ? 0 : 2);
+ if (row4x4_start > 0) {
+ const int row4x4 = row4x4_start - 2;
+ int plane = kPlaneY;
+ do {
+ const int row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + num_rows_extra;
+ const ptrdiff_t row_offset = row * frame_buffer_.stride(plane);
+ src[plane] = cdef_buffer_[plane] + row_offset;
+ dst[plane] = superres_buffer_[plane] + row_offset;
+ // Note that the |num_rows_extra| subtraction is done after the value is
+ // subsampled since we always need to work on |num_rows_extra| extra rows
+ // irrespective of the plane subsampling.
+ // Apply superres for the last 8-|num_rows_extra| rows of the previous
+ // superblock.
+ rows[plane] = (8 >> subsampling_y_[plane]) - num_rows_extra;
+ // Apply superres for the current superblock row (except for the last
+ // 8-|num_rows_extra| rows).
+ rows[plane] += (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
+ (is_last_row ? 0 : num_rows_extra);
+ } while (++plane < planes_);
+ } else {
+ // Apply superres for the current superblock row (except for the last
+ // 8-|num_rows_extra| rows).
+ int plane = kPlaneY;
+ do {
+ const ptrdiff_t row_offset =
+ (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) *
+ frame_buffer_.stride(plane);
+ src[plane] = cdef_buffer_[plane] + row_offset;
+ dst[plane] = superres_buffer_[plane] + row_offset;
+ // Note that the |num_rows_extra| addition is done after the value is
+ // subsampled since we always need to work on |num_rows_extra| extra rows
+ // irrespective of the plane subsampling.
+ rows[plane] = (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
+ (is_last_row ? 0 : num_rows_extra);
+ } while (++plane < planes_);
+ }
+ ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst);
+}
+
+void PostFilter::ApplySuperResThreaded() {
+ int num_threads = thread_pool_->num_threads() + 1;
+ // The number of rows that will be processed by each thread in the thread pool
+ // (other than the current thread).
+ int thread_pool_rows = frame_header_.height / num_threads;
+ thread_pool_rows = std::max(thread_pool_rows, 1);
+ // Make rows of Y plane even when there is subsampling for the other planes.
+ if ((thread_pool_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
+ ++thread_pool_rows;
+ }
+ // Adjust the number of threads to what we really need.
+ num_threads = Clip3(frame_header_.height / thread_pool_rows, 1, num_threads);
+ // For the current thread, we round up to process all the remaining rows.
+ int current_thread_rows =
+ frame_header_.height - thread_pool_rows * (num_threads - 1);
+ // Make rows of Y plane even when there is subsampling for the other planes.
+ if ((current_thread_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
+ ++current_thread_rows;
+ }
+ assert(current_thread_rows > 0);
+ BlockingCounter pending_workers(num_threads - 1);
+ for (int line_buffer_row = 0, row_start = 0; line_buffer_row < num_threads;
+ ++line_buffer_row, row_start += thread_pool_rows) {
+ std::array<uint8_t*, kMaxPlanes> src;
+ std::array<uint8_t*, kMaxPlanes> dst;
+ std::array<int, kMaxPlanes> rows;
+ int plane = kPlaneY;
+ const int pixel_size_log2 = pixel_size_log2_;
+ do {
+ src[plane] =
+ GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+ static_cast<Plane>(plane), row_start, 0);
+ dst[plane] =
+ GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane),
+ static_cast<Plane>(plane), row_start, 0);
+ rows[plane] =
+ (((line_buffer_row < num_threads - 1) ? thread_pool_rows
+ : current_thread_rows) >>
+ subsampling_y_[plane]) -
+ 1;
+ const int plane_width =
+ MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
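+ // Save the last input row of this band up front: the bands are upscaled
+ // in place and run concurrently, so a neighboring worker could overwrite
+ // this row before it is read. |rows[plane]| excludes it; it is upscaled
+ // separately via |superres_line_buffer_|.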
+ uint8_t* const input =
+ src[plane] + rows[plane] * frame_buffer_.stride(plane);
+ uint8_t* const line_buffer_start =
+ superres_line_buffer_.data(plane) +
+ line_buffer_row * superres_line_buffer_.stride(plane) +
+ (kSuperResHorizontalBorder << pixel_size_log2);
+ memcpy(line_buffer_start, input, plane_width << pixel_size_log2);
+ } while (++plane < planes_);
+ if (line_buffer_row < num_threads - 1) {
+ thread_pool_->Schedule(
+ [this, src, rows, line_buffer_row, dst, &pending_workers]() {
+ ApplySuperRes(src, rows, line_buffer_row, dst);
+ pending_workers.Decrement();
+ });
+ } else {
+ ApplySuperRes(src, rows, line_buffer_row, dst);
+ }
+ }
+ // Wait for the threadpool jobs to finish.
+ pending_workers.Wait();
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/post_filter.h"
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/cdef.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/super_res.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/obu_parser.h"
+#include "src/threading_strategy.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr char kCdef[] = "Cdef";
+constexpr char kApplyCdefName[] = "ApplyCdef";
+constexpr int kMaxBlockWidth4x4 = 32;
+constexpr int kMaxBlockHeight4x4 = 32;
+constexpr int kMaxTestFrameSize = 1920 * 1080;
+
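+// Maps the input parameters to an index into the expected-digest tables:
+// four height buckets within each (subsampling_x, subsampling_y) group.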
+int GetIdFromInputParam(int subsampling_x, int subsampling_y, int height) {
+ int id = subsampling_x * 8 + subsampling_y * 4;
+ if (height == 288) {
+ id += 0;
+ } else if (height == 480) {
+ id += 1;
+ } else if (height == 1080) {
+ id += 2;
+ } else {
+ id += 3;
+ }
+ return id;
+}
+
+const char* GetSuperResDigest8bpp(int id, int plane) {
+ static const char* const kDigestSuperRes[][kMaxPlanes] = {
+ {
+ // all input is 0.
+ "ff5f7a63d3b1f9176e216eb01a0387ad", // kPlaneY.
+ "38b6551d7ac3e86c8af407d5a1aa36dc", // kPlaneU.
+ "38b6551d7ac3e86c8af407d5a1aa36dc", // kPlaneV.
+ },
+ {
+ // all input is 1.
+ "819f21dcce0e779180bbd613a9e3543c", // kPlaneY.
+ "e784bfa8f517d83b014c3dcd45b780a5", // kPlaneU.
+ "e784bfa8f517d83b014c3dcd45b780a5", // kPlaneV.
+ },
+ {
+ // all input is 128.
+ "2d6ea5b39f9168d56c2e2b8846d208ec", // kPlaneY.
+ "8030b6e70f1544efbc37b902d3f88bd3", // kPlaneU.
+ "8030b6e70f1544efbc37b902d3f88bd3", // kPlaneV.
+ },
+ {
+ // all input is 255.
+ "5c0b4bc50e0980dc6ba7c042d3b50a5e", // kPlaneY.
+ "3c566ef847c45be09ddac297123a3bad", // kPlaneU.
+ "3c566ef847c45be09ddac297123a3bad", // kPlaneV.
+ },
+ {
+ // random input.
+ "50514467dd6a5c3a8268eddaa542c41f", // kPlaneY.
+ "3ce720c2b5b44928e1477b11040e5c00", // kPlaneU.
+ "3ce720c2b5b44928e1477b11040e5c00", // kPlaneV.
+ },
+ };
+ return kDigestSuperRes[id][plane];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetSuperResDigest10bpp(int id, int plane) {
+ // Digests are in Y/U/V order.
+ static const char* const kDigestSuperRes[][kMaxPlanes] = {
+ {
+ // all input is 0.
+ "fccb1f57b252b1a86d335aea929d1d58",
+ "2f244a56091c9705794e92e6bcc38058",
+ "2f244a56091c9705794e92e6bcc38058",
+ },
+ {
+ // all input is 1.
+ "de8556204999d6e4bf74cfdde61a095b",
+ "e7d0f4ce6df81c46de95da7790a67384",
+ "e7d0f4ce6df81c46de95da7790a67384",
+ },
+ {
+ // all input is 512.
+ "d3b6980363eb9b808885537b3485af87",
+ "bcffddb26210da6861e7b31414e58b77",
+ "bcffddb26210da6861e7b31414e58b77",
+ },
+ {
+ // all input is 1023.
+ "ce0762aeee1cdef1db101e4ca39bcbd6",
+ "33aeaa7f5d7c032e3dfda43925c3dcb2",
+ "33aeaa7f5d7c032e3dfda43925c3dcb2",
+ },
+ {
+ // random input.
+ "63c701bceb187ffa535be15ae58f8171",
+ "f570e30e9ea8d2a1e6d99202cd2f8994",
+ "f570e30e9ea8d2a1e6d99202cd2f8994",
+ },
+ };
+ return kDigestSuperRes[id][plane];
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetSuperResDigest12bpp(int id, int plane) {
+ // Digests are in Y/U/V order.
+ static const char* const kDigestSuperRes[][kMaxPlanes] = {
+ {
+ // all input is 0.
+ "fccb1f57b252b1a86d335aea929d1d58",
+ "2f244a56091c9705794e92e6bcc38058",
+ "2f244a56091c9705794e92e6bcc38058",
+ },
+ {
+ // all input is 1.
+ "de8556204999d6e4bf74cfdde61a095b",
+ "e7d0f4ce6df81c46de95da7790a67384",
+ "e7d0f4ce6df81c46de95da7790a67384",
+ },
+ {
+ // all input is 2048.
+ "83d600a7b3dc9bc3f710668ee2244e6b",
+ "468eec1453edc1befeb8a346f61950a7",
+ "468eec1453edc1befeb8a346f61950a7",
+ },
+ {
+ // all input is 4095.
+ "30bdb1dfee2b02b12b38e6b9f6287e27",
+ "34d673f075d2caa93a2f648ee3569e20",
+ "34d673f075d2caa93a2f648ee3569e20",
+ },
+ {
+ // random input.
+ "f10f21f5322231d991550fce7ef9787d",
+ "a2d8b6140bd5002e86644ef433b8eb42",
+ "a2d8b6140bd5002e86644ef433b8eb42",
+ },
+ };
+ return kDigestSuperRes[id][plane];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+// This type is used to parameterize the tests, so it is defined outside the
+// anonymous namespace to avoid the GCC -Wsubobject-linkage warning.
+struct FrameSizeParam {
+ FrameSizeParam(uint32_t width, uint32_t upscaled_width, uint32_t height,
+ int8_t ss_x, int8_t ss_y)
+ : width(width),
+ upscaled_width(upscaled_width),
+ height(height),
+ subsampling_x(ss_x),
+ subsampling_y(ss_y) {}
+ uint32_t width;
+ uint32_t upscaled_width;
+ uint32_t height;
+ int8_t subsampling_x;
+ int8_t subsampling_y;
+};
+
+// Print operators must be defined in the same namespace as the type for the
+// lookup to work correctly.
+static std::ostream& operator<<(std::ostream& os, const FrameSizeParam& param) {
+ return os << param.width << "x" << param.height
+ << ", upscaled_width: " << param.upscaled_width
+ << ", subsampling(x/y): " << static_cast<int>(param.subsampling_x)
+ << "/" << static_cast<int>(param.subsampling_y);
+}
+
+// Note the following test classes access private functions/members of
+// PostFilter. To be declared friends of PostFilter they must not have internal
+// linkage (they must be outside the anonymous namespace).
+template <int bitdepth, typename Pixel>
+class PostFilterTestBase : public testing::TestWithParam<FrameSizeParam> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ PostFilterTestBase() = default;
+ PostFilterTestBase(const PostFilterTestBase&) = delete;
+ PostFilterTestBase& operator=(const PostFilterTestBase&) = delete;
+ ~PostFilterTestBase() override = default;
+
+ void SetUp() override {
+ // Allocate buffer_ with a border size of kBorderPixels (which is
+ // subsampled for chroma planes). Some tests (for loop restoration) only use
+ // the nearest 2 or 3 pixels (for both luma and chroma planes) in the
+ // border.
+ ASSERT_TRUE(buffer_.Realloc(
+ bitdepth, /*is_monochrome=*/false, frame_size_.upscaled_width,
+ frame_size_.height, frame_size_.subsampling_x,
+ frame_size_.subsampling_y, kBorderPixels, kBorderPixels, kBorderPixels,
+ kBorderPixels, nullptr, nullptr, nullptr));
+
+ ASSERT_TRUE(loop_restoration_border_.Realloc(
+ bitdepth, /*is_monochrome=*/false, frame_size_.upscaled_width,
+ frame_size_.height, frame_size_.subsampling_x,
+ frame_size_.subsampling_y, kBorderPixels, kBorderPixels, kBorderPixels,
+ kBorderPixels, nullptr, nullptr, nullptr));
+
+ for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+ const int8_t subsampling_x =
+ (plane == kPlaneY) ? 0 : frame_size_.subsampling_x;
+ const int8_t subsampling_y =
+ (plane == kPlaneY) ? 0 : frame_size_.subsampling_y;
+ width_[plane] = frame_size_.width >> subsampling_x;
+ upscaled_width_[plane] = frame_size_.upscaled_width >> subsampling_x;
+ stride_[plane] =
+ (frame_size_.upscaled_width + 2 * kBorderPixels) >> subsampling_x;
+ height_[plane] =
+ (frame_size_.height + 2 * kBorderPixels) >> subsampling_y;
+
+ reference_buffer_[plane].reserve(stride_[plane] * height_[plane]);
+ reference_buffer_[plane].resize(stride_[plane] * height_[plane]);
+ std::fill(reference_buffer_[plane].begin(),
+ reference_buffer_[plane].end(), 0);
+ }
+ }
+
+ protected:
+ YuvBuffer buffer_;
+ YuvBuffer cdef_border_;
+ YuvBuffer loop_restoration_border_;
+ uint32_t width_[kMaxPlanes];
+ uint32_t upscaled_width_[kMaxPlanes];
+ uint32_t stride_[kMaxPlanes];
+ uint32_t height_[kMaxPlanes];
+ std::vector<Pixel> reference_buffer_[kMaxPlanes];
+ const FrameSizeParam frame_size_ = GetParam();
+};
+
+template <int bitdepth, typename Pixel>
+class PostFilterHelperFuncTest : public PostFilterTestBase<bitdepth, Pixel> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ PostFilterHelperFuncTest() = default;
+ PostFilterHelperFuncTest(const PostFilterHelperFuncTest&) = delete;
+ PostFilterHelperFuncTest& operator=(const PostFilterHelperFuncTest&) = delete;
+ ~PostFilterHelperFuncTest() override = default;
+
+ protected:
+ using PostFilterTestBase<bitdepth, Pixel>::buffer_;
+ using PostFilterTestBase<bitdepth, Pixel>::cdef_border_;
+ using PostFilterTestBase<bitdepth, Pixel>::loop_restoration_border_;
+ using PostFilterTestBase<bitdepth, Pixel>::width_;
+ using PostFilterTestBase<bitdepth, Pixel>::upscaled_width_;
+ using PostFilterTestBase<bitdepth, Pixel>::stride_;
+ using PostFilterTestBase<bitdepth, Pixel>::height_;
+ using PostFilterTestBase<bitdepth, Pixel>::reference_buffer_;
+ using PostFilterTestBase<bitdepth, Pixel>::frame_size_;
+
+ void SetUp() override {
+ PostFilterTestBase<bitdepth, Pixel>::SetUp();
+
+ for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+ const int8_t subsampling_x =
+ (plane == kPlaneY) ? 0 : frame_size_.subsampling_x;
+ const int8_t subsampling_y =
+ (plane == kPlaneY) ? 0 : frame_size_.subsampling_y;
+ width_[plane] = frame_size_.width >> subsampling_x;
+ upscaled_width_[plane] = frame_size_.upscaled_width >> subsampling_x;
+ stride_[plane] = (frame_size_.upscaled_width >> subsampling_x) +
+ 2 * kRestorationHorizontalBorder;
+ height_[plane] = (frame_size_.height >> subsampling_y) +
+ 2 * kRestorationVerticalBorder;
+ reference_buffer_[plane].reserve(stride_[plane] * height_[plane]);
+ reference_buffer_[plane].resize(stride_[plane] * height_[plane]);
+ std::fill(reference_buffer_[plane].begin(),
+ reference_buffer_[plane].end(), 0);
+ buffer_border_corner_[plane] =
+ reinterpret_cast<Pixel*>(buffer_.data(plane)) -
+ buffer_.stride(plane) / sizeof(Pixel) * kRestorationVerticalBorder -
+ kRestorationHorizontalBorder;
+ loop_restoration_border_corner_[plane] =
+ reinterpret_cast<Pixel*>(loop_restoration_border_.data(plane)) -
+ loop_restoration_border_.stride(plane) / sizeof(Pixel) *
+ kRestorationVerticalBorder -
+ kRestorationHorizontalBorder;
+ }
+ }
+
+ void TestExtendFrame(bool use_fixed_values, Pixel value);
+ void TestAdjustFrameBufferPointer();
+ void TestPrepareLoopRestorationBlock();
+
+ // Fill the frame buffer with either a fixed value or random values.
+ // When filling with random values, the buffer boundaries get special
+ // treatment: the outermost 3 pixel wide borders take the same value as
+ // their immediate inner neighbor. For example:
+ // 4 4 4 4 5 6 6 6 6
+ // 4 4 4 4 5 6 6 6 6
+ // 4 4 4 4 5 6 6 6 6
+ // ---------
+ // 4 4 4 | 4 5 6 | 6 6 6
+ // 1 1 1 | 1 0 1 | 1 1 1
+ // 0 0 0 | 0 1 0 | 0 0 0
+ // 1 1 1 | 1 0 1 | 1 1 1
+ // 0 0 0 | 0 1 0 | 0 0 0
+ // 6 6 6 | 6 5 4 | 4 4 4
+ // -------
+ // 6 6 6 6 5 4 4 4 4
+ // 6 6 6 6 5 4 4 4 4
+ // 6 6 6 6 5 4 4 4 4
+ // Pixels within the box form the current block; outside is the area
+ // extended from it.
+ void FillBuffer(bool use_fixed_values, Pixel value);
+
+ // Points to the upper left corner of the restoration border in buffer_.
+ Pixel* buffer_border_corner_[kMaxPlanes];
+ // Points to the upper left corner of the restoration border in
+ // loop_restoration_border_.
+ Pixel* loop_restoration_border_corner_[kMaxPlanes];
+};
+
+template <int bitdepth, typename Pixel>
+void PostFilterHelperFuncTest<bitdepth, Pixel>::FillBuffer(
+ bool use_fixed_values, Pixel value) {
+ if (use_fixed_values) {
+ for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+ // Fill buffer with a fixed value.
+ std::fill(reference_buffer_[plane].begin(),
+ reference_buffer_[plane].end(), value);
+ // Fill frame buffer. Note that the border is not filled.
+ auto* row = reinterpret_cast<Pixel*>(buffer_.data(plane));
+ for (int i = 0; i < buffer_.height(plane); ++i) {
+ std::fill(row, row + width_[plane], value);
+ row += buffer_.stride(plane) / sizeof(Pixel);
+ }
+ }
+ } else { // Random value.
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ const int mask = (1 << bitdepth) - 1;
+ for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+ // Fill buffer with random values.
+ std::vector<Pixel> line_buffer(stride_[plane]);
+ std::fill(line_buffer.begin(), line_buffer.end(), 0);
+ for (int i = kRestorationHorizontalBorder;
+ i < stride_[plane] - kRestorationHorizontalBorder; ++i) {
+ line_buffer[i] = rnd.Rand16() & mask;
+ }
+ // Copy boundary values to extended border.
+ for (int i = 0; i < kRestorationHorizontalBorder; ++i) {
+ line_buffer[i] = line_buffer[kRestorationHorizontalBorder];
+ line_buffer[stride_[plane] - i - 1] =
+ line_buffer[stride_[plane] - 1 - kRestorationHorizontalBorder];
+ }
+ // The first three rows are the same as the line_buffer.
+ for (int i = 0; i < kRestorationVerticalBorder + 1; ++i) {
+ std::copy(line_buffer.begin(), line_buffer.end(),
+ reference_buffer_[plane].begin() + i * stride_[plane]);
+ }
+ for (int i = kRestorationVerticalBorder + 1;
+ i < height_[plane] - kRestorationVerticalBorder; ++i) {
+ for (int j = kRestorationHorizontalBorder;
+ j < stride_[plane] - kRestorationHorizontalBorder; ++j) {
+ line_buffer[j] = rnd.Rand16() & mask;
+ }
+ for (int j = 0; j < kRestorationHorizontalBorder; ++j) {
+ line_buffer[j] = line_buffer[kRestorationHorizontalBorder];
+ line_buffer[stride_[plane] - j - 1] =
+ line_buffer[stride_[plane] - 1 - kRestorationHorizontalBorder];
+ }
+ std::copy(line_buffer.begin(), line_buffer.end(),
+ reference_buffer_[plane].begin() + i * stride_[plane]);
+ }
+ // The extended border are the same as the line_buffer.
+ for (int i = 0; i < kRestorationVerticalBorder; ++i) {
+ std::copy(line_buffer.begin(), line_buffer.end(),
+ reference_buffer_[plane].begin() +
+ (height_[plane] - kRestorationVerticalBorder + i) *
+ stride_[plane]);
+ }
+
+ // Fill frame buffer. Note that the border is not filled.
+ for (int i = 0; i < buffer_.height(plane); ++i) {
+ memcpy(buffer_.data(plane) + i * buffer_.stride(plane),
+ reference_buffer_[plane].data() + kRestorationHorizontalBorder +
+ (i + kRestorationVerticalBorder) * stride_[plane],
+ sizeof(Pixel) * width_[plane]);
+ }
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void PostFilterHelperFuncTest<bitdepth, Pixel>::TestExtendFrame(
+ bool use_fixed_values, Pixel value) {
+ ObuFrameHeader frame_header = {};
+ frame_header.upscaled_width = frame_size_.upscaled_width;
+ frame_header.width = frame_size_.width;
+ frame_header.height = frame_size_.height;
+ ObuSequenceHeader sequence_header;
+ sequence_header.color_config.bitdepth = bitdepth;
+ sequence_header.color_config.is_monochrome = false;
+ sequence_header.color_config.subsampling_x = frame_size_.subsampling_x;
+ sequence_header.color_config.subsampling_y = frame_size_.subsampling_y;
+
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ FrameScratchBuffer frame_scratch_buffer;
+
+ PostFilter post_filter(frame_header, sequence_header, &frame_scratch_buffer,
+ &buffer_, dsp,
+ /*do_post_filter_mask=*/0x00);
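+ // A do_post_filter_mask of 0 disables all post filtering; ExtendFrame()
+ // is exercised directly below.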
+ FillBuffer(use_fixed_values, value);
+ for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+ const int plane_width =
+ plane == kPlaneY ? frame_header.upscaled_width
+ : frame_header.upscaled_width >>
+ sequence_header.color_config.subsampling_x;
+ const int plane_height =
+ plane == kPlaneY
+ ? frame_header.height
+ : frame_header.height >> sequence_header.color_config.subsampling_y;
+ PostFilter::ExtendFrame<Pixel>(
+ reinterpret_cast<Pixel*>(buffer_.data(plane)), plane_width,
+ plane_height, buffer_.stride(plane) / sizeof(Pixel),
+ kRestorationHorizontalBorder, kRestorationHorizontalBorder,
+ kRestorationVerticalBorder, kRestorationVerticalBorder);
+ const bool success = test_utils::CompareBlocks<Pixel>(
+ buffer_border_corner_[plane], reference_buffer_[plane].data(),
+ stride_[plane], height_[plane], buffer_.stride(plane) / sizeof(Pixel),
+ stride_[plane], /*check_padding=*/false, /*print_diff=*/false);
+ ASSERT_TRUE(success) << "Failure of extend frame at plane: " << plane;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+class PostFilterSuperResTest : public PostFilterTestBase<bitdepth, Pixel> {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ PostFilterSuperResTest() {
+ test_utils::ResetDspTable(bitdepth);
+ dsp::SuperResInit_C();
+ dsp::SuperResInit_SSE4_1();
+ dsp::SuperResInit_NEON();
+ }
+ PostFilterSuperResTest(const PostFilterSuperResTest&) = delete;
+ PostFilterSuperResTest& operator=(const PostFilterSuperResTest&) = delete;
+ ~PostFilterSuperResTest() override = default;
+
+ protected:
+ using PostFilterTestBase<bitdepth, Pixel>::buffer_;
+ using PostFilterTestBase<bitdepth, Pixel>::width_;
+ using PostFilterTestBase<bitdepth, Pixel>::upscaled_width_;
+ using PostFilterTestBase<bitdepth, Pixel>::stride_;
+ using PostFilterTestBase<bitdepth, Pixel>::height_;
+ using PostFilterTestBase<bitdepth, Pixel>::reference_buffer_;
+ using PostFilterTestBase<bitdepth, Pixel>::frame_size_;
+
+ void TestApplySuperRes(bool use_fixed_values, Pixel value, int id,
+ bool multi_threaded);
+};
+
+// This class must be in namespace libgav1 to access the private members of
+// class PostFilter in src/post_filter.h.
+template <int bitdepth, typename Pixel>
+void PostFilterSuperResTest<bitdepth, Pixel>::TestApplySuperRes(
+ bool use_fixed_values, Pixel value, int id, bool multi_threaded) {
+ ObuFrameHeader frame_header = {};
+ frame_header.width = frame_size_.width;
+ frame_header.upscaled_width = frame_size_.upscaled_width;
+ frame_header.height = frame_size_.height;
+ frame_header.rows4x4 = DivideBy4(frame_size_.height);
+ frame_header.columns4x4 = DivideBy4(frame_size_.width);
+ frame_header.tile_info.tile_count = 1;
+ ObuSequenceHeader sequence_header;
+ sequence_header.color_config.bitdepth = bitdepth;
+ sequence_header.color_config.is_monochrome = false;
+ sequence_header.color_config.subsampling_x = frame_size_.subsampling_x;
+ sequence_header.color_config.subsampling_y = frame_size_.subsampling_y;
+
+ // Apply SuperRes.
+ Array2D<int16_t> cdef_index;
+ Array2D<TransformSize> inter_transform_sizes;
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ constexpr int kNumThreads = 4;
+ FrameScratchBuffer frame_scratch_buffer;
+ if (multi_threaded) {
+ ASSERT_TRUE(frame_scratch_buffer.threading_strategy.Reset(frame_header,
+ kNumThreads));
+ }
+ const int pixel_size = sequence_header.color_config.bitdepth == 8
+ ? sizeof(uint8_t)
+ : sizeof(uint16_t);
+ ASSERT_TRUE(frame_scratch_buffer.superres_coefficients[kPlaneTypeY].Resize(
+ kSuperResFilterTaps * Align(frame_header.upscaled_width, 16) *
+ pixel_size));
+ if (!sequence_header.color_config.is_monochrome &&
+ sequence_header.color_config.subsampling_x != 0) {
+ ASSERT_TRUE(frame_scratch_buffer.superres_coefficients[kPlaneTypeUV].Resize(
+ kSuperResFilterTaps *
+ Align(SubsampledValue(frame_header.upscaled_width, 1), 16) *
+ pixel_size));
+ }
+ ASSERT_TRUE(frame_scratch_buffer.superres_line_buffer.Realloc(
+ sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ MultiplyBy4(frame_header.columns4x4), (multi_threaded ? kNumThreads : 1),
+ sequence_header.color_config.subsampling_x,
+ /*subsampling_y=*/0, 2 * kSuperResHorizontalBorder,
+ 2 * (kSuperResHorizontalBorder + kSuperResHorizontalPadding), 0, 0,
+ nullptr, nullptr, nullptr));
+ PostFilter post_filter(frame_header, sequence_header, &frame_scratch_buffer,
+ &buffer_, dsp,
+ /*do_post_filter_mask=*/0x04);
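+ // Bit 2 (0x04) of do_post_filter_mask enables only the SuperRes pass;
+ // the other post filters are skipped.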
+
+ const int num_planes = sequence_header.color_config.is_monochrome
+ ? kMaxPlanesMonochrome
+ : kMaxPlanes;
+ int width[kMaxPlanes];
+ int upscaled_width[kMaxPlanes];
+ int height[kMaxPlanes];
+
+ for (int plane = kPlaneY; plane < num_planes; ++plane) {
+ const int8_t subsampling_x =
+ (plane == kPlaneY) ? 0 : frame_size_.subsampling_x;
+ const int8_t subsampling_y =
+ (plane == kPlaneY) ? 0 : frame_size_.subsampling_y;
+ width[plane] = frame_size_.width >> subsampling_x;
+ upscaled_width[plane] = frame_size_.upscaled_width >> subsampling_x;
+ height[plane] = frame_size_.height >> subsampling_y;
+ if (use_fixed_values) {
+ auto* src = reinterpret_cast<Pixel*>(post_filter.cdef_buffer_[plane]);
+ for (int y = 0; y < height[plane]; ++y) {
+ for (int x = 0; x < width[plane]; ++x) {
+ src[x] = value;
+ }
+ src += buffer_.stride(plane) / sizeof(Pixel);
+ }
+ } else { // Random input.
+ const int mask = (1 << bitdepth) - 1;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ auto* src = reinterpret_cast<Pixel*>(post_filter.cdef_buffer_[plane]);
+ for (int y = 0; y < height[plane]; ++y) {
+ for (int x = 0; x < width[plane]; ++x) {
+ src[x] = rnd.Rand16() & mask;
+ }
+ src += buffer_.stride(plane) / sizeof(Pixel);
+ }
+ }
+ }
+
+ if (multi_threaded) {
+ post_filter.ApplySuperResThreaded();
+ } else {
+ std::array<uint8_t*, kMaxPlanes> buffers = {
+ post_filter.cdef_buffer_[kPlaneY], post_filter.cdef_buffer_[kPlaneU],
+ post_filter.cdef_buffer_[kPlaneV]};
+ std::array<uint8_t*, kMaxPlanes> dst = {
+ post_filter.GetSuperResBuffer(static_cast<Plane>(kPlaneY), 0, 0),
+ post_filter.GetSuperResBuffer(static_cast<Plane>(kPlaneU), 0, 0),
+ post_filter.GetSuperResBuffer(static_cast<Plane>(kPlaneV), 0, 0)};
+ std::array<int, kMaxPlanes> rows = {
+ frame_header.rows4x4 * 4,
+ (frame_header.rows4x4 * 4) >> frame_size_.subsampling_y,
+ (frame_header.rows4x4 * 4) >> frame_size_.subsampling_y};
+ post_filter.ApplySuperRes(buffers, rows, /*line_buffer_row=*/-1, dst);
+ }
+
+ // Check MD5 digests.
+ std::vector<Pixel> output;
+ for (int plane = kPlaneY; plane < num_planes; ++plane) {
+ output.resize(upscaled_width[plane] * height[plane]);
+ auto* dst = reinterpret_cast<Pixel*>(
+ post_filter.GetSuperResBuffer(static_cast<Plane>(plane), 0, 0));
+ for (int y = 0; y < height[plane]; ++y) {
+ for (int x = 0; x < upscaled_width[plane]; ++x) {
+ output[y * upscaled_width[plane] + x] = dst[x];
+ }
+ dst += buffer_.stride(plane) / sizeof(Pixel);
+ }
+ const std::string digest = test_utils::GetMd5Sum(
+ output.data(), upscaled_width[plane] * height[plane] * sizeof(Pixel));
+ printf("MD5: %s\n", digest.c_str());
+ const char* expected_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ expected_digest = GetSuperResDigest8bpp(id, plane);
+ break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+ expected_digest = GetSuperResDigest10bpp(id, plane);
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ expected_digest = GetSuperResDigest12bpp(id, plane);
+ break;
+#endif
+ }
+ ASSERT_NE(expected_digest, nullptr);
+ EXPECT_STREQ(digest.c_str(), expected_digest);
+ }
+}
+
+using PostFilterSuperResTest8bpp = PostFilterSuperResTest<8, uint8_t>;
+
+const FrameSizeParam kTestParamSuperRes[] = {
+ FrameSizeParam(176, 352, 288, 1, 1)};
+
+TEST_P(PostFilterSuperResTest8bpp, ApplySuperRes) {
+ TestApplySuperRes(true, 0, 0, false);
+ TestApplySuperRes(true, 1, 1, false);
+ TestApplySuperRes(true, 128, 2, false);
+ TestApplySuperRes(true, 255, 3, false);
+ TestApplySuperRes(false, 0, 4, false);
+}
+
+TEST_P(PostFilterSuperResTest8bpp, ApplySuperResThreaded) {
+ TestApplySuperRes(true, 0, 0, true);
+ TestApplySuperRes(true, 1, 1, true);
+ TestApplySuperRes(true, 128, 2, true);
+ TestApplySuperRes(true, 255, 3, true);
+ TestApplySuperRes(false, 0, 4, true);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterSuperResTestInstance,
+ PostFilterSuperResTest8bpp,
+ testing::ValuesIn(kTestParamSuperRes));
+
+using PostFilterHelperFuncTest8bpp = PostFilterHelperFuncTest<8, uint8_t>;
+
+const FrameSizeParam kTestParamExtendFrame[] = {
+ FrameSizeParam(16, 16, 16, 1, 1),
+ FrameSizeParam(64, 64, 64, 1, 1),
+ FrameSizeParam(128, 128, 64, 1, 1),
+ FrameSizeParam(64, 64, 128, 1, 1),
+ FrameSizeParam(352, 352, 288, 1, 1),
+ FrameSizeParam(720, 720, 480, 1, 1),
+ FrameSizeParam(1080, 1080, 720, 1, 1),
+ FrameSizeParam(16, 16, 16, 0, 0),
+ FrameSizeParam(64, 64, 64, 0, 0),
+ FrameSizeParam(128, 128, 64, 0, 0),
+ FrameSizeParam(64, 64, 128, 0, 0),
+ FrameSizeParam(352, 352, 288, 0, 0),
+ FrameSizeParam(720, 720, 480, 0, 0),
+ FrameSizeParam(1080, 1080, 720, 0, 0)};
+
+TEST_P(PostFilterHelperFuncTest8bpp, ExtendFrame) {
+ TestExtendFrame(true, 0);
+ TestExtendFrame(true, 1);
+ TestExtendFrame(true, 128);
+ TestExtendFrame(true, 255);
+ TestExtendFrame(false, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterHelperFuncTestInstance,
+ PostFilterHelperFuncTest8bpp,
+ testing::ValuesIn(kTestParamExtendFrame));
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using PostFilterSuperResTest10bpp = PostFilterSuperResTest<10, uint16_t>;
+
+TEST_P(PostFilterSuperResTest10bpp, ApplySuperRes) {
+ TestApplySuperRes(true, 0, 0, false);
+ TestApplySuperRes(true, 1, 1, false);
+ TestApplySuperRes(true, 1 << 9, 2, false);
+ TestApplySuperRes(true, (1 << 10) - 1, 3, false);
+ TestApplySuperRes(false, 0, 4, false);
+}
+
+TEST_P(PostFilterSuperResTest10bpp, ApplySuperResThreaded) {
+ TestApplySuperRes(true, 0, 0, true);
+ TestApplySuperRes(true, 1, 1, true);
+ TestApplySuperRes(true, 1 << 9, 2, true);
+ TestApplySuperRes(true, (1 << 10) - 1, 3, true);
+ TestApplySuperRes(false, 0, 4, true);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterSuperResTestInstance,
+ PostFilterSuperResTest10bpp,
+ testing::ValuesIn(kTestParamSuperRes));
+
+using PostFilterHelperFuncTest10bpp = PostFilterHelperFuncTest<10, uint16_t>;
+
+TEST_P(PostFilterHelperFuncTest10bpp, ExtendFrame) {
+ TestExtendFrame(true, 0);
+ TestExtendFrame(true, 1);
+ TestExtendFrame(true, 255);
+ TestExtendFrame(true, (1 << 10) - 1);
+ TestExtendFrame(false, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterHelperFuncTestInstance,
+ PostFilterHelperFuncTest10bpp,
+ testing::ValuesIn(kTestParamExtendFrame));
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using PostFilterSuperResTest12bpp = PostFilterSuperResTest<12, uint16_t>;
+
+TEST_P(PostFilterSuperResTest12bpp, ApplySuperRes) {
+ TestApplySuperRes(true, 0, 0, false);
+ TestApplySuperRes(true, 1, 1, false);
+ TestApplySuperRes(true, 1 << 11, 2, false);
+ TestApplySuperRes(true, (1 << 12) - 1, 3, false);
+ TestApplySuperRes(false, 0, 4, false);
+}
+
+TEST_P(PostFilterSuperResTest12bpp, ApplySuperResThreaded) {
+ TestApplySuperRes(true, 0, 0, true);
+ TestApplySuperRes(true, 1, 1, true);
+ TestApplySuperRes(true, 1 << 11, 2, true);
+ TestApplySuperRes(true, (1 << 12) - 1, 3, true);
+ TestApplySuperRes(false, 0, 4, true);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterSuperResTestInstance,
+ PostFilterSuperResTest12bpp,
+ testing::ValuesIn(kTestParamSuperRes));
+
+using PostFilterHelperFuncTest12bpp = PostFilterHelperFuncTest<12, uint16_t>;
+
+TEST_P(PostFilterHelperFuncTest12bpp, ExtendFrame) {
+ TestExtendFrame(true, 0);
+ TestExtendFrame(true, 1);
+ TestExtendFrame(true, 255);
+ TestExtendFrame(true, (1 << 12) - 1);
+ TestExtendFrame(false, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterHelperFuncTestInstance,
+ PostFilterHelperFuncTest12bpp,
+ testing::ValuesIn(kTestParamExtendFrame));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+namespace {
+
+const char* GetDigestApplyCdef8bpp(int id) {
+ static const char* const kDigest[] = {
+ "9593af24f9c6faecce53437f6e128edf", "ecb633cc2ecd6e7e0cf39d4439f4a6ea",
+ "9ec4cb4124f0a686a7bda72b447f5b8e", "7ebd859a23162bc864a69dbea60bc687",
+ "de7a15fc00664692a794aa68cf695980", "cf3fc8fe041f68d31ab4e34ad3643541",
+ "94c116b191b0268cf7ab4a0e6996e1ec", "1ad60c943a5a914aba7bc26706620a05",
+ "ce33c6f80e3608c4d18c49be2e393c20", "e140586ffc663798b74b8f6fb5b44736",
+ "b7379bba8bcb97f09a74655f4e0eee91", "02ce174061c98babd3987461b3984e47",
+ "64655dd1dfba8317e27d2fdcb211b7b4", "eeb6a61c70c5ee75a4c31dc5099b4dfb",
+ "ee944b31148fa2e30938084f7c046464", "db7b63497750fa4c51cf45c56a2da01c",
+ };
+ return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigestApplyCdef10bpp(int id) {
+ static const char* const kDigest[] = {
+ "53f8d68ac7f3aea65151b2066f8501c9", "021e70d5406fa182dd9713380eb66d1d",
+ "bab1c84e7f06b87d81617d2d0a194b89", "58e302ff0522f64901909fb97535b270",
+ "5ff95a6a798eadc7207793c03d898ce4", "1483d28cc0f1bfffedd1128966719aa0",
+ "6af5a36890b465ae962c2878af874f70", "bd1ed4a2ff09d323ab98190d1805a010",
+ "5ff95a6a798eadc7207793c03d898ce4", "1483d28cc0f1bfffedd1128966719aa0",
+ "6af5a36890b465ae962c2878af874f70", "bd1ed4a2ff09d323ab98190d1805a010",
+ "6f0299645cd6f0655fd26044cd43a37c", "56d7febf5bbebdc82e8f157ab926a0bb",
+ "f54654f11006453f496be5883216a3bb", "9abc6e3230792ba78bcc65504a62075e",
+ };
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigestApplyCdef12bpp(int id) {
+ static const char* const kDigest[] = {
+ "06e2d09b6ce3924f3b5d4c00ab76eea5", "287240e4b13cb75e17932a3dd7ba3b3c",
+ "265da123e3347c4fb3e434f26a3949e7", "e032ce6eb76242df6894482ac6688406",
+ "f648328221f0f02a5b7fc3d55a66271a", "8f759aa84a110902025dacf8062d2f6a",
+ "592b49e4b993d6b4634d8eb1ee3bba54", "29a3e8e329ec70d06910e982ea763e6b",
+ "f648328221f0f02a5b7fc3d55a66271a", "8f759aa84a110902025dacf8062d2f6a",
+ "592b49e4b993d6b4634d8eb1ee3bba54", "29a3e8e329ec70d06910e982ea763e6b",
+ "155dd4283f8037f86cce34b6cfe67a7e", "0a022c70ead199517af9bad2002d70cd",
+ "a966dfea52a7a2084545f68b2c9e1735", "e098438a23a7c9f276e594b98b2db922",
+ };
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace
+
+template <int bitdepth, typename Pixel>
+class PostFilterApplyCdefTest : public testing::TestWithParam<FrameSizeParam>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
+ PostFilterApplyCdefTest() = default;
+ PostFilterApplyCdefTest(const PostFilterApplyCdefTest&) = delete;
+ PostFilterApplyCdefTest& operator=(const PostFilterApplyCdefTest&) = delete;
+ ~PostFilterApplyCdefTest() override = default;
+
+ protected:
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ dsp::CdefInit_C();
+ dsp::CdefInit_SSE4_1();
+ dsp::CdefInit_NEON();
+
+ dsp_ = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp_, nullptr);
+ }
+
+ // Sets sequence_header_, frame_header_, and the cdef_index and cdef_skip
+ // buffers in frame_scratch_buffer_.
+ // Allocates yuv_buffer_ but does not set it.
+ void SetInput(libvpx_test::ACMRandom* rnd);
+ // Sets yuv_buffer_.
+ void SetInputBuffer(libvpx_test::ACMRandom* rnd, PostFilter* post_filter);
+ void CopyFilterOutputToDestBuffer();
+ void TestMultiThread(int num_threads);
+
+ ObuSequenceHeader sequence_header_;
+ ObuFrameHeader frame_header_ = {};
+ FrameScratchBuffer frame_scratch_buffer_;
+ YuvBuffer yuv_buffer_;
+ const dsp::Dsp* dsp_;
+ FrameSizeParam param_ = GetParam();
+ Pixel dest_[kMaxTestFrameSize * kMaxPlanes];
+ const size_t y_size_ = param_.width * param_.height;
+ const size_t uv_size_ = y_size_ >>
+ (param_.subsampling_x + param_.subsampling_y);
+ const size_t size_ = y_size_ + uv_size_ * 2;
+};
+
+template <int bitdepth, typename Pixel>
+void PostFilterApplyCdefTest<bitdepth, Pixel>::SetInput(
+ libvpx_test::ACMRandom* rnd) {
+ sequence_header_.color_config.bitdepth = bitdepth;
+ sequence_header_.color_config.subsampling_x = param_.subsampling_x;
+ sequence_header_.color_config.subsampling_y = param_.subsampling_y;
+ sequence_header_.color_config.is_monochrome = false;
+ sequence_header_.use_128x128_superblock =
+ static_cast<bool>(rnd->Rand16() & 1);
+
+ ASSERT_TRUE(param_.width <= param_.upscaled_width);
+ ASSERT_TRUE(param_.upscaled_width * param_.height <= kMaxTestFrameSize)
+ << "Please adjust the max frame size.";
+
+ frame_header_.width = param_.width;
+ frame_header_.upscaled_width = param_.upscaled_width;
+ frame_header_.height = param_.height;
+ frame_header_.columns4x4 = DivideBy4(Align(frame_header_.width, 8));
+ frame_header_.rows4x4 = DivideBy4(Align(frame_header_.height, 8));
+ frame_header_.tile_info.tile_count = 1;
+ frame_header_.refresh_frame_flags = 0;
+ Cdef* const cdef = &frame_header_.cdef;
+ const int coeff_shift = bitdepth - 8;
+ do {
+ cdef->damping = (rnd->Rand16() & 3) + 3 + coeff_shift;
+ cdef->bits = rnd->Rand16() & 3;
+ } while (cdef->bits <= 0);
+ for (int i = 0; i < (1 << cdef->bits); ++i) {
+ cdef->y_primary_strength[i] = (rnd->Rand16() & 15) << coeff_shift;
+ cdef->y_secondary_strength[i] = rnd->Rand16() & 3;
+ if (cdef->y_secondary_strength[i] == 3) {
+ ++cdef->y_secondary_strength[i];
+ }
+ cdef->y_secondary_strength[i] <<= coeff_shift;
+ cdef->uv_primary_strength[i] = (rnd->Rand16() & 15) << coeff_shift;
+ cdef->uv_secondary_strength[i] = rnd->Rand16() & 3;
+ if (cdef->uv_secondary_strength[i] == 3) {
+ ++cdef->uv_secondary_strength[i];
+ }
+ cdef->uv_secondary_strength[i] <<= coeff_shift;
+ }
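+ // The 3 -> 4 bumps above mirror the AV1 semantics, where a coded
+ // secondary strength of 3 denotes an actual strength of 4 (the valid
+ // strengths are 0, 1, 2 and 4).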
+
+ const int rows64x64 = DivideBy16(frame_header_.rows4x4 + kMaxBlockHeight4x4);
+ const int columns64x64 =
+ DivideBy16(frame_header_.columns4x4 + kMaxBlockWidth4x4);
+ ASSERT_TRUE(frame_scratch_buffer_.cdef_index.Reset(rows64x64, columns64x64));
+ for (int row = 0; row < rows64x64; ++row) {
+ for (int column = 0; column < columns64x64; ++column) {
+ frame_scratch_buffer_.cdef_index[row][column] =
+ rnd->Rand16() & ((1 << cdef->bits) - 1);
+ }
+ }
+
+ const int skip_rows = DivideBy2(frame_header_.rows4x4 + kMaxBlockHeight4x4);
+ const int skip_columns =
+ DivideBy16(frame_header_.columns4x4 + kMaxBlockWidth4x4);
+ ASSERT_TRUE(frame_scratch_buffer_.cdef_skip.Reset(skip_rows, skip_columns));
+ for (int row = 0; row < skip_rows; ++row) {
+ memset(frame_scratch_buffer_.cdef_skip[row], 0xFF, skip_columns);
+ }
+
+ ASSERT_TRUE(yuv_buffer_.Realloc(
+ sequence_header_.color_config.bitdepth,
+ sequence_header_.color_config.is_monochrome, frame_header_.upscaled_width,
+ frame_header_.height, sequence_header_.color_config.subsampling_x,
+ sequence_header_.color_config.subsampling_y, kBorderPixels, kBorderPixels,
+ kBorderPixels, kBorderPixels, nullptr, nullptr, nullptr))
+ << "Failed to allocate source buffer.";
+}
+
+template <int bitdepth, typename Pixel>
+void PostFilterApplyCdefTest<bitdepth, Pixel>::SetInputBuffer(
+ libvpx_test::ACMRandom* rnd, PostFilter* post_filter) {
+ for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+ const int subsampling_x = (plane == 0) ? 0 : param_.subsampling_x;
+ const int subsampling_y = (plane == 0) ? 0 : param_.subsampling_y;
+ const int plane_width =
+ MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
+ const int plane_height =
+ MultiplyBy4(frame_header_.rows4x4) >> subsampling_y;
+ auto* src =
+ reinterpret_cast<Pixel*>(post_filter->GetUnfilteredBuffer(plane));
+ const int src_stride = yuv_buffer_.stride(plane) / sizeof(src[0]);
+ for (int y = 0; y < plane_height; ++y) {
+ for (int x = 0; x < plane_width; ++x) {
+ src[x] = rnd->Rand16() & ((1 << bitdepth) - 1);
+ }
+ src += src_stride;
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void PostFilterApplyCdefTest<bitdepth, Pixel>::CopyFilterOutputToDestBuffer() {
+ for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+ const int subsampling_x = (plane == 0) ? 0 : param_.subsampling_x;
+ const int subsampling_y = (plane == 0) ? 0 : param_.subsampling_y;
+ const int plane_width = SubsampledValue(param_.width, subsampling_x);
+ const int plane_height = SubsampledValue(param_.height, subsampling_y);
+ auto* src = reinterpret_cast<Pixel*>(yuv_buffer_.data(plane));
+ const int src_stride = yuv_buffer_.stride(plane) / sizeof(src[0]);
+ Pixel* dest_plane =
+ dest_ +
+ ((plane == 0) ? 0 : ((plane == 1) ? y_size_ : y_size_ + uv_size_));
+ for (int y = 0; y < plane_height; ++y) {
+ for (int x = 0; x < plane_width; ++x) {
+ dest_plane[y * plane_width + x] = src[x];
+ }
+ src += src_stride;
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void PostFilterApplyCdefTest<bitdepth, Pixel>::TestMultiThread(
+ int num_threads) {
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ SetInput(&rnd);
+
+ ASSERT_TRUE(frame_scratch_buffer_.threading_strategy.Reset(frame_header_,
+ num_threads));
+ if (num_threads > 1) {
+ const int num_units =
+ MultiplyBy4(RightShiftWithCeiling(frame_header_.rows4x4, 4));
+ ASSERT_TRUE(frame_scratch_buffer_.cdef_border.Realloc(
+ bitdepth, /*is_monochrome=*/false,
+ MultiplyBy4(frame_header_.columns4x4), num_units,
+ sequence_header_.color_config.subsampling_x,
+ /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+ kBorderPixels, nullptr, nullptr, nullptr));
+ }
+
+ PostFilter post_filter(frame_header_, sequence_header_,
+ &frame_scratch_buffer_, &yuv_buffer_, dsp_,
+ /*do_post_filter_mask=*/0x02);
+ SetInputBuffer(&rnd, &post_filter);
+
+ const int id = GetIdFromInputParam(param_.subsampling_x, param_.subsampling_y,
+ param_.height);
+ absl::Duration elapsed_time;
+ const absl::Time start = absl::Now();
+
+ // Only ApplyCdef() and the frame copy inside ApplyFilteringThreaded() are
+ // triggered, since do_post_filter_mask is set to 0x02.
+ post_filter.ApplyFilteringThreaded();
+ elapsed_time += absl::Now() - start;
+
+ CopyFilterOutputToDestBuffer();
+ const char* expected_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ expected_digest = GetDigestApplyCdef8bpp(id);
+ break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+ expected_digest = GetDigestApplyCdef10bpp(id);
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ expected_digest = GetDigestApplyCdef12bpp(id);
+ break;
+#endif
+ }
+ ASSERT_NE(expected_digest, nullptr);
+ test_utils::CheckMd5Digest(kCdef, kApplyCdefName, expected_digest, dest_,
+ size_, elapsed_time);
+}
+
+const FrameSizeParam kTestParamApplyCdef[] = {
+ FrameSizeParam(352, 352, 288, 0, 0), FrameSizeParam(720, 720, 480, 0, 0),
+ FrameSizeParam(1920, 1920, 1080, 0, 0), FrameSizeParam(251, 251, 187, 0, 0),
+ FrameSizeParam(352, 352, 288, 0, 1), FrameSizeParam(720, 720, 480, 0, 1),
+ FrameSizeParam(1920, 1920, 1080, 0, 1), FrameSizeParam(251, 251, 187, 0, 1),
+ FrameSizeParam(352, 352, 288, 1, 0), FrameSizeParam(720, 720, 480, 1, 0),
+ FrameSizeParam(1920, 1920, 1080, 1, 0), FrameSizeParam(251, 251, 187, 1, 0),
+ FrameSizeParam(352, 352, 288, 1, 1), FrameSizeParam(720, 720, 480, 1, 1),
+ FrameSizeParam(1920, 1920, 1080, 1, 1), FrameSizeParam(251, 251, 187, 1, 1),
+};
+
+using PostFilterApplyCdefTest8bpp = PostFilterApplyCdefTest<8, uint8_t>;
+
+TEST_P(PostFilterApplyCdefTest8bpp, ApplyCdef) {
+ TestMultiThread(2);
+ TestMultiThread(4);
+ TestMultiThread(8);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterApplyCdefTestInstance,
+ PostFilterApplyCdefTest8bpp,
+ testing::ValuesIn(kTestParamApplyCdef));
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using PostFilterApplyCdefTest10bpp = PostFilterApplyCdefTest<10, uint16_t>;
+
+TEST_P(PostFilterApplyCdefTest10bpp, ApplyCdef) {
+ TestMultiThread(2);
+ TestMultiThread(4);
+ TestMultiThread(8);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterApplyCdefTestInstance,
+ PostFilterApplyCdefTest10bpp,
+ testing::ValuesIn(kTestParamApplyCdef));
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using PostFilterApplyCdefTest12bpp = PostFilterApplyCdefTest<12, uint16_t>;
+
+TEST_P(PostFilterApplyCdefTest12bpp, ApplyCdef) {
+ TestMultiThread(2);
+ TestMultiThread(4);
+ TestMultiThread(8);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterApplyCdefTestInstance,
+ PostFilterApplyCdefTest12bpp,
+ testing::ValuesIn(kTestParamApplyCdef));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/prediction_mask.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kWedgeDirectionTypes = 16;
+
+enum kWedgeDirection : uint8_t {
+ kWedgeHorizontal,
+ kWedgeVertical,
+ kWedgeOblique27,
+ kWedgeOblique63,
+ kWedgeOblique117,
+ kWedgeOblique153,
+};
+
+constexpr uint8_t kWedgeCodebook[3][16][3] = {{{kWedgeOblique27, 4, 4},
+ {kWedgeOblique63, 4, 4},
+ {kWedgeOblique117, 4, 4},
+ {kWedgeOblique153, 4, 4},
+ {kWedgeHorizontal, 4, 2},
+ {kWedgeHorizontal, 4, 4},
+ {kWedgeHorizontal, 4, 6},
+ {kWedgeVertical, 4, 4},
+ {kWedgeOblique27, 4, 2},
+ {kWedgeOblique27, 4, 6},
+ {kWedgeOblique153, 4, 2},
+ {kWedgeOblique153, 4, 6},
+ {kWedgeOblique63, 2, 4},
+ {kWedgeOblique63, 6, 4},
+ {kWedgeOblique117, 2, 4},
+ {kWedgeOblique117, 6, 4}},
+ {{kWedgeOblique27, 4, 4},
+ {kWedgeOblique63, 4, 4},
+ {kWedgeOblique117, 4, 4},
+ {kWedgeOblique153, 4, 4},
+ {kWedgeVertical, 2, 4},
+ {kWedgeVertical, 4, 4},
+ {kWedgeVertical, 6, 4},
+ {kWedgeHorizontal, 4, 4},
+ {kWedgeOblique27, 4, 2},
+ {kWedgeOblique27, 4, 6},
+ {kWedgeOblique153, 4, 2},
+ {kWedgeOblique153, 4, 6},
+ {kWedgeOblique63, 2, 4},
+ {kWedgeOblique63, 6, 4},
+ {kWedgeOblique117, 2, 4},
+ {kWedgeOblique117, 6, 4}},
+ {{kWedgeOblique27, 4, 4},
+ {kWedgeOblique63, 4, 4},
+ {kWedgeOblique117, 4, 4},
+ {kWedgeOblique153, 4, 4},
+ {kWedgeHorizontal, 4, 2},
+ {kWedgeHorizontal, 4, 6},
+ {kWedgeVertical, 2, 4},
+ {kWedgeVertical, 6, 4},
+ {kWedgeOblique27, 4, 2},
+ {kWedgeOblique27, 4, 6},
+ {kWedgeOblique153, 4, 2},
+ {kWedgeOblique153, 4, 6},
+ {kWedgeOblique63, 2, 4},
+ {kWedgeOblique63, 6, 4},
+ {kWedgeOblique117, 2, 4},
+ {kWedgeOblique117, 6, 4}}};
+
+constexpr BitMaskSet kWedgeFlipSignMasks[9] = {
+ BitMaskSet(0xBBFF), // kBlock8x8
+ BitMaskSet(0xBBEF), // kBlock8x16
+ BitMaskSet(0xBAEF), // kBlock8x32
+ BitMaskSet(0xBBEF), // kBlock16x8
+ BitMaskSet(0xBBFF), // kBlock16x16
+ BitMaskSet(0xBBEF), // kBlock16x32
+ BitMaskSet(0xABEF), // kBlock32x8
+ BitMaskSet(0xBBEF), // kBlock32x16
+ BitMaskSet(0xBBFF) // kBlock32x32
+};
+
+// This table (and the one below) contains a few leading zeros and trailing
+// 64s so that the shifted reads in GenerateWedgeMask() stay in bounds,
+// avoiding some additional memcpys where the tables are actually used.
+constexpr uint8_t kWedgeMasterObliqueOdd[kWedgeMaskMasterSize * 3 / 2] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18, 37,
+ 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64};
+
+constexpr uint8_t kWedgeMasterObliqueEven[kWedgeMaskMasterSize * 3 / 2] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 11, 27,
+ 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64};
+
+constexpr uint8_t kWedgeMasterVertical[kWedgeMaskMasterSize] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 21,
+ 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64};
+
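+// Returns the shape class used as the first index into kWedgeCodebook: 0 for
+// tall blocks (height > width), 1 for wide blocks and 2 for square blocks.
+// For example, BlockShape(kBlock8x16) == 0, BlockShape(kBlock16x8) == 1 and
+// BlockShape(kBlock8x8) == 2.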
+int BlockShape(BlockSize block_size) {
+ const int width = kNum4x4BlocksWide[block_size];
+ const int height = kNum4x4BlocksHigh[block_size];
+ if (height > width) return 0;
+ if (height < width) return 1;
+ return 2;
+}
+
+uint8_t GetWedgeDirection(BlockSize block_size, int index) {
+ return kWedgeCodebook[BlockShape(block_size)][index][0];
+}
+
+uint8_t GetWedgeOffsetX(BlockSize block_size, int index) {
+ return kWedgeCodebook[BlockShape(block_size)][index][1];
+}
+
+uint8_t GetWedgeOffsetY(BlockSize block_size, int index) {
+ return kWedgeCodebook[BlockShape(block_size)][index][2];
+}
+
+} // namespace
+
+bool GenerateWedgeMask(WedgeMaskArray* const wedge_masks) {
+ // Generate master masks.
+ uint8_t master_mask[6][kWedgeMaskMasterSize][kWedgeMaskMasterSize];
+ for (int y = 0; y < kWedgeMaskMasterSize; ++y) {
+ memcpy(master_mask[kWedgeVertical][y], kWedgeMasterVertical,
+ kWedgeMaskMasterSize);
+ }
+
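+ // Each successive row pair reads the oblique master tables one entry
+ // further along, so the 0-to-64 ramp moves one column left every two
+ // rows, tracing the ~63 degree edge of kWedgeOblique63. The leading
+ // zeros and trailing 64s in those tables keep these shifted memcpys in
+ // bounds.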
+ for (int y = 0, shift = 0; y < kWedgeMaskMasterSize; y += 2, ++shift) {
+ memcpy(master_mask[kWedgeOblique63][y], kWedgeMasterObliqueEven + shift,
+ kWedgeMaskMasterSize);
+ memcpy(master_mask[kWedgeOblique63][y + 1], kWedgeMasterObliqueOdd + shift,
+ kWedgeMaskMasterSize);
+ }
+
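+ // The remaining master masks are derived by symmetry: kWedgeHorizontal is
+ // the transpose of kWedgeVertical, kWedgeOblique27 is the transpose of
+ // kWedgeOblique63, and kWedgeOblique117/kWedgeOblique153 are complemented
+ // (64 - value) mirrored copies of kWedgeOblique63.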
+ for (int y = 0; y < kWedgeMaskMasterSize; ++y) {
+ for (int x = 0; x < kWedgeMaskMasterSize; ++x) {
+ const uint8_t mask_value = master_mask[kWedgeOblique63][y][x];
+ master_mask[kWedgeHorizontal][x][y] = master_mask[kWedgeVertical][y][x];
+ master_mask[kWedgeOblique27][x][y] = mask_value;
+ master_mask[kWedgeOblique117][y][kWedgeMaskMasterSize - 1 - x] =
+ 64 - mask_value;
+ master_mask[kWedgeOblique153][(kWedgeMaskMasterSize - 1 - x)][y] =
+ 64 - mask_value;
+ }
+ }
+
+ // Generate wedge masks.
+ int block_size_index = 0;
+ for (int size = kBlock8x8; size <= kBlock32x32; ++size) {
+ if (!kIsWedgeCompoundModeAllowed.Contains(size)) continue;
+
+ const int width = kBlockWidthPixels[size];
+ const int height = kBlockHeightPixels[size];
+ assert(width >= 8);
+ assert(width <= 32);
+ assert(height >= 8);
+ assert(height <= 32);
+
+ const auto block_size = static_cast<BlockSize>(size);
+ for (int wedge_index = 0; wedge_index < kWedgeDirectionTypes;
+ ++wedge_index) {
+ const uint8_t direction = GetWedgeDirection(block_size, wedge_index);
+ const uint8_t offset_x =
+ DivideBy2(kWedgeMaskMasterSize) -
+ ((GetWedgeOffsetX(block_size, wedge_index) * width) >> 3);
+ const uint8_t offset_y =
+ DivideBy2(kWedgeMaskMasterSize) -
+ ((GetWedgeOffsetY(block_size, wedge_index) * height) >> 3);
+
+ // Allocate the 2d array.
+ for (int flip_sign = 0; flip_sign < 2; ++flip_sign) {
+ if (!((*wedge_masks)[block_size_index][flip_sign][wedge_index].Reset(
+ height, width, /*zero_initialize=*/false))) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate memory for wedge masks.");
+ return false;
+ }
+ }
+
+ const auto flip_sign = static_cast<uint8_t>(
+ kWedgeFlipSignMasks[block_size_index].Contains(wedge_index));
+ uint8_t* wedge_masks_row =
+ (*wedge_masks)[block_size_index][flip_sign][wedge_index][0];
+ uint8_t* wedge_masks_row_flip =
+ (*wedge_masks)[block_size_index][1 - flip_sign][wedge_index][0];
+ uint8_t* master_mask_row = &master_mask[direction][offset_y][offset_x];
+ for (int y = 0; y < height; ++y) {
+ memcpy(wedge_masks_row, master_mask_row, width);
+ for (int x = 0; x < width; ++x) {
+ wedge_masks_row_flip[x] = 64 - wedge_masks_row[x];
+ }
+ wedge_masks_row += width;
+ wedge_masks_row_flip += width;
+ master_mask_row += kWedgeMaskMasterSize;
+ }
+ }
+
+ block_size_index++;
+ }
+ return true;
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_PREDICTION_MASK_H_
+#define LIBGAV1_SRC_PREDICTION_MASK_H_
+
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+constexpr BitMaskSet kIsWedgeCompoundModeAllowed(kBlock8x8, kBlock8x16,
+ kBlock8x32, kBlock16x8,
+ kBlock16x16, kBlock16x32,
+ kBlock32x8, kBlock32x16,
+ kBlock32x32);
+
+// This function generates wedge masks. It should be called only once for
+// the decoder. If the video contains only key frames, this function does
+// not have to be called. Returns true on success, false on allocation
+// failure. Section 7.11.3.11.
+bool GenerateWedgeMask(WedgeMaskArray* wedge_masks);
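+//
+// A minimal usage sketch (illustrative only):
+//   WedgeMaskArray wedge_masks;
+//   if (!GenerateWedgeMask(&wedge_masks)) {
+//     // Handle the allocation failure.
+//   }
+//   // On success, wedge_masks[block_size_index][flip_sign][wedge_index] is
+//   // a height x width mask with values in the range [0, 64].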
+
+} // namespace libgav1
+#endif // LIBGAV1_SRC_PREDICTION_MASK_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/prediction_mask.h"
+
+#include <array>
+#include <cstdint>
+#include <string>
+
+#include "gtest/gtest.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kWedgeDirectionTypes = 16;
+
+enum kWedgeDirection : uint8_t {
+ kWedgeHorizontal,
+ kWedgeVertical,
+ kWedgeOblique27,
+ kWedgeOblique63,
+ kWedgeOblique117,
+ kWedgeOblique153,
+};
+
+const char* const kExpectedWedgeMask[] = {
+ "cea09e4bf4227efef749672283f7369b", "2763ab02b70447b2f9d5ed4796ca33bc",
+ "8d83c4315eadda824893c3e79aa866d9", "a733fd7f143c1c6141983c5f816bb3d8",
+ "9a205bfca776ccde57a8031350f2f467", "d78b964719f52f302f4454df14e45e35",
+ "bdc3972cfeb44d0acebb49b2fcb76072", "c8872571833c165be99ada1c552bfd9b",
+ "26d2541e2f8efe48e2f4a1819b3a6896", "783871179337e78e5ef41a66c0c6937c",
+ "253d21c612d732fceedcf610c4ff099c", "c868d177dc2a2378ef362fa482f601e8",
+ "782d75e143d87cc1aeb5d040c48d3c2d", "718cbecf4db45c7d596eba07bd956601",
+ "3b60b9336c2cf699172eb4a3fef18787", "afe72d4bd206f1cb27e3736c3b0068cf",
+ "7b830a1a94bad23a1df1b8d9668708d0", "d3f421ff2b81686fd421f7c02622aac1",
+ "d9ac14dff8e3c415e85e99c3ce0fbd5b", "da493727a08773a950a0375881d912f2",
+ "2f4251fd1b4636a034e22611ea1223b6", "84f84f01900b8a894b19e353605846b0",
+ "bbf5dae73300b6a6789710ffc4fc59fd", "c711941a0889fbed9b926c1eb39a5616",
+ "2fcf270613df57a57e647f37bf9a19ec", "79ed9c2f828b765edf65027f1f0847f5",
+ "e8d3e821f4e7f2f39659071da8f2cc71", "823bb09e2c28f2a81bf8a2d030e8bab6",
+ "d598fb4f70ea6b705674497994aecbfa", "3737c39f058c57650be7e720dcd87aa1",
+ "eb1d9b1d30485d9870ca9380cbdfad43", "a23d3c24f291080fcd62c0a2a2aea181",
+ "968543d91aeae3b1814a5074b6aa9e8c", "6e2444d71a4f3ddfe643e72f9c3cf6c3",
+ "3bf78413aa04830849a3d9c7bfa41a84", "ece8306f9859bcfb042b0bda8f6750b6",
+ "608b29fcedb7fa054a599945b497c78c", "d69d622016872469dfbde4e589bfd679",
+ "38a2307174c27b634323c59da3339dc6", "5e44f0fad99dbe802ffd69c7dc239d56",
+ "a0eeaf3755a724fdf6469f43cb060d75", "7bcf8035c5057619ea8660c32802d6a1",
+ "6054e1c35fe13b9269ab01d1bc0d8848", "e0ec8f7c66ebabff60f5accd3d707788",
+ "0b9fd6e1053a706af5d0cd59dc7e1992", "709648ffab1992d8522b04ca23de577a",
+ "c576e378ed264d6cb00adfd3b4e428f1", "f6f3ae5348e7141775a8a6bc2be22f80",
+ "9289722adb38fa3b2fb775648f0cc3a8", "b7e02fa00b56aeea8e6098a92eac72e1",
+ "db2f6d66ffca8352271f1e3f0116838a", "5858c567b0719daaa364fb0e6d8aa5dc",
+ "db2d300f875d2465adabf4c1322cea6f", "05c66b54c4d32e5b64a7e77e751f0c51",
+ "f2c2a5a3ce510d21ef2e62eedba85afb", "3959d2191a11e800289e21fd283b2837",
+ "cc86023d079a4c5daadce8ad0cdd176f", "e853f3c6814a653a52926488184aae5e",
+ "8568b9d7215bb8dfb1b7ce66ef38e055", "42814ac5ed652afb4734465cca9e038c",
+ "dba6b7d5e93e6a20dac9a514824ad45c", "be77e0dce733b564e96024ea23c9db43",
+ "2aa7bd75a1d8eb1000f0ef9e19aa0d1d", "226d85741e3f35493e971dd13b689ec7",
+ "9e5a0cf4416f8afeaa3ddbe686b5b7db", "18389c77b362f6b4b727b99426251159",
+ "10c5d899de999bbdf35839be3f2d5ee3", "942ae479a36fb4b4d359bebd78a92f03",
+ "f14e4dd174958e16755cd1f456b083e0", "8a036cbd0aaf1bece25a1140109f688b",
+ "2e48eade95f9fa0b7dae147e66d83e13", "4387d723350a011e26b0e91bbeb3d7c2",
+ "5470f977d859232335945efc8bb49ff1", "6780fd81cf2561300c75c930e715c7a6",
+ "9786aca6b1b9abfc3eae51404bc3cbd5", "da65c1440fa370a0237284bf30e56b0b",
+ "8e0d5d83ab3c477fd11ef143a832f7bf", "97489c7a47aa69fef091e7e6e4049a8f",
+ "28787beac9e69001c2999976742764a3", "67760c48ff5f7bc50cd92727694ba271",
+ "57c2b0b7de5de0f40fb739ed095d82a4", "7b2a663ca7da4b73f1adfc7e0ca1eff1",
+ "980869e1795efb63ca623ce2f0043fb3", "575497eb213b05bab24017cc6ea4e56a",
+ "ca3b31382439f0bdd87b61fa10c7863b", "72c65bf29afb288f4d4ff51816429aa7",
+ "1fe8929387be982993cd2309e3eeae7a", "994246e2585179e00f49537713f33796",
+ "82ae324ba01002370e918724ce452738", "fb3bcb4811b8251f0cc5ec40859617e7",
+ "a2e24b21c1d3661412e00411d719210c", "7adc2b60d7d62df1d07e3e4458a46dc2",
+ "e71c1b2f9ccb1af0868c3869dc296506", "3e33e087c7e6f724528abbc658a1b631",
+ "19b80d80f6b83eedac4bab6226865ae1", "7d9293641c4ed3b21c14964ec785cfb9",
+ "5dd0fb9700f30c25bf7b65367c8f098d", "f96b55ec2d012807c972ef4731acd73d",
+ "5fc70808c3fa5b3c511926b434bfba66", "768c3ce37acfcd4e5ba05152e5710bc9",
+ "1271a52682566ebfc01d5c239177ffd4", "52d4fc11a7507695b2548e0424be50ab",
+ "729e7d421aaaf74daa27b0ce1ca0a305", "92d2ff4a9a679cdf0ff765a2d30bced1",
+ "d160ec6f1bd864eb2ac8fabf5af7fedd", "ad323dbcb4a651e96bd5c81bc185385d",
+ "937c1b7106a2e6aef0adf2c858b4df18", "0f9ad42d1c48970f8462921ac79849ee",
+ "32ed1e1a16ddbf816f81caca7cb56c93", "e91aa6389d8255b7744aaa875ba2ceec",
+ "88f9dedf6d565b2f60b511e389cf366a", "d0428fd42ca311cd3680ff4670d4f047",
+ "b9c7eeb7c9733f0220587643952602cb", "65adf32a5e03d161a411815179078ba3",
+ "4984a4e9a5bdf732c071d5b60029daf4", "b9b65a2a9f04b59766d305221e4cda5a",
+ "7b2d372fe33d6db1fcf75820b7523ed5", "9a07593316707f8e59fe09c7647ade15",
+ "33e75e0d2aa73e3410095c2f98c27a14", "f9ddb33b16431ff9cf6ae96dd4acc792",
+ "2df1a8655b2ef23f642b11b76b20f557", "9faba399ccf555c25a33c336cdd54d94",
+ "c94404e263c2dae2e955ead645348c08", "3d16d4be87cd4467c3f7be17287940c8",
+ "99d0fdae81d61680c7a5b1df38dc98fc", "a23b402d699a00c5c349b17e77f73552",
+ "c6f76c81c4050939a6bd5d30ca00b307", "bc3d035bd6e8f55497bfc6d1f81fc8be",
+ "99b10db073e13b49bd90655f7516383b", "ddfd0e434efe076e2706c5669c788566",
+ "e1d836f814e6eca80ef530f8676e0599", "ed3e4c64e9fd1006e0016e460970a423",
+ "0282542e21fa0dea0bf48ec0a2d25b2d", "7482eb8a7bf1417a61c21d82bc7c95f9",
+ "e98e9bb3d5edf7b943d0bbf1eec9bef6", "ad4d313beecf609ff3a7d30da3e54a1d",
+ "b98f8db9fa62fb73d26415f6fa31b330", "0591b3c34bf4750f20a74eee165a54bd",
+ "3054b56fec6968255f21d40f80f5121c", "59ecf60cbb8408e042816e73446fa79c",
+ "8fa8c996209a1ddb8a00c14ca19953f8", "e20d2462bc43a1a1bfbc5efe7a905666",
+ "b5065e40d5d103e21daabcf4d5fea805", "b65aba0f8e307ef08951f1abdb7c8f62",
+ "5fbec6e57c1c651bd7be69fccb0b39a6", "9dfc362f7212d086418b0def54a7c76c",
+ "6644928e9aaac5e5d64f4a2c437c778a", "1bf63c7539ea32489bec222d5bc5305f",
+ "755ec607a5edf116d188353a96a025c3", "bdc4cc354c4f57c38d3be3dbc9380e2d",
+ "7851752b4ae36793ab6f03cd91e7ba6f", "99b9834ea2f6ea8d9168c5c1ba7fe790",
+ "75a155c83b618b28d48f5f343cdfef62", "38821c97e04d2294766699a6846fefaf",
+ "14be7f588461273862c9d9b83d2f6f0a", "8c38ce521671f0eee7e6f6349ef4f981",
+ "043347de994f2fe68c08e7c06a7f6735", "cda15ea2caccbdd8a7342a6144278578",
+ "244d586e88c9d6a9a59059a82c3b8e57", "3712928dd0dd77f027370f22d61366a0",
+ "e4f1cd4785fc331ad6e3100da4a934f3", "3181459434921b5b15b64cfd2ee734c4",
+ "2d588831e98c7178c5370421a6f2fc60", "135cf6a67fc1b51dbcf9fcddb3ae1237",
+ "d701da4e1a890a37bb0e9af4a2f0b048", "02138b5a4882181f248945c3a8262050",
+ "7fbd4d06965b1d152d6c037b0302f307", "7917a20573da241868689ed49c0d5972",
+ "ffdd4257d91fe00e61de4d2668f1ee07", "72999b6d3bf1ee189e9269a27105991f",
+ "1b63d7f25388c9af4adac60d46b7a8ca", "e3ce0977224197ade58aa979f3206d68",
+ "73178ffd388b46891fc4a0440686b554", "f1f99faf52cea98c825470c6edd1d973",
+ "e6fae5d5682862ec3377b714b6b69825", "a4f96cca8da155204b0cc4258b068d3c",
+ "75c7674c2356325dcb14c222266c46f8", "932b23521c9d9d06096879a665a14e28",
+ "8ed48a84a99b4a5bf2ec8a7a2c1f1c79", "4f6f0214857a92ad92eca1c33a762424",
+ "34865190c3e91200a0609a6e770ebc5c", "e793f1f2e46876b1e417da5d59475fda",
+ "e83cd9a228941a152f6878aa939e1290", "d6f5cd74ba386bd98282e1fcb0528dbd",
+ "131b55ec66ffe76f9088f7b35d38c0dd", "2d0ae8ee059cbd8c7816e3c862efdf37",
+ "65baadd2cb85ffbc6480bf8c1f128d1a", "2b8e8af333c464b4213bbd9185a9b751",
+ "951fd5faed77a1ae9bf5ef8f30bd65c3", "41d38d40dfe9da2b9ff2146711bf6ab5",
+ "7430bde28aed5a9429db54ea663a5e26", "46576d59a13756c494793ad4b3a663e5",
+ "21802d0db30caa44cbdba2ac84cc49b5", "591cad82ae106d9e9670acd5b60e4548",
+ "c0484c58c6c009939e7f3ec0c1aa8e2d", "6405c55d0a1830cfdd37950bfd65fd6f",
+ "3bd74c067d2ba027fc004e9bf62254db", "6e920e6dbdbe55a97ff2bf3dfb38a3e0",
+ "e2ed20f89da293516b14be766a624299", "0a613ee53ec38cad995faa17a24fcb8f",
+ "0de937145c030d766c3f9fff09d7e39c", "4a560325b804fcb6643866e971ade8e8",
+ "be82c41d3a0f8bd4032c3e5e45b453da", "b27219f02db167bf5a416831b908b031",
+ "7cf5437e25d362bc373dd53d8fd78186", "39c801e28cc08150c2016083113d1a03",
+ "785a21219d9c42a7c5bd417f365535a3", "008c79298a87837bcb504c4dc39ca628",
+ "af24d1d6f4d3ee94f2af52471a64ca1f", "cd82218aae9815c106336aec7ce18833",
+ "9f405c66d4ce7533213c4ca82feaf252", "7ceda4ea6ddeccd04dbf6d3237fe956a",
+ "ae21b52869b85a64fa4e3a85a2a8bb8d", "a004927cdbf48e0dafcccfb6066cdd0c",
+ "949337a963a8a5c0f46cf774b078a7cd", "24f58b8db17d02f66d04d22ca6c5e026",
+ "2b1315a2e7c5d5309a7621651e741616", "5b317ef820e6c8e7ea7a7d7022e8349d",
+ "debd504650d35d9deca4c2461094949f", "19d0ca33e5b3a0afff1f39f0f42238e0",
+ "df1c6c7582bfa5ceb147a8dd253cfa43", "176647077c5e2d985b3807134aac118f",
+ "dd2850172602688eaaa768f705c1ba67", "6ba1a3929ae9725fc688b8189b16314f",
+ "639189abb754dfa6be3c813ee8342954", "d5d1b8bff370f280fba13827d6bdf0fb",
+ "4b0ad4ea387a952724cab42730f712d2", "8c9c1f09946b61315e9a45c7e39f1992",
+ "50ef75c2b7a17f972586ce053eb62d24", "d5922dd01d8d02ca00ab9648a3db343f",
+ "091f517b18f4438ea9c581b7471f2fc0", "fede855bfb936caaa8fb4a434adac1d3",
+ "081b612f810f38c5ff6dc1cd03bf2eb6", "bd10e764eaf7d7e0ec89de96423d0afe",
+ "3e64cb1355e05b0a4b0237fae3f33bb2", "7cb92e0ecc0dd06d0a5d248efba48630",
+ "ec875f2e155a2e124ef52bf35e9a876c", "15529c83eae41bfa804f2c386f480e90",
+ "ee0e59567874155fb54de63fc901ded7", "4ad160b0d0f5166f9cddf7235725406e",
+ "176b64b3883c33e2aa251159983ccaa1", "d9cca01946d2a47c0114b1f49e4d688f",
+ "73d706a13afa279d9c716b3ba3a2ed68", "dea5a7f010d2f1385fe2b7d1d36aafb0",
+ "b5432fbc22d2f96c1230cc33178da09e", "8b0e7399ce98b68de4048411ab649468",
+ "3d52c986a5a5852a4620fbb38259a109", "eb61882738fefdd105094d4c104cf8b0",
+ "24fbc0d3ee28e937cfa1a3fbbc4e8214", "c69eb0687e477c27ac0d4c5fe54bbe8b",
+ "00a4f498f05b2b348252927ecc82c8a3", "c76471a61250be52e8d5933e582b1e19",
+ "22ebb8812dd795fdc14f20a7f9f89844", "f7c7d5c04bc234545726f4b116b623ec",
+ "9fc323d6619af0101edfacb4e9c2b647", "902d7888215d6aac1cf41f1fb6a916d8",
+ "5817d80a0504a5b08627502aeece4f38", "a1afa4b4065c143bc4857e364cec7f3d",
+ "506d5a6ff434411ea893bb2dc021aa25", "31cd3ca39015ccee1e217e1c83fff2a0",
+ "eb1ed4ef292c7d8fead1f113c9fd998f", "35f3abf3a056b778e3d7885f8df6c07a",
+ "299d71ee557382f5e64f26f1a8e4e156", "12f8c591a4e257bcc26b385424cd8d47",
+ "0b273b03d817af587c8fb23de71f346d", "1d7592fe89c661e9f61d215d235aa2ee",
+ "331dc544956ee14064ab432c85d52828", "a0a4ccbe1c442717ad40b7d40ed81a40",
+ "45009d915bf1d4ab855b5b670d314839", "641dfe93841aaa18888cebb17b8566eb",
+ "2b177c880ce0c2b4e891abc1dc23dfc2", "23984491f7d6c206fb8babafc9aacfdb",
+ "5841b93edb22c702035e31b26c58a728", "9852506766cb47f48783640d14753089",
+ "8a43698d32f63b1e7191482e4b274fc3", "7bdef02623beae507a651ad398422876",
+ "b105138645ad27657a08a3a8e8871a7e", "913e40ebbf1b983ca4956b85364b9459",
+ "5776f97b4f0cfa435a99d5d90822922d", "a0ae92a24c2b20039d996ee2a7d8b107",
+ "a925cc792412e2a7abe89367c9fe28b1", "778183eab5c9e0ee559d828d8347a21c",
+ "c4b4777355a4c8e8858faec37ba23eec", "4cdd41c3648e8d05c3e8f58d08385f8b",
+ "7c1246737874f984feb1b5827a1f95db", "c75d766ff5af8db39d400962d5aba0b4",
+ "964f010f5aa6748461ca5573b013091d", "b003f3eab3b118e5a8a85c1873b3bb55"};
+
+TEST(WedgePredictionMaskTest, GenerateWedgeMask) {
+ WedgeMaskArray wedge_masks;
+ ASSERT_TRUE(GenerateWedgeMask(&wedge_masks));
+
+ // Check wedge masks.
+ int block_size_index = 0;
+ int index = 0;
+ for (int block_size = kBlock8x8; block_size < kMaxBlockSizes; ++block_size) {
+ const int width = kBlockWidthPixels[block_size];
+ const int height = kBlockHeightPixels[block_size];
+ if (width < 8 || height < 8 || width > 32 || height > 32) continue;
+
+ for (int flip_sign = 0; flip_sign <= 1; ++flip_sign) {
+ for (int wedge_index = 0; wedge_index < kWedgeDirectionTypes;
+ ++wedge_index) {
+ uint8_t* const block_wedge_mask =
+ wedge_masks[block_size_index][flip_sign][wedge_index][0];
+ const std::string digest =
+ test_utils::GetMd5Sum(block_wedge_mask, width * height);
+ EXPECT_STREQ(digest.c_str(), kExpectedWedgeMask[index]);
+ index++;
+ }
+ }
+ block_size_index++;
+ }
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/quantizer.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10 && \
+ LIBGAV1_MAX_BITDEPTH != 12
+#error LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12
+#endif
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/quantizer_tables.inc"
+
+// Format the kDcLookup and kAcLookup arrays manually for easier comparison
+// with the Dc_Qlookup and Ac_Qlookup arrays in Section 7.12.2.
+
+// clang-format off
+constexpr int16_t kDcLookup[][256] = {
+ // Lookup table for 8 bit.
+ {
+ 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16,
+ 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 26,
+ 27, 28, 29, 30, 31, 32, 32, 33, 34, 35, 36, 37,
+ 38, 38, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47,
+ 48, 48, 49, 50, 51, 52, 53, 53, 54, 55, 56, 57,
+ 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, 66, 66,
+ 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76,
+ 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85,
+ 87, 88, 90, 92, 93, 95, 96, 98, 99, 101, 102, 104,
+ 105, 107, 108, 110, 111, 113, 114, 116, 117, 118, 120, 121,
+ 123, 125, 127, 129, 131, 134, 136, 138, 140, 142, 144, 146,
+ 148, 150, 152, 154, 156, 158, 161, 164, 166, 169, 172, 174,
+ 177, 180, 182, 185, 187, 190, 192, 195, 199, 202, 205, 208,
+ 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247,
+ 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292,
+ 296, 300, 304, 309, 313, 317, 322, 326, 330, 335, 340, 344,
+ 349, 354, 359, 364, 369, 374, 379, 384, 389, 395, 400, 406,
+ 411, 417, 423, 429, 435, 441, 447, 454, 461, 467, 475, 482,
+ 489, 497, 505, 513, 522, 530, 539, 549, 559, 569, 579, 590,
+ 602, 614, 626, 640, 654, 668, 684, 700, 717, 736, 755, 775,
+ 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139,
+ 1184, 1232, 1282, 1336
+ },
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ // Lookup table for 10 bit.
+ {
+ 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34,
+ 37, 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75,
+ 78, 82, 86, 90, 93, 97, 101, 105, 109, 113, 116, 120,
+ 124, 128, 132, 136, 140, 143, 147, 151, 155, 159, 163, 166,
+ 170, 174, 178, 182, 185, 189, 193, 197, 200, 204, 208, 212,
+ 215, 219, 223, 226, 230, 233, 237, 241, 244, 248, 251, 255,
+ 259, 262, 266, 269, 273, 276, 280, 283, 287, 290, 293, 297,
+ 300, 304, 307, 310, 314, 317, 321, 324, 327, 331, 334, 337,
+ 343, 350, 356, 362, 369, 375, 381, 387, 394, 400, 406, 412,
+ 418, 424, 430, 436, 442, 448, 454, 460, 466, 472, 478, 484,
+ 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, 576, 584,
+ 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, 698,
+ 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831,
+ 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988,
+ 1001, 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170,
+ 1186, 1202, 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379,
+ 1398, 1416, 1436, 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624,
+ 1647, 1670, 1692, 1717, 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929,
+ 1958, 1990, 2021, 2054, 2088, 2123, 2159, 2197, 2236, 2276, 2319, 2363,
+ 2410, 2458, 2508, 2561, 2616, 2675, 2737, 2802, 2871, 2944, 3020, 3102,
+ 3188, 3280, 3375, 3478, 3586, 3702, 3823, 3953, 4089, 4236, 4394, 4559,
+ 4737, 4929, 5130, 5347
+ },
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+ // Lookup table for 12 bit.
+ {
+ 4, 12, 18, 25, 33, 41, 50, 60,
+ 70, 80, 91, 103, 115, 127, 140, 153,
+ 166, 180, 194, 208, 222, 237, 251, 266,
+ 281, 296, 312, 327, 343, 358, 374, 390,
+ 405, 421, 437, 453, 469, 484, 500, 516,
+ 532, 548, 564, 580, 596, 611, 627, 643,
+ 659, 674, 690, 706, 721, 737, 752, 768,
+ 783, 798, 814, 829, 844, 859, 874, 889,
+ 904, 919, 934, 949, 964, 978, 993, 1008,
+ 1022, 1037, 1051, 1065, 1080, 1094, 1108, 1122,
+ 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234,
+ 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342,
+ 1368, 1393, 1419, 1444, 1469, 1494, 1519, 1544,
+ 1569, 1594, 1618, 1643, 1668, 1692, 1717, 1741,
+ 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933,
+ 1957, 1992, 2027, 2061, 2096, 2130, 2165, 2199,
+ 2233, 2267, 2300, 2334, 2367, 2400, 2434, 2467,
+ 2499, 2532, 2575, 2618, 2661, 2704, 2746, 2788,
+ 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127,
+ 3177, 3226, 3275, 3324, 3373, 3421, 3469, 3517,
+ 3565, 3621, 3677, 3733, 3788, 3843, 3897, 3951,
+ 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420,
+ 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942,
+ 5013, 5083, 5153, 5222, 5291, 5367, 5442, 5517,
+ 5591, 5665, 5745, 5825, 5905, 5984, 6063, 6149,
+ 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867,
+ 6966, 7064, 7163, 7269, 7376, 7483, 7599, 7715,
+ 7832, 7958, 8085, 8214, 8352, 8492, 8635, 8788,
+ 8945, 9104, 9275, 9450, 9639, 9832, 10031, 10245,
+ 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409,
+ 12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812,
+ 16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387
+ }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+};
+
+constexpr int16_t kAcLookup[][256] = {
+ // Lookup table for 8 bit.
+ {
+ 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
+ 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+ 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
+ 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
+ 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
+ 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150,
+ 152, 155, 158, 161, 164, 167, 170, 173, 176, 179, 182, 185,
+ 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, 227,
+ 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280,
+ 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347,
+ 353, 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432,
+ 440, 448, 456, 465, 474, 483, 492, 501, 510, 520, 530, 540,
+ 550, 560, 571, 582, 593, 604, 615, 627, 639, 651, 663, 676,
+ 689, 702, 715, 729, 743, 757, 771, 786, 801, 816, 832, 848,
+ 864, 881, 898, 915, 933, 951, 969, 988, 1007, 1026, 1046, 1066,
+ 1087, 1108, 1129, 1151, 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
+ 1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567, 1597, 1628, 1660, 1692,
+ 1725, 1759, 1793, 1828
+ },
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ // Lookup table for 10 bit.
+ {
+ 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37,
+ 40, 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83,
+ 88, 92, 96, 100, 105, 109, 114, 118, 122, 127, 131, 136,
+ 140, 145, 149, 154, 158, 163, 168, 172, 177, 181, 186, 190,
+ 195, 199, 204, 208, 213, 217, 222, 226, 231, 235, 240, 244,
+ 249, 253, 258, 262, 267, 271, 275, 280, 284, 289, 293, 297,
+ 302, 306, 311, 315, 319, 324, 328, 332, 337, 341, 345, 349,
+ 354, 358, 362, 367, 371, 375, 379, 384, 388, 392, 396, 401,
+ 409, 417, 425, 433, 441, 449, 458, 466, 474, 482, 490, 498,
+ 506, 514, 523, 531, 539, 547, 555, 563, 571, 579, 588, 596,
+ 604, 616, 628, 640, 652, 664, 676, 688, 700, 713, 725, 737,
+ 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, 905,
+ 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118,
+ 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386,
+ 1411, 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727,
+ 1759, 1791, 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159,
+ 2199, 2239, 2283, 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703,
+ 2755, 2807, 2859, 2915, 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391,
+ 3455, 3523, 3591, 3659, 3731, 3803, 3876, 3952, 4028, 4104, 4184, 4264,
+ 4348, 4432, 4516, 4604, 4692, 4784, 4876, 4972, 5068, 5168, 5268, 5372,
+ 5476, 5584, 5692, 5804, 5916, 6032, 6148, 6268, 6388, 6512, 6640, 6768,
+ 6900, 7036, 7172, 7312
+ },
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+ // Lookup table for 12 bit.
+ {
+ 4, 13, 19, 27, 35, 44, 54, 64,
+ 75, 87, 99, 112, 126, 139, 154, 168,
+ 183, 199, 214, 230, 247, 263, 280, 297,
+ 314, 331, 349, 366, 384, 402, 420, 438,
+ 456, 475, 493, 511, 530, 548, 567, 586,
+ 604, 623, 642, 660, 679, 698, 716, 735,
+ 753, 772, 791, 809, 828, 846, 865, 884,
+ 902, 920, 939, 957, 976, 994, 1012, 1030,
+ 1049, 1067, 1085, 1103, 1121, 1139, 1157, 1175,
+ 1193, 1211, 1229, 1246, 1264, 1282, 1299, 1317,
+ 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457,
+ 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595,
+ 1627, 1660, 1693, 1725, 1758, 1791, 1824, 1856,
+ 1889, 1922, 1954, 1987, 2020, 2052, 2085, 2118,
+ 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378,
+ 2411, 2459, 2508, 2556, 2605, 2653, 2701, 2750,
+ 2798, 2847, 2895, 2943, 2992, 3040, 3088, 3137,
+ 3185, 3234, 3298, 3362, 3426, 3491, 3555, 3619,
+ 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149,
+ 4230, 4310, 4390, 4470, 4550, 4631, 4711, 4791,
+ 4871, 4967, 5064, 5160, 5256, 5352, 5448, 5544,
+ 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410,
+ 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435,
+ 7579, 7723, 7867, 8011, 8155, 8315, 8475, 8635,
+ 8795, 8956, 9132, 9308, 9484, 9660, 9836, 10028,
+ 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661,
+ 11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565,
+ 13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806,
+ 16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414,
+ 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486,
+ 21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070,
+ 25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247
+ }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+};
+// clang-format on
+
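+// Writes the transpose of |src| (|src_height| rows by |src_width| columns)
+// into |dst|. For example, a 2x3 source {{1, 2, 3}, {4, 5, 6}} becomes the
+// 3x2 destination {{1, 4}, {2, 5}, {3, 6}}.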
+void Transpose(uint8_t* const dst, const uint8_t* const src, int src_width,
+ int src_height) {
+ const int dst_width = src_height;
+ const int dst_height = src_width;
+ Array2DView<const uint8_t> source(src_height, src_width, src);
+ Array2DView<uint8_t> dest(dst_height, dst_width, dst);
+ for (int y = 0; y < dst_height; ++y) {
+ for (int x = 0; x < dst_width; ++x) {
+ dest[y][x] = source[x][y];
+ }
+ }
+}
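+
+// For example, transposing the 2x3 source
+//   1 2 3
+//   4 5 6
+// yields the 3x2 destination
+//   1 4
+//   2 5
+//   3 6
+// i.e. dest[y][x] == source[x][y] for every valid (x, y).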
+
+// Copies the packed lower-triangular values in |src| into |dst|, mirroring
+// them across the main diagonal to fill the upper triangle.
+void FillUpperTriangle(uint8_t* dst, const uint8_t* src, int size) {
+ Array2DView<uint8_t> dest(size, size, dst);
+ int k = 0;
+ for (int y = 0; y < size; ++y) {
+ for (int x = 0; x <= y; ++x) {
+ dest[y][x] = dest[x][y] = src[k++];
+ }
+ }
+}
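+
+// For example, with size == 3 the six packed values {a, b, c, d, e, f} in
+// |src| (row-major over the lower triangle) produce the symmetric matrix
+//   a b d
+//   b c e
+//   d e f
+// so in general |src| must hold size * (size + 1) / 2 values.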
+
+} // namespace
+
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix_ptr) {
+ for (int level = 0; level < kNumQuantizerLevelsForQuantizerMatrix; ++level) {
+ for (int plane_type = kPlaneTypeY; plane_type < kNumPlaneTypes;
+ ++plane_type) {
+ auto& quantizer_matrix = (*quantizer_matrix_ptr)[level][plane_type];
+      // Notes about how these matrices are populated:
+      // * For square transforms, we store only the lower left triangle (the
+      //   matrix is symmetric about the main diagonal), so when populating
+      //   the matrix we have to fill in the upper right triangle.
+      // * For rectangular transforms, the matrix for a WxH transform is the
+      //   transpose of the matrix for the corresponding HxW transform. So we
+      //   populate with memcpy when w < h and by transposing the stored HxW
+      //   matrix when w > h.
+      // * 16x16 is a special case: its matrix is derived from the 32x32
+      //   matrix by taking every other row and column.
+      // * We use the "adjusted transform size" when using these matrices, so
+      //   we won't have to populate them for transform sizes with one of the
+      //   dimensions equal to 64.
+ for (int tx_size = 0; tx_size < kNumTransformSizes; ++tx_size) {
+ if (kTransformWidth[tx_size] == 64 || kTransformHeight[tx_size] == 64) {
+ continue;
+ }
+ const int size = kTransformWidth[tx_size] * kTransformHeight[tx_size];
+ if (!quantizer_matrix[tx_size].Resize(size)) {
+ return false;
+ }
+ }
+#define QUANTIZER_MEMCPY(W, H) \
+ memcpy(quantizer_matrix[kTransformSize##W##x##H].get(), \
+ kQuantizerMatrix##W##x##H[level][plane_type], (W) * (H))
+#define QUANTIZER_TRANSPOSE(W, H) \
+ Transpose(quantizer_matrix[kTransformSize##W##x##H].get(), \
+ kQuantizerMatrix##H##x##W[level][plane_type], H, W)
+#define QUANTIZER_FILL_UPPER_TRIANGLE(SIZE) \
+ FillUpperTriangle(quantizer_matrix[kTransformSize##SIZE##x##SIZE].get(), \
+ kQuantizerMatrix##SIZE##x##SIZE[level][plane_type], SIZE)
+ QUANTIZER_FILL_UPPER_TRIANGLE(4); // 4x4
+ QUANTIZER_MEMCPY(4, 8); // 4x8
+ QUANTIZER_MEMCPY(4, 16); // 4x16
+ QUANTIZER_TRANSPOSE(8, 4); // 8x4
+ QUANTIZER_FILL_UPPER_TRIANGLE(8); // 8x8
+ QUANTIZER_MEMCPY(8, 16); // 8x16
+ QUANTIZER_MEMCPY(8, 32); // 8x32
+ QUANTIZER_TRANSPOSE(16, 4); // 16x4
+ QUANTIZER_TRANSPOSE(16, 8); // 16x8
+ QUANTIZER_MEMCPY(16, 32); // 16x32
+ QUANTIZER_TRANSPOSE(32, 8); // 32x8
+ QUANTIZER_TRANSPOSE(32, 16); // 32x16
+ QUANTIZER_FILL_UPPER_TRIANGLE(32); // 32x32
+ // 16x16.
+ Array2DView<uint8_t> dst16x16(
+ 16, 16, quantizer_matrix[kTransformSize16x16].get());
+ Array2DView<const uint8_t> src32x32(
+ 32, 32, quantizer_matrix[kTransformSize32x32].get());
+ for (int y = 0; y < 16; ++y) {
+ for (int x = 0; x < 16; ++x) {
+ dst16x16[y][x] = src32x32[MultiplyBy2(y)][MultiplyBy2(x)];
+ }
+ }
+#undef QUANTIZER_FILL_UPPER_TRIANGLE
+#undef QUANTIZER_TRANSPOSE
+#undef QUANTIZER_MEMCPY
+ }
+ }
+ return true;
+}
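+
+// A minimal usage sketch (hypothetical caller; level 0, luma, and 8x8 are
+// chosen only for illustration):
+//
+//   QuantizerMatrix matrix;
+//   if (!InitializeQuantizerMatrix(&matrix)) {
+//     // Allocation failed.
+//   }
+//   const uint8_t* weights =
+//       matrix[0][kPlaneTypeY][kTransformSize8x8].get();
+//   // |weights| now points at 8 * 8 = 64 weights in row-major order.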
+
+int GetQIndex(const Segmentation& segmentation, int index, int base_qindex) {
+ if (segmentation.FeatureActive(index, kSegmentFeatureQuantizer)) {
+ const int segment_qindex =
+ base_qindex +
+ segmentation.feature_data[index][kSegmentFeatureQuantizer];
+ return Clip3(segment_qindex, kMinQuantizer, kMaxQuantizer);
+ }
+ return base_qindex;
+}
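+
+// For example, with base_qindex == 100 and a segment quantizer feature delta
+// of -120, the result is Clip3(-20, kMinQuantizer, kMaxQuantizer), i.e. the
+// segment qindex is clamped up to kMinQuantizer.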
+
+Quantizer::Quantizer(int bitdepth, const QuantizerParameters* params)
+ : params_(*params) {
+ assert(bitdepth >= 8 && bitdepth <= LIBGAV1_MAX_BITDEPTH);
+ const int index = BitdepthToArrayIndex(bitdepth);
+ dc_lookup_ = kDcLookup[index];
+ ac_lookup_ = kAcLookup[index];
+}
+
+int Quantizer::GetDcValue(Plane plane, int qindex) const {
+ return dc_lookup_[Clip3(qindex + params_.delta_dc[plane], kMinQuantizer,
+ kMaxQuantizer)];
+}
+
+int Quantizer::GetAcValue(Plane plane, int qindex) const {
+ return ac_lookup_[Clip3(qindex + params_.delta_ac[plane], kMinQuantizer,
+ kMaxQuantizer)];
+}
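+
+// A sketch of the typical call sequence (|segmentation|, |segment_id|, and
+// |current_qindex| are assumed to come from the frame and tile state):
+//
+//   const int qindex =
+//       GetQIndex(segmentation, segment_id, current_qindex);
+//   const int dc = quantizer.GetDcValue(kPlaneY, qindex);
+//   const int ac = quantizer.GetAcValue(kPlaneY, qindex);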
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_QUANTIZER_H_
+#define LIBGAV1_SRC_QUANTIZER_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+using QuantizerMatrix = std::array<
+ std::array<std::array<DynamicBuffer<uint8_t>, kNumTransformSizes>,
+ kNumPlaneTypes>,
+ kNumQuantizerLevelsForQuantizerMatrix>;
+
+// Implements the dequantization functions of Section 7.12.2.
+class Quantizer {
+ public:
+ Quantizer(int bitdepth, const QuantizerParameters* params);
+
+ // Returns the quantizer value for the dc coefficient for the given plane.
+ // The caller should call GetQIndex() with Tile::current_quantizer_index_ as
+ // the |base_qindex| argument, and pass the return value as the |qindex|
+ // argument to this method.
+ int GetDcValue(Plane plane, int qindex) const;
+
+ // Returns the quantizer value for the ac coefficient for the given plane.
+ // The caller should call GetQIndex() with Tile::current_quantizer_index_ as
+ // the |base_qindex| argument, and pass the return value as the |qindex|
+ // argument to this method.
+ int GetAcValue(Plane plane, int qindex) const;
+
+ private:
+ const QuantizerParameters& params_;
+ const int16_t* dc_lookup_;
+ const int16_t* ac_lookup_;
+};
+
+// Initializes the quantizer matrix.
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix);
+
+// Gets the quantizer index for the |index|th segment.
+//
+// This function has two use cases. What should be passed as the |base_qindex|
+// argument depends on the use case.
+// 1. While parsing the uncompressed header or transform type, pass
+//    QuantizerParameters::base_index.
+// Note: In this use case, the caller only cares about whether the return
+// value is zero.
+// 2. To generate the |qindex| argument to Quantizer::GetDcValue() or
+//    Quantizer::GetAcValue(), pass Tile::current_quantizer_index_.
+int GetQIndex(const Segmentation& segmentation, int index, int base_qindex);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_QUANTIZER_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file exists only to keep the quantizer table definitions separate
+// from the quantizer functions.
+
+constexpr uint8_t kQuantizerMatrix4x8
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][32] = {
+ {{32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84,
+ 91, 49, 71, 103, 110, 65, 84, 125, 128, 80, 97,
+ 142, 152, 91, 100, 145, 178, 104, 112, 146, 190},
+ {31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73,
+ 54, 65, 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105}},
+ {{32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77,
+ 88, 46, 67, 93, 105, 60, 79, 112, 122, 75, 92,
+ 130, 144, 86, 95, 136, 167, 98, 105, 136, 177},
+ {31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72,
+ 52, 64, 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102}},
+ {{32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68,
+ 85, 44, 61, 85, 101, 54, 69, 98, 117, 72, 84,
+ 118, 136, 82, 89, 129, 157, 92, 98, 127, 165},
+ {31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71,
+ 50, 59, 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99}},
+ {{32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65,
+ 82, 41, 53, 78, 97, 51, 61, 92, 111, 65, 73,
+ 108, 129, 75, 81, 117, 148, 86, 92, 119, 154},
+ {31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70,
+ 49, 55, 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95}},
+ {{32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87,
+ 48, 59, 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144},
+ {31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65,
+ 47, 54, 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93}},
+ {{32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84,
+ 45, 56, 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136},
+ {31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64,
+ 46, 54, 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90}},
+ {{32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68,
+ 41, 48, 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111},
+ {31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57,
+ 45, 49, 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79}},
+ {{32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59,
+ 38, 40, 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97},
+ {31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53,
+ 46, 47, 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73}},
+ {{32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54,
+ 35, 38, 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83},
+ {31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50,
+ 47, 48, 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67}},
+ {{31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42,
+ 34, 37, 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67},
+ {31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45,
+ 43, 47, 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59}},
+ {{31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40,
+ 33, 34, 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56},
+ {31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46,
+ 40, 44, 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55}},
+ {{31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36,
+ 32, 33, 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48},
+ {31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47,
+ 37, 39, 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53}},
+ {{31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 32, 32, 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36},
+ {31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40,
+ 34, 36, 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47}},
+ {{31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33},
+ {31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36,
+ 31, 32, 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40}},
+ {{31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix4x16
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][64] = {
+ {{31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34,
+ 48, 73, 83, 34, 54, 78, 89, 41, 63, 90, 95, 45, 67,
+ 96, 102, 54, 75, 110, 111, 60, 79, 118, 123, 72, 90, 133,
+ 135, 75, 92, 136, 149, 83, 100, 142, 160, 88, 100, 140, 173,
+ 94, 101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197},
+ {31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60,
+ 46, 53, 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74,
+ 52, 64, 82, 80, 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96,
+ 63, 69, 89, 101, 65, 68, 89, 103, 67, 70, 86, 105, 69, 72, 88, 107}},
+ {{31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33,
+ 44, 66, 81, 34, 54, 74, 86, 37, 58, 79, 92, 44, 66,
+ 90, 98, 49, 71, 99, 107, 56, 77, 107, 117, 65, 84, 119,
+ 129, 72, 90, 127, 141, 78, 95, 133, 151, 84, 95, 132, 163,
+ 89, 95, 136, 169, 95, 101, 132, 175, 101, 108, 141, 183},
+ {31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59,
+ 46, 53, 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73,
+ 51, 63, 77, 78, 54, 65, 82, 84, 57, 67, 86, 89, 60, 69, 88, 93,
+ 62, 67, 86, 98, 64, 66, 87, 100, 65, 68, 83, 102, 67, 70, 86, 103}},
+ {{31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33,
+ 40, 58, 78, 34, 47, 65, 83, 37, 54, 73, 89, 41, 58,
+ 79, 94, 46, 62, 86, 102, 53, 68, 97, 112, 60, 73, 105,
+ 123, 65, 78, 111, 134, 74, 85, 120, 143, 79, 90, 125, 154,
+ 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131, 170},
+ {31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58,
+ 44, 51, 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71,
+ 49, 58, 73, 77, 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91,
+ 60, 66, 84, 95, 62, 64, 84, 97, 64, 66, 81, 99, 65, 68, 83, 100}},
+ {{31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33,
+ 38, 56, 76, 34, 42, 61, 81, 34, 48, 66, 85, 39, 51,
+ 74, 91, 44, 56, 82, 98, 49, 60, 90, 107, 54, 63, 95,
+ 117, 60, 68, 102, 127, 68, 75, 110, 135, 75, 81, 117, 145,
+ 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159},
+ {31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57,
+ 43, 50, 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70,
+ 48, 54, 70, 75, 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89,
+ 58, 61, 81, 93, 60, 63, 82, 94, 62, 64, 79, 96, 63, 66, 81, 97}},
+ {{31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32,
+ 37, 49, 71, 33, 41, 53, 74, 34, 48, 60, 80, 37, 50,
+ 65, 85, 41, 53, 71, 91, 45, 56, 76, 98, 49, 60, 82,
+ 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75, 100, 130,
+ 74, 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148},
+ {31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56,
+ 42, 49, 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68,
+ 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83,
+ 56, 59, 73, 87, 58, 61, 75, 90, 60, 62, 76, 92, 62, 64, 78, 94}},
+ {{31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71,
+ 33, 38, 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84,
+ 41, 53, 71, 90, 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110,
+ 60, 68, 92, 118, 65, 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136},
+ {31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56,
+ 40, 47, 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64,
+ 45, 53, 61, 67, 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78,
+ 52, 56, 70, 82, 54, 58, 72, 85, 57, 60, 75, 89, 59, 61, 75, 90}},
+ {{31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59,
+ 32, 37, 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71,
+ 39, 46, 60, 76, 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92,
+ 54, 58, 76, 98, 60, 63, 80, 105, 66, 68, 85, 111, 73, 74, 91, 118},
+ {31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51,
+ 40, 47, 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58,
+ 46, 49, 57, 61, 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71,
+ 50, 52, 63, 73, 52, 53, 64, 76, 55, 55, 66, 79, 58, 58, 68, 82}},
+ {{31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53,
+ 32, 34, 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63,
+ 37, 40, 57, 67, 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79,
+ 50, 50, 71, 86, 54, 53, 74, 90, 57, 56, 77, 93, 61, 58, 79, 97},
+ {31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49,
+ 37, 44, 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55,
+ 46, 47, 55, 58, 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65,
+ 48, 47, 61, 68, 50, 48, 62, 70, 51, 49, 63, 71, 53, 50, 64, 73}},
+ {{31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49,
+ 32, 34, 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54,
+ 35, 38, 49, 60, 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71,
+ 45, 45, 58, 75, 47, 47, 60, 77, 51, 50, 63, 82, 55, 54, 67, 87},
+ {31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48,
+ 35, 43, 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50,
+ 47, 48, 53, 54, 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61,
+ 47, 46, 55, 63, 48, 47, 55, 64, 49, 47, 56, 66, 51, 49, 57, 68}},
+ {{31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41,
+ 32, 34, 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44,
+ 34, 37, 42, 48, 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58,
+ 40, 41, 51, 60, 42, 43, 53, 63, 45, 45, 56, 66, 46, 46, 56, 67},
+ {31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45,
+ 34, 42, 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47,
+ 42, 47, 50, 49, 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56,
+ 47, 46, 53, 57, 46, 46, 53, 58, 48, 46, 54, 59, 48, 46, 54, 59}},
+ {{31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37,
+ 32, 32, 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40,
+ 32, 34, 37, 40, 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51,
+ 35, 36, 43, 51, 38, 39, 45, 54, 38, 39, 45, 54, 42, 42, 48, 58},
+ {31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45,
+ 31, 35, 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46,
+ 38, 43, 47, 46, 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53,
+ 48, 47, 50, 53, 47, 46, 50, 54, 47, 46, 50, 54, 47, 45, 49, 56}},
+ {{31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35,
+ 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36,
+ 32, 32, 34, 37, 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41,
+ 34, 34, 37, 42, 34, 34, 37, 44, 35, 34, 38, 48, 35, 34, 38, 48},
+ {31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46,
+ 31, 32, 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47,
+ 36, 37, 44, 47, 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49,
+ 42, 43, 47, 50, 44, 44, 47, 51, 48, 46, 48, 53, 48, 46, 48, 53}},
+ {{31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33,
+ 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35,
+ 32, 33, 34, 35, 32, 33, 34, 35, 33, 33, 35, 36, 34, 34, 36, 37},
+ {31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38,
+ 31, 32, 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40,
+ 33, 35, 40, 42, 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45,
+ 38, 40, 45, 47, 38, 40, 45, 47, 39, 41, 45, 47, 42, 43, 46, 47}},
+ {{31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+ 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33},
+ {31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35,
+ 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36,
+ 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37,
+ 33, 35, 35, 39, 34, 36, 36, 40, 34, 36, 36, 40, 34, 36, 36, 40}},
+ {{31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 30, 31, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix8x16
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][128] = {
+ {{32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59,
+ 78, 86, 93, 32, 34, 36, 50, 59, 77, 82, 89, 34, 37,
+ 42, 54, 63, 79, 80, 88, 36, 38, 48, 60, 68, 84, 86,
+ 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56, 76,
+ 85, 102, 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65,
+ 58, 68, 92, 105, 124, 122, 124, 79, 70, 79, 104, 118, 141,
+ 135, 135, 82, 72, 81, 106, 121, 144, 149, 146, 91, 80, 88,
+ 106, 130, 148, 162, 159, 97, 86, 94, 107, 128, 157, 167, 171,
+ 103, 93, 98, 114, 131, 150, 174, 186, 110, 100, 101, 117, 138,
+ 161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203},
+ {32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66,
+ 33, 43, 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62,
+ 49, 48, 53, 54, 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66,
+ 50, 46, 54, 64, 67, 73, 72, 70, 54, 49, 55, 68, 73, 80, 76, 75,
+ 57, 50, 56, 70, 76, 84, 80, 79, 63, 55, 60, 75, 82, 92, 87, 84,
+ 64, 56, 61, 75, 83, 93, 93, 89, 68, 59, 64, 74, 86, 94, 98, 94,
+ 70, 62, 66, 73, 83, 96, 99, 98, 72, 64, 66, 75, 83, 92, 101, 104,
+ 74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82, 91, 101, 109}},
+ {{32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60,
+ 72, 84, 90, 32, 34, 36, 45, 59, 71, 80, 87, 32, 35,
+ 40, 47, 60, 71, 78, 85, 36, 37, 48, 56, 68, 78, 83,
+ 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56, 69,
+ 84, 95, 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61,
+ 56, 65, 81, 100, 113, 116, 118, 71, 64, 73, 89, 111, 125,
+ 129, 129, 79, 70, 79, 95, 118, 133, 142, 138, 86, 76, 84,
+ 100, 124, 140, 153, 150, 92, 82, 89, 101, 121, 148, 157, 161,
+ 98, 88, 93, 108, 124, 141, 163, 174, 104, 94, 95, 110, 129,
+ 151, 171, 181, 110, 100, 98, 111, 127, 147, 169, 188},
+ {32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65,
+ 33, 41, 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61,
+ 49, 48, 53, 54, 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65,
+ 50, 46, 54, 61, 66, 70, 71, 69, 52, 47, 54, 63, 71, 75, 75, 74,
+ 55, 49, 56, 65, 74, 79, 79, 78, 60, 53, 58, 68, 79, 85, 85, 82,
+ 63, 55, 60, 70, 82, 89, 91, 87, 66, 58, 62, 72, 84, 91, 95, 91,
+ 68, 60, 64, 71, 81, 94, 97, 96, 70, 62, 65, 73, 81, 89, 98, 101,
+ 72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79, 89, 98, 105}},
+ {{32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54,
+ 73, 81, 88, 32, 33, 36, 42, 53, 71, 78, 84, 32, 34,
+ 38, 42, 52, 69, 76, 82, 34, 36, 44, 50, 59, 75, 81,
+ 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53, 63,
+ 74, 90, 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57,
+ 53, 63, 74, 90, 108, 111, 113, 65, 59, 68, 79, 97, 118,
+ 123, 122, 71, 64, 73, 84, 102, 125, 135, 131, 81, 72, 80,
+ 91, 110, 135, 145, 141, 87, 77, 85, 96, 114, 140, 148, 151,
+ 92, 83, 88, 102, 117, 133, 153, 163, 98, 88, 89, 103, 121,
+ 141, 160, 169, 103, 94, 92, 103, 119, 137, 158, 175},
+ {32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65,
+ 33, 40, 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60,
+ 44, 46, 51, 51, 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64,
+ 49, 45, 53, 58, 62, 67, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72,
+ 54, 49, 55, 62, 70, 77, 77, 76, 57, 51, 56, 64, 73, 82, 83, 81,
+ 60, 53, 58, 65, 75, 85, 89, 85, 64, 57, 61, 68, 78, 89, 93, 89,
+ 66, 59, 63, 69, 79, 91, 94, 93, 68, 61, 63, 71, 79, 87, 96, 98,
+ 70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86, 95, 102}},
+ {{32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76,
+ 85, 31, 33, 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58,
+ 71, 79, 34, 35, 41, 48, 54, 63, 76, 81, 36, 36, 46, 54, 60,
+ 68, 80, 87, 41, 40, 49, 60, 67, 76, 88, 93, 47, 44, 53, 66,
+ 75, 84, 97, 101, 53, 50, 57, 71, 82, 92, 106, 108, 58, 54, 61,
+ 75, 87, 98, 112, 116, 65, 59, 66, 79, 92, 105, 120, 124, 74, 67,
+ 73, 86, 100, 113, 131, 134, 82, 73, 79, 92, 105, 120, 139, 142, 87,
+ 78, 83, 96, 110, 125, 144, 153, 92, 83, 84, 97, 114, 132, 150, 157,
+ 97, 88, 86, 97, 111, 128, 147, 163},
+ {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64,
+ 33, 37, 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59,
+ 42, 44, 49, 49, 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63,
+ 48, 46, 51, 57, 59, 61, 66, 67, 50, 46, 52, 59, 63, 66, 71, 71,
+ 52, 47, 53, 61, 66, 71, 75, 74, 54, 49, 54, 62, 68, 73, 79, 79,
+ 57, 51, 55, 64, 70, 76, 83, 83, 61, 55, 58, 66, 73, 80, 87, 87,
+ 64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69, 77, 84, 93, 95,
+ 68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83, 92, 98}},
+ {{32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74,
+ 82, 31, 32, 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58,
+ 69, 77, 33, 34, 38, 44, 52, 62, 72, 78, 36, 35, 42, 51, 58,
+ 68, 78, 84, 39, 38, 44, 54, 63, 73, 84, 89, 44, 41, 46, 59,
+ 69, 79, 90, 96, 48, 45, 50, 62, 74, 85, 96, 103, 53, 49, 53,
+ 66, 79, 92, 103, 111, 58, 54, 57, 70, 84, 98, 110, 118, 66, 60,
+ 63, 75, 90, 106, 119, 126, 74, 67, 69, 81, 97, 113, 128, 134, 81,
+ 73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147,
+ 91, 82, 80, 90, 103, 119, 137, 151},
+ {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63,
+ 31, 35, 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58,
+ 41, 43, 48, 49, 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62,
+ 48, 46, 49, 54, 57, 60, 64, 65, 49, 45, 48, 56, 61, 64, 67, 69,
+ 50, 46, 49, 57, 63, 67, 71, 73, 52, 48, 50, 58, 65, 71, 75, 77,
+ 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 53, 61, 69, 77, 82, 85,
+ 61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65, 73, 82, 89, 92,
+ 66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81, 89, 95}},
+ {{32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75,
+ 31, 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71,
+ 32, 33, 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73,
+ 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84,
+ 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96,
+ 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110,
+ 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125,
+ 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136},
+ {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60,
+ 30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56,
+ 37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57,
+ 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64,
+ 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+ 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78,
+ 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85,
+ 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90}},
+ {{32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69,
+ 31, 32, 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65,
+ 32, 33, 35, 38, 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69,
+ 35, 34, 38, 47, 52, 59, 65, 73, 38, 36, 40, 49, 56, 63, 69, 77,
+ 41, 39, 41, 51, 60, 67, 74, 81, 44, 42, 43, 54, 64, 72, 79, 86,
+ 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, 50, 60, 71, 82, 90, 99,
+ 58, 54, 54, 63, 75, 87, 95, 105, 65, 60, 58, 68, 79, 92, 102, 112,
+ 71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127},
+ {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57,
+ 30, 32, 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54,
+ 37, 40, 47, 47, 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56,
+ 47, 46, 48, 52, 53, 53, 55, 58, 48, 46, 47, 53, 55, 56, 58, 61,
+ 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 62, 64, 66,
+ 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 70, 73,
+ 54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56, 64, 70, 75, 79,
+ 60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75, 80, 86}},
+ {{32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62,
+ 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59,
+ 32, 32, 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58,
+ 34, 34, 37, 41, 44, 48, 54, 63, 36, 34, 38, 46, 50, 54, 60, 68,
+ 38, 37, 40, 47, 52, 57, 64, 72, 41, 39, 41, 49, 54, 60, 67, 76,
+ 44, 41, 43, 51, 57, 63, 71, 79, 48, 45, 46, 54, 60, 67, 76, 85,
+ 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97,
+ 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105},
+ {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54,
+ 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51,
+ 35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50,
+ 42, 43, 47, 49, 50, 49, 50, 53, 49, 46, 48, 52, 53, 53, 54, 57,
+ 48, 46, 47, 51, 54, 55, 57, 59, 48, 45, 46, 51, 54, 57, 59, 61,
+ 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 64, 67,
+ 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73,
+ 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76}},
+ {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51,
+ 31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49,
+ 32, 32, 34, 34, 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49,
+ 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54,
+ 36, 34, 37, 40, 48, 54, 56, 60, 38, 36, 39, 41, 49, 56, 58, 63,
+ 39, 37, 40, 42, 50, 58, 60, 65, 44, 41, 42, 45, 53, 63, 66, 71,
+ 47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48, 57, 67, 71, 77,
+ 53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75, 79, 87},
+ {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50,
+ 30, 32, 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48,
+ 33, 36, 41, 44, 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47,
+ 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50,
+ 49, 46, 48, 49, 53, 53, 54, 54, 48, 46, 47, 48, 53, 55, 55, 56,
+ 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 45, 47, 53, 58, 59, 61,
+ 50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48, 54, 60, 61, 64,
+ 52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62, 65, 68}},
+ {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45,
+ 31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44,
+ 31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45,
+ 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47,
+ 34, 34, 35, 37, 41, 42, 48, 50, 35, 34, 36, 38, 45, 47, 52, 55,
+ 36, 34, 36, 38, 46, 48, 54, 56, 39, 37, 39, 40, 48, 50, 58, 60,
+ 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66,
+ 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70},
+ {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48,
+ 31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46,
+ 33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46,
+ 37, 40, 43, 47, 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47,
+ 42, 43, 44, 47, 49, 50, 49, 50, 47, 46, 46, 48, 51, 52, 53, 53,
+ 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 51, 53, 56, 56,
+ 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59,
+ 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61}},
+ {{32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42,
+ 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41,
+ 31, 32, 32, 33, 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42,
+ 32, 32, 32, 34, 34, 36, 36, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+ 32, 33, 33, 35, 35, 38, 38, 42, 34, 34, 34, 37, 37, 42, 42, 48,
+ 34, 34, 34, 37, 37, 42, 42, 48, 36, 34, 34, 38, 38, 48, 48, 54,
+ 36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40, 40, 50, 50, 58,
+ 39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53, 53, 63},
+ {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47,
+ 31, 31, 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45,
+ 30, 32, 32, 40, 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46,
+ 33, 36, 36, 43, 43, 47, 47, 46, 37, 40, 40, 47, 47, 47, 47, 45,
+ 37, 40, 40, 47, 47, 47, 47, 45, 42, 43, 43, 47, 47, 50, 50, 49,
+ 42, 43, 43, 47, 47, 50, 50, 49, 49, 46, 46, 48, 48, 53, 53, 53,
+ 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 56,
+ 48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53, 53, 58}},
+ {{32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34,
+ 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+ 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 34, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 33, 33, 33, 35, 36, 36, 40, 41,
+ 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44,
+ 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48},
+ {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47,
+ 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+ 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+ 35, 37, 37, 40, 44, 44, 46, 47, 37, 39, 40, 43, 47, 47, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 41, 42, 42, 44, 47, 47, 49, 49,
+ 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51,
+ 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 35, 35, 35,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37, 37, 38},
+ {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39,
+ 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41,
+ 30, 31, 32, 32, 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41,
+ 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 35, 37, 38, 38, 41, 45, 45, 46,
+ 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34},
+ {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36,
+ 31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 32, 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 31, 32, 33, 33, 33, 33, 36, 39,
+ 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix8x32
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][256] = {
+ {{32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82,
+ 88, 94, 31, 33, 34, 49, 59, 78, 86, 93, 31, 33, 35, 49,
+ 59, 78, 84, 90, 32, 34, 36, 50, 59, 77, 82, 89, 32, 35,
+ 38, 49, 58, 75, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88,
+ 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60, 68, 84,
+ 86, 90, 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71,
+ 79, 95, 94, 97, 46, 44, 55, 73, 82, 98, 98, 99, 48, 46,
+ 56, 76, 85, 102, 105, 105, 53, 50, 60, 82, 92, 109, 107, 107,
+ 58, 54, 63, 87, 98, 116, 112, 115, 61, 56, 66, 89, 101, 120,
+ 119, 116, 65, 58, 68, 92, 105, 124, 122, 124, 71, 63, 73, 97,
+ 111, 132, 130, 127, 79, 70, 79, 104, 118, 141, 135, 135, 81, 71,
+ 80, 105, 119, 142, 140, 139, 82, 72, 81, 106, 121, 144, 149, 146,
+ 88, 77, 85, 108, 126, 149, 153, 152, 91, 80, 88, 106, 130, 148,
+ 162, 159, 94, 83, 91, 105, 131, 153, 165, 166, 97, 86, 94, 107,
+ 128, 157, 167, 171, 100, 89, 97, 111, 127, 152, 173, 182, 103, 93,
+ 98, 114, 131, 150, 174, 186, 107, 96, 100, 117, 136, 155, 177, 191,
+ 110, 100, 101, 117, 138, 161, 183, 193, 114, 104, 103, 117, 137, 159,
+ 185, 201, 118, 107, 105, 118, 136, 157, 182, 203, 122, 111, 107, 119,
+ 136, 156, 179, 204},
+ {32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67,
+ 30, 40, 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64,
+ 33, 43, 47, 47, 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62,
+ 42, 47, 50, 50, 53, 60, 59, 62, 45, 47, 51, 52, 55, 61, 61, 61,
+ 49, 48, 53, 54, 57, 62, 62, 62, 48, 47, 53, 57, 60, 66, 65, 64,
+ 49, 46, 53, 61, 64, 69, 66, 66, 49, 46, 53, 62, 65, 71, 68, 67,
+ 50, 46, 54, 64, 67, 73, 72, 70, 52, 47, 54, 66, 71, 77, 73, 71,
+ 54, 49, 55, 68, 73, 80, 76, 75, 55, 49, 56, 69, 75, 82, 79, 76,
+ 57, 50, 56, 70, 76, 84, 80, 79, 60, 52, 58, 72, 79, 88, 84, 81,
+ 63, 55, 60, 75, 82, 92, 87, 84, 64, 55, 61, 75, 82, 92, 89, 86,
+ 64, 56, 61, 75, 83, 93, 93, 89, 67, 58, 63, 76, 85, 95, 94, 91,
+ 68, 59, 64, 74, 86, 94, 98, 94, 69, 60, 65, 72, 85, 95, 99, 97,
+ 70, 62, 66, 73, 83, 96, 99, 98, 71, 63, 67, 74, 82, 93, 102, 102,
+ 72, 64, 66, 75, 83, 92, 101, 104, 73, 65, 66, 75, 84, 93, 102, 106,
+ 74, 67, 66, 74, 84, 94, 103, 106, 75, 68, 66, 74, 83, 93, 103, 109,
+ 76, 69, 67, 73, 82, 91, 101, 109, 77, 70, 67, 73, 81, 90, 99, 108}},
+ {{32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75,
+ 86, 91, 31, 32, 35, 44, 60, 72, 84, 90, 31, 33, 35, 44,
+ 59, 71, 82, 87, 32, 34, 36, 45, 59, 71, 80, 87, 32, 35,
+ 38, 45, 58, 69, 80, 86, 32, 35, 40, 47, 60, 71, 78, 85,
+ 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56, 68, 78,
+ 83, 87, 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60,
+ 73, 84, 91, 94, 44, 42, 53, 66, 79, 90, 94, 96, 47, 45,
+ 56, 69, 84, 95, 101, 101, 49, 47, 57, 71, 86, 97, 103, 102,
+ 53, 50, 60, 75, 92, 103, 108, 110, 58, 54, 63, 79, 98, 110,
+ 114, 111, 61, 56, 65, 81, 100, 113, 116, 118, 65, 59, 68, 84,
+ 105, 118, 124, 121, 71, 64, 73, 89, 111, 125, 129, 129, 76, 68,
+ 76, 92, 115, 130, 134, 132, 79, 70, 79, 95, 118, 133, 142, 138,
+ 82, 73, 81, 97, 121, 136, 145, 144, 86, 76, 84, 100, 124, 140,
+ 153, 150, 89, 79, 87, 99, 124, 145, 156, 156, 92, 82, 89, 101,
+ 121, 148, 157, 161, 95, 85, 92, 105, 120, 143, 163, 171, 98, 88,
+ 93, 108, 124, 141, 163, 174, 101, 91, 94, 110, 128, 146, 166, 179,
+ 104, 94, 95, 110, 129, 151, 171, 181, 107, 97, 96, 110, 128, 149,
+ 173, 188, 110, 100, 98, 111, 127, 147, 169, 188, 114, 104, 100, 111,
+ 127, 145, 166, 190},
+ {32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66,
+ 30, 38, 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63,
+ 33, 41, 47, 46, 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62,
+ 39, 46, 48, 47, 51, 55, 58, 61, 42, 46, 50, 50, 53, 57, 60, 60,
+ 49, 48, 53, 54, 57, 60, 61, 61, 48, 47, 53, 55, 58, 62, 64, 63,
+ 48, 46, 53, 56, 60, 64, 65, 65, 49, 45, 53, 59, 64, 67, 67, 66,
+ 50, 46, 54, 61, 66, 70, 71, 69, 51, 47, 54, 61, 68, 71, 72, 70,
+ 52, 47, 54, 63, 71, 75, 75, 74, 54, 49, 55, 65, 73, 78, 78, 74,
+ 55, 49, 56, 65, 74, 79, 79, 78, 57, 50, 56, 66, 76, 82, 83, 79,
+ 60, 53, 58, 68, 79, 85, 85, 82, 62, 54, 60, 69, 81, 87, 87, 84,
+ 63, 55, 60, 70, 82, 89, 91, 87, 64, 56, 61, 71, 83, 90, 92, 89,
+ 66, 58, 62, 72, 84, 91, 95, 91, 67, 59, 63, 71, 83, 93, 96, 94,
+ 68, 60, 64, 71, 81, 94, 97, 96, 69, 61, 65, 72, 80, 91, 99, 100,
+ 70, 62, 65, 73, 81, 89, 98, 101, 71, 64, 65, 73, 82, 90, 99, 103,
+ 72, 65, 65, 72, 82, 92, 100, 103, 73, 66, 65, 72, 81, 90, 100, 105,
+ 74, 67, 65, 71, 79, 89, 98, 105, 75, 68, 65, 71, 78, 87, 96, 105}},
+ {{32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75,
+ 83, 88, 31, 32, 35, 41, 54, 73, 81, 88, 31, 32, 34, 41,
+ 53, 72, 79, 84, 32, 33, 36, 42, 53, 71, 78, 84, 32, 34,
+ 37, 42, 53, 70, 77, 83, 32, 34, 38, 42, 52, 69, 76, 82,
+ 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50, 59, 75,
+ 81, 84, 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58,
+ 68, 84, 88, 90, 40, 40, 51, 59, 70, 85, 91, 92, 44, 42,
+ 53, 63, 74, 90, 97, 97, 47, 45, 56, 66, 79, 95, 99, 98,
+ 49, 46, 57, 67, 81, 97, 104, 105, 53, 50, 60, 71, 86, 103,
+ 109, 106, 57, 53, 63, 74, 90, 108, 111, 113, 59, 54, 64, 75,
+ 91, 111, 119, 115, 65, 59, 68, 79, 97, 118, 123, 122, 69, 62,
+ 71, 83, 100, 122, 127, 125, 71, 64, 73, 84, 102, 125, 135, 131,
+ 79, 71, 79, 90, 109, 133, 137, 136, 81, 72, 80, 91, 110, 135,
+ 145, 141, 82, 73, 81, 92, 111, 136, 147, 147, 87, 77, 85, 96,
+ 114, 140, 148, 151, 90, 80, 87, 99, 113, 135, 153, 160, 92, 83,
+ 88, 102, 117, 133, 153, 163, 95, 85, 88, 103, 120, 137, 155, 168,
+ 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90, 103, 120, 139,
+ 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97, 93, 104,
+ 118, 135, 155, 176},
+ {32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65,
+ 31, 36, 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62,
+ 33, 40, 47, 46, 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61,
+ 37, 44, 47, 45, 48, 54, 57, 60, 42, 45, 50, 49, 51, 57, 59, 59,
+ 44, 46, 51, 51, 53, 59, 60, 61, 49, 47, 53, 53, 55, 60, 63, 62,
+ 48, 46, 53, 56, 58, 64, 64, 64, 48, 46, 53, 56, 59, 65, 66, 65,
+ 49, 45, 53, 58, 62, 67, 70, 68, 50, 46, 54, 59, 65, 70, 70, 68,
+ 51, 47, 54, 60, 65, 71, 73, 72, 52, 47, 54, 61, 68, 75, 76, 73,
+ 54, 49, 55, 62, 70, 77, 77, 76, 54, 49, 55, 62, 70, 78, 81, 77,
+ 57, 51, 56, 64, 73, 82, 83, 81, 59, 52, 58, 65, 74, 84, 85, 82,
+ 60, 53, 58, 65, 75, 85, 89, 85, 63, 56, 60, 67, 77, 89, 90, 87,
+ 64, 57, 61, 68, 78, 89, 93, 89, 64, 57, 61, 68, 78, 90, 94, 92,
+ 66, 59, 63, 69, 79, 91, 94, 93, 67, 60, 63, 70, 78, 88, 96, 97,
+ 68, 61, 63, 71, 79, 87, 96, 98, 69, 62, 63, 71, 80, 88, 96, 100,
+ 70, 63, 63, 70, 80, 89, 97, 100, 71, 64, 63, 70, 78, 88, 97, 102,
+ 72, 65, 63, 69, 77, 86, 95, 102, 73, 66, 63, 69, 76, 84, 93, 101}},
+ {{32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62,
+ 78, 86, 31, 32, 34, 41, 50, 61, 76, 85, 31, 32, 34, 41,
+ 49, 59, 74, 82, 31, 33, 35, 42, 49, 59, 73, 81, 32, 33,
+ 36, 42, 50, 59, 73, 80, 32, 34, 37, 42, 49, 58, 71, 79,
+ 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48, 54, 63,
+ 76, 81, 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54,
+ 60, 68, 80, 87, 39, 39, 48, 58, 65, 73, 86, 88, 41, 40,
+ 49, 60, 67, 76, 88, 93, 44, 42, 51, 63, 71, 79, 92, 94,
+ 47, 44, 53, 66, 75, 84, 97, 101, 48, 45, 54, 67, 76, 85,
+ 98, 101, 53, 50, 57, 71, 82, 92, 106, 108, 55, 51, 59, 72,
+ 84, 94, 108, 110, 58, 54, 61, 75, 87, 98, 112, 116, 63, 58,
+ 65, 78, 91, 103, 118, 119, 65, 59, 66, 79, 92, 105, 120, 124,
+ 71, 64, 71, 84, 97, 111, 127, 129, 74, 67, 73, 86, 100, 113,
+ 131, 134, 79, 71, 77, 90, 104, 118, 136, 139, 82, 73, 79, 92,
+ 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139, 150, 87, 78,
+ 83, 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145, 157,
+ 92, 83, 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130,
+ 151, 163, 97, 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97,
+ 110, 126, 144, 163},
+ {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64,
+ 31, 34, 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61,
+ 33, 37, 46, 45, 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60,
+ 37, 43, 47, 45, 47, 50, 55, 59, 39, 43, 48, 47, 48, 51, 56, 58,
+ 42, 44, 49, 49, 50, 53, 58, 60, 47, 46, 51, 53, 53, 56, 61, 61,
+ 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 56, 57, 60, 64, 64,
+ 48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58, 61, 64, 68, 67,
+ 50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67, 71, 71,
+ 52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75,
+ 54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80,
+ 57, 51, 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85,
+ 61, 55, 58, 66, 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89,
+ 64, 57, 60, 68, 75, 83, 91, 91, 64, 58, 60, 68, 75, 83, 91, 94,
+ 66, 59, 61, 69, 77, 84, 93, 95, 67, 60, 61, 69, 78, 85, 93, 97,
+ 68, 61, 61, 68, 77, 86, 94, 97, 69, 62, 61, 68, 76, 85, 94, 99,
+ 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, 74, 82, 90, 98}},
+ {{32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62,
+ 75, 83, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32, 33, 38,
+ 47, 59, 72, 79, 31, 32, 34, 38, 47, 59, 71, 79, 32, 33,
+ 35, 39, 48, 59, 71, 78, 32, 33, 36, 40, 48, 58, 69, 77,
+ 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44, 52, 62,
+ 72, 78, 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51,
+ 58, 68, 78, 84, 36, 35, 42, 51, 59, 68, 79, 85, 39, 38,
+ 44, 54, 63, 73, 84, 89, 40, 39, 45, 56, 65, 75, 85, 90,
+ 44, 41, 46, 59, 69, 79, 90, 96, 46, 43, 48, 60, 72, 82,
+ 93, 97, 48, 45, 50, 62, 74, 85, 96, 103, 52, 48, 52, 65,
+ 78, 90, 101, 105, 53, 49, 53, 66, 79, 92, 103, 111, 58, 53,
+ 57, 69, 83, 97, 109, 113, 58, 54, 57, 70, 84, 98, 110, 118,
+ 65, 59, 62, 74, 89, 105, 118, 122, 66, 60, 63, 75, 90, 106,
+ 119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74, 67, 69, 81,
+ 97, 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81, 73,
+ 75, 86, 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147,
+ 86, 78, 78, 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122,
+ 140, 152, 91, 82, 80, 90, 103, 119, 137, 151, 93, 85, 81, 90,
+ 103, 117, 134, 152},
+ {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63,
+ 31, 33, 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60,
+ 31, 35, 43, 46, 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59,
+ 35, 39, 46, 46, 47, 50, 55, 58, 37, 41, 47, 46, 46, 50, 54, 57,
+ 41, 43, 48, 49, 49, 52, 57, 59, 42, 43, 48, 49, 50, 53, 57, 60,
+ 49, 47, 50, 53, 54, 57, 60, 62, 49, 47, 50, 53, 54, 57, 61, 63,
+ 48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55, 58, 61, 65, 66,
+ 49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65, 69, 70,
+ 50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74,
+ 52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78,
+ 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83,
+ 57, 52, 53, 61, 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87,
+ 61, 55, 56, 63, 72, 80, 86, 88, 63, 57, 57, 64, 73, 82, 89, 92,
+ 64, 58, 58, 65, 73, 82, 89, 92, 64, 58, 58, 65, 74, 83, 90, 94,
+ 66, 59, 59, 66, 75, 84, 91, 94, 67, 60, 59, 66, 74, 82, 91, 96,
+ 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, 71, 79, 87, 95}},
+ {{32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75,
+ 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72,
+ 31, 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71,
+ 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38, 42, 49, 58, 69,
+ 32, 33, 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73,
+ 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60, 68, 78,
+ 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84,
+ 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90,
+ 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96,
+ 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103,
+ 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110,
+ 58, 54, 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118,
+ 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125,
+ 71, 65, 63, 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133,
+ 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136,
+ 82, 75, 72, 81, 92, 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141},
+ {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60,
+ 31, 31, 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57,
+ 30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56,
+ 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47, 45, 47, 50, 54,
+ 37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57,
+ 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54, 57, 60,
+ 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64,
+ 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67,
+ 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+ 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75,
+ 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78,
+ 54, 50, 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82,
+ 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85,
+ 60, 54, 52, 58, 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89,
+ 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90,
+ 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, 69, 77, 84, 92}},
+ {{32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70,
+ 31, 32, 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67,
+ 31, 32, 33, 34, 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66,
+ 32, 32, 34, 36, 42, 50, 57, 65, 32, 32, 34, 37, 42, 49, 56, 65,
+ 32, 33, 35, 38, 42, 49, 56, 64, 32, 33, 35, 39, 43, 50, 56, 64,
+ 34, 34, 37, 42, 48, 54, 61, 69, 34, 34, 37, 42, 48, 54, 61, 69,
+ 35, 34, 38, 47, 52, 59, 65, 73, 36, 34, 38, 48, 54, 60, 66, 74,
+ 38, 36, 40, 49, 56, 63, 69, 77, 39, 37, 40, 50, 58, 65, 71, 79,
+ 41, 39, 41, 51, 60, 67, 74, 81, 44, 41, 43, 53, 63, 71, 78, 85,
+ 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91,
+ 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, 49, 59, 71, 81, 89, 98,
+ 53, 49, 50, 60, 71, 82, 90, 99, 57, 52, 52, 62, 74, 85, 94, 103,
+ 58, 54, 54, 63, 75, 87, 95, 105, 61, 57, 56, 66, 77, 89, 98, 108,
+ 65, 60, 58, 68, 79, 92, 102, 112, 67, 61, 60, 69, 81, 94, 103, 114,
+ 71, 65, 63, 73, 84, 97, 108, 119, 72, 66, 64, 73, 85, 98, 108, 119,
+ 79, 72, 70, 79, 90, 104, 115, 127, 79, 72, 70, 79, 90, 104, 115, 127},
+ {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58,
+ 31, 31, 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56,
+ 30, 32, 40, 46, 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54,
+ 33, 36, 43, 47, 46, 47, 50, 54, 34, 37, 44, 47, 45, 47, 50, 53,
+ 37, 40, 47, 47, 45, 47, 49, 52, 37, 40, 47, 48, 46, 47, 49, 53,
+ 42, 43, 47, 50, 49, 50, 53, 56, 42, 43, 47, 50, 49, 50, 53, 56,
+ 47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53, 53, 54, 56, 59,
+ 48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57, 59, 62,
+ 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66,
+ 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69,
+ 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72,
+ 52, 48, 47, 54, 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75,
+ 54, 50, 49, 55, 62, 68, 72, 76, 55, 51, 49, 56, 63, 69, 74, 78,
+ 57, 52, 50, 56, 64, 70, 75, 79, 58, 53, 51, 57, 64, 71, 76, 80,
+ 60, 54, 52, 58, 65, 72, 77, 82, 60, 55, 53, 59, 65, 73, 78, 83,
+ 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, 67, 75, 80, 86}},
+ {{32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63,
+ 31, 32, 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61,
+ 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59,
+ 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36, 38, 42, 50, 59,
+ 32, 32, 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58,
+ 32, 33, 35, 37, 40, 42, 49, 58, 33, 33, 36, 40, 43, 46, 53, 62,
+ 34, 34, 37, 41, 44, 48, 54, 63, 34, 34, 37, 43, 46, 50, 56, 65,
+ 36, 34, 38, 46, 50, 54, 60, 68, 36, 34, 38, 46, 50, 54, 60, 68,
+ 38, 37, 40, 47, 52, 57, 64, 72, 39, 37, 40, 48, 53, 58, 65, 73,
+ 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79,
+ 44, 41, 43, 51, 57, 63, 71, 79, 47, 44, 45, 53, 59, 66, 75, 84,
+ 48, 45, 46, 54, 60, 67, 76, 85, 50, 46, 47, 55, 61, 68, 78, 88,
+ 53, 49, 50, 57, 64, 71, 82, 92, 53, 49, 50, 57, 64, 71, 82, 92,
+ 57, 53, 53, 60, 67, 74, 86, 97, 58, 54, 54, 61, 68, 75, 87, 98,
+ 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105,
+ 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, 70, 76, 83, 96, 109},
+ {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55,
+ 31, 31, 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53,
+ 30, 32, 40, 44, 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52,
+ 33, 35, 42, 46, 46, 45, 47, 51, 33, 36, 43, 46, 46, 46, 47, 51,
+ 35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50,
+ 37, 40, 47, 47, 47, 45, 47, 50, 41, 42, 47, 49, 49, 48, 50, 52,
+ 42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50, 51, 51, 52, 54,
+ 49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53, 54, 57,
+ 48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60,
+ 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64,
+ 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66,
+ 50, 46, 46, 52, 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68,
+ 52, 48, 47, 53, 57, 61, 66, 71, 52, 48, 47, 53, 57, 61, 66, 71,
+ 54, 49, 48, 54, 58, 62, 68, 73, 54, 50, 49, 54, 58, 62, 68, 73,
+ 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76,
+ 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57, 61, 65, 72, 78}},
+ {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52,
+ 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51,
+ 31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49,
+ 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 33, 34, 36, 42, 45, 49,
+ 32, 32, 34, 34, 36, 42, 45, 50, 32, 32, 34, 35, 37, 42, 45, 49,
+ 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49,
+ 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54,
+ 34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50, 53, 57,
+ 36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60,
+ 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65,
+ 39, 37, 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69,
+ 44, 41, 42, 45, 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72,
+ 47, 44, 45, 47, 56, 66, 69, 75, 48, 45, 46, 48, 56, 67, 70, 76,
+ 49, 46, 47, 48, 57, 67, 71, 77, 53, 49, 49, 51, 59, 71, 74, 81,
+ 53, 49, 50, 51, 60, 71, 75, 82, 55, 51, 51, 53, 61, 72, 76, 83,
+ 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, 63, 75, 79, 87},
+ {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50,
+ 31, 31, 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50,
+ 30, 32, 38, 40, 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48,
+ 31, 33, 38, 41, 46, 45, 46, 48, 33, 35, 41, 43, 47, 45, 46, 47,
+ 33, 36, 41, 44, 47, 46, 46, 47, 34, 37, 42, 45, 47, 45, 46, 47,
+ 37, 40, 45, 47, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47,
+ 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50,
+ 42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51, 52, 52,
+ 49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54,
+ 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57,
+ 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60,
+ 49, 45, 45, 47, 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61,
+ 50, 46, 46, 48, 54, 59, 61, 63, 50, 46, 46, 48, 54, 59, 61, 64,
+ 51, 47, 47, 48, 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66,
+ 52, 48, 47, 48, 54, 61, 63, 66, 53, 48, 48, 49, 54, 61, 63, 67,
+ 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, 55, 62, 65, 68}},
+ {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46,
+ 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45,
+ 31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44,
+ 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 35, 41, 44,
+ 31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45,
+ 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 35, 37, 37, 42, 45,
+ 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45,
+ 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50,
+ 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51,
+ 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56,
+ 36, 34, 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58,
+ 39, 37, 39, 40, 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60,
+ 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66,
+ 44, 41, 42, 43, 51, 53, 63, 66, 44, 42, 42, 43, 51, 54, 64, 67,
+ 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70,
+ 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, 56, 58, 69, 73},
+ {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49,
+ 31, 31, 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48,
+ 31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46,
+ 30, 32, 35, 40, 44, 46, 45, 46, 31, 33, 35, 40, 45, 46, 45, 46,
+ 33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46,
+ 33, 36, 38, 43, 46, 47, 46, 46, 35, 38, 41, 45, 47, 47, 45, 46,
+ 37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46,
+ 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50,
+ 42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50,
+ 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54,
+ 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55,
+ 48, 46, 46, 47, 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56,
+ 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59,
+ 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 52, 53, 58, 60,
+ 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61,
+ 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, 52, 54, 60, 62}},
+ {{32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43,
+ 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42,
+ 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41,
+ 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41,
+ 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 35, 35, 41,
+ 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42,
+ 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 37, 37, 42,
+ 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+ 32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45,
+ 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48,
+ 34, 34, 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50,
+ 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54,
+ 36, 34, 34, 38, 38, 48, 48, 54, 37, 36, 36, 39, 39, 49, 49, 56,
+ 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58,
+ 39, 37, 37, 40, 40, 50, 50, 58, 41, 39, 39, 42, 42, 52, 52, 60,
+ 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, 43, 53, 53, 63},
+ {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48,
+ 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47,
+ 31, 31, 31, 38, 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46,
+ 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45,
+ 30, 32, 32, 40, 40, 46, 46, 45, 32, 34, 34, 41, 41, 46, 46, 45,
+ 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46,
+ 33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45, 45, 47, 47, 45,
+ 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45,
+ 37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47,
+ 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49,
+ 42, 43, 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51,
+ 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53,
+ 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 54,
+ 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56,
+ 48, 46, 46, 47, 47, 53, 53, 56, 48, 45, 45, 46, 46, 53, 53, 57,
+ 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, 46, 53, 53, 58}},
+ {{32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 32, 34, 35,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 33, 33, 35, 35,
+ 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+ 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+ 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40,
+ 33, 33, 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42,
+ 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 41, 42,
+ 34, 34, 34, 35, 37, 37, 43, 44, 35, 34, 34, 36, 38, 38, 45, 47,
+ 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48,
+ 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, 39, 39, 46, 49},
+ {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48,
+ 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47,
+ 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47,
+ 31, 31, 32, 34, 39, 39, 45, 46, 30, 31, 32, 34, 39, 39, 44, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 31, 33, 33, 36, 41, 41, 45, 46,
+ 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+ 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+ 35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48,
+ 41, 42, 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50,
+ 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, 49, 50,
+ 44, 44, 44, 45, 47, 47, 50, 51, 47, 46, 46, 46, 48, 48, 51, 52,
+ 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53,
+ 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 47, 47, 52, 53}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33,
+ 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33, 33, 34,
+ 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35,
+ 32, 32, 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 33, 33, 33, 34, 36, 36, 36, 33, 33, 33, 33, 34, 36, 36, 37,
+ 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, 35, 37, 37, 38},
+ {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39,
+ 31, 31, 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 34, 39, 39, 40,
+ 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41,
+ 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41,
+ 31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41, 41, 42,
+ 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45,
+ 35, 37, 38, 38, 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47,
+ 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 39, 40, 41, 41, 43, 47, 47, 47, 40, 41, 42, 42, 44, 47, 47, 47,
+ 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34},
+ {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35,
+ 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36,
+ 31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37,
+ 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38,
+ 31, 32, 33, 33, 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40,
+ 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, 36, 36, 39, 42}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32,
+ 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32,
+ 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}};
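+// Quantizer matrices for 16x32 transform blocks: for each quantizer level
+// there is one 512-entry (16*32) matrix per plane type. (Descriptive comment;
+// the layout is inferred from the array bounds declared below, and the plane
+// types are presumably luma and chroma as elsewhere in AV1.)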
+constexpr uint8_t kQuantizerMatrix16x32
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][512] = {
+ {{32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96,
+ 99, 102, 31, 32, 32, 34, 35, 42, 51, 56, 62, 75, 82, 85,
+ 88, 91, 94, 97, 31, 32, 33, 33, 34, 41, 49, 54, 59, 72,
+ 78, 82, 86, 90, 93, 97, 31, 32, 33, 34, 35, 41, 49, 54,
+ 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34, 35, 36, 42,
+ 50, 54, 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37,
+ 38, 42, 49, 53, 58, 69, 75, 78, 82, 86, 89, 92, 34, 34,
+ 37, 39, 42, 48, 54, 58, 63, 73, 79, 78, 80, 83, 88, 92,
+ 35, 34, 37, 41, 45, 50, 57, 61, 65, 76, 82, 83, 84, 84,
+ 87, 90, 36, 34, 38, 43, 48, 54, 60, 64, 68, 78, 84, 87,
+ 86, 89, 90, 90, 39, 37, 40, 45, 50, 58, 65, 69, 73, 84,
+ 89, 89, 91, 91, 93, 96, 44, 41, 43, 48, 53, 63, 71, 75,
+ 79, 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49, 55, 65,
+ 73, 78, 82, 93, 98, 100, 98, 100, 99, 103, 48, 45, 46, 51,
+ 56, 67, 76, 80, 85, 96, 102, 102, 105, 102, 105, 104, 53, 49,
+ 50, 54, 60, 71, 82, 87, 92, 103, 109, 107, 107, 110, 107, 111,
+ 58, 54, 54, 58, 63, 75, 87, 92, 98, 110, 116, 115, 112, 111,
+ 115, 112, 61, 57, 56, 60, 66, 77, 89, 95, 101, 114, 120, 118,
+ 119, 118, 116, 120, 65, 60, 58, 63, 68, 79, 92, 98, 105, 118,
+ 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73, 84, 97, 103,
+ 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70, 74, 79, 90,
+ 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81, 74, 71, 75,
+ 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82, 75,
+ 72, 76, 81, 92, 106, 113, 121, 136, 144, 151, 149, 149, 146, 143,
+ 88, 80, 77, 80, 85, 97, 108, 115, 126, 142, 149, 153, 153, 152,
+ 152, 154, 91, 83, 80, 81, 88, 100, 106, 114, 130, 142, 148, 155,
+ 162, 160, 159, 155, 94, 85, 83, 82, 91, 100, 105, 118, 131, 137,
+ 153, 160, 165, 167, 166, 168, 97, 88, 86, 85, 94, 100, 107, 123,
+ 128, 140, 157, 161, 167, 173, 171, 169, 100, 91, 89, 87, 97, 100,
+ 111, 121, 127, 145, 152, 164, 173, 178, 182, 181, 103, 94, 93, 90,
+ 98, 101, 114, 120, 131, 144, 150, 170, 174, 180, 186, 183, 107, 97,
+ 96, 93, 100, 104, 117, 119, 136, 142, 155, 168, 177, 187, 191, 198,
+ 110, 101, 100, 97, 101, 108, 117, 123, 138, 141, 161, 165, 183, 188,
+ 193, 200, 114, 104, 104, 100, 103, 112, 117, 127, 137, 146, 159, 167,
+ 185, 190, 201, 206, 118, 108, 107, 103, 105, 115, 118, 131, 136, 151,
+ 157, 172, 182, 197, 203, 208, 122, 111, 111, 107, 107, 119, 119, 136,
+ 136, 156, 156, 178, 179, 203, 204, 217},
+ {32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72,
+ 31, 31, 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68,
+ 30, 32, 40, 42, 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68,
+ 32, 34, 41, 44, 46, 45, 48, 49, 51, 57, 59, 61, 62, 63, 64, 65,
+ 33, 36, 43, 45, 47, 46, 47, 49, 51, 56, 59, 60, 60, 62, 63, 65,
+ 37, 40, 47, 47, 47, 45, 47, 48, 50, 54, 57, 58, 60, 61, 62, 63,
+ 42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58, 59, 60, 62, 63,
+ 45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61, 61, 60, 61, 61,
+ 49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63, 62, 63, 62, 62,
+ 48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65, 65, 64, 64, 65,
+ 49, 45, 46, 49, 53, 58, 61, 62, 64, 67, 69, 67, 66, 66, 66, 65,
+ 49, 46, 46, 49, 53, 59, 62, 64, 65, 69, 71, 70, 68, 68, 67, 68,
+ 50, 46, 46, 50, 54, 59, 64, 65, 67, 71, 73, 72, 72, 70, 70, 69,
+ 52, 48, 47, 50, 54, 61, 66, 68, 71, 75, 77, 74, 73, 73, 71, 72,
+ 54, 50, 49, 52, 55, 62, 68, 71, 73, 78, 80, 78, 76, 74, 75, 73,
+ 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80, 79, 78, 76, 77,
+ 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80, 79, 77,
+ 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81,
+ 63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81,
+ 64, 58, 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86,
+ 64, 59, 56, 58, 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87,
+ 67, 61, 58, 60, 63, 69, 76, 79, 85, 92, 95, 96, 94, 92, 91, 91,
+ 68, 62, 59, 60, 64, 71, 74, 78, 86, 91, 94, 96, 98, 96, 94, 91,
+ 69, 62, 60, 60, 65, 70, 72, 79, 85, 88, 95, 98, 99, 98, 97, 96,
+ 70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97, 99, 101, 98, 97,
+ 71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98, 102, 102, 102, 101,
+ 72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101, 103, 104, 102,
+ 73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98, 102, 105, 106, 107,
+ 74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96, 103, 105, 106, 107,
+ 75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96, 103, 105, 109, 109,
+ 76, 69, 69, 66, 67, 72, 73, 80, 82, 88, 91, 97, 101, 107, 109, 110,
+ 77, 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99, 99, 108, 108, 113}},
+ {{32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93,
+ 96, 99, 31, 32, 32, 33, 35, 42, 45, 51, 62, 69, 75, 83,
+ 86, 88, 91, 94, 31, 32, 32, 33, 35, 41, 44, 49, 60, 67,
+ 72, 80, 84, 87, 90, 94, 31, 32, 33, 33, 35, 41, 44, 49,
+ 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34, 36, 42,
+ 45, 50, 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36,
+ 38, 42, 45, 49, 58, 64, 69, 76, 80, 83, 86, 88, 32, 33,
+ 35, 36, 40, 44, 47, 51, 60, 66, 71, 76, 78, 81, 85, 89,
+ 34, 34, 36, 38, 42, 48, 50, 54, 63, 69, 73, 80, 82, 81,
+ 84, 86, 36, 34, 37, 40, 48, 54, 56, 60, 68, 74, 78, 84,
+ 83, 86, 87, 87, 38, 36, 39, 41, 49, 56, 58, 63, 71, 77,
+ 81, 86, 88, 88, 90, 93, 39, 37, 40, 42, 50, 58, 60, 65,
+ 73, 79, 84, 90, 91, 92, 94, 93, 44, 41, 42, 45, 53, 63,
+ 66, 71, 79, 85, 90, 96, 94, 96, 96, 99, 47, 44, 45, 47,
+ 56, 66, 69, 75, 84, 90, 95, 99, 101, 98, 101, 99, 49, 46,
+ 47, 48, 57, 67, 71, 77, 86, 93, 97, 103, 103, 105, 102, 106,
+ 53, 49, 50, 51, 60, 71, 75, 82, 92, 99, 103, 111, 108, 107,
+ 110, 107, 58, 54, 54, 55, 63, 75, 79, 87, 98, 105, 110, 114,
+ 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89, 100, 107,
+ 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92,
+ 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84,
+ 89, 97, 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69,
+ 76, 88, 92, 101, 115, 123, 130, 134, 134, 131, 132, 135, 79, 72,
+ 70, 71, 79, 90, 95, 104, 118, 127, 133, 143, 142, 141, 138, 136,
+ 82, 75, 73, 74, 81, 92, 97, 106, 121, 130, 136, 146, 145, 144,
+ 144, 145, 86, 78, 76, 77, 84, 95, 100, 109, 124, 133, 140, 147,
+ 153, 151, 150, 146, 89, 81, 79, 78, 87, 95, 99, 112, 124, 130,
+ 145, 152, 156, 157, 156, 158, 92, 84, 82, 80, 89, 95, 101, 116,
+ 121, 132, 148, 151, 157, 163, 161, 159, 95, 86, 85, 83, 92, 95,
+ 105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85,
+ 93, 95, 108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92,
+ 91, 88, 94, 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185,
+ 104, 95, 94, 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175,
+ 181, 186, 107, 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156,
+ 173, 177, 188, 192, 110, 101, 100, 97, 98, 108, 111, 123, 127, 141,
+ 147, 161, 169, 183, 188, 193, 114, 104, 104, 100, 100, 111, 111, 126,
+ 127, 145, 145, 166, 166, 189, 190, 201},
+ {32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71,
+ 31, 31, 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67,
+ 30, 32, 38, 40, 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67,
+ 31, 33, 38, 41, 46, 45, 46, 48, 52, 55, 57, 60, 61, 62, 63, 64,
+ 33, 36, 41, 44, 47, 46, 46, 47, 51, 54, 56, 59, 60, 61, 63, 64,
+ 37, 40, 45, 47, 47, 45, 46, 47, 50, 52, 54, 57, 59, 61, 62, 62,
+ 39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57, 58, 59, 61, 62,
+ 42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60, 59, 60, 60,
+ 49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61, 62, 61, 61,
+ 48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64, 63, 63, 64,
+ 48, 46, 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65, 65, 65, 64,
+ 49, 45, 45, 47, 53, 58, 59, 61, 64, 66, 67, 69, 67, 67, 66, 67,
+ 50, 46, 46, 48, 54, 59, 61, 63, 66, 68, 70, 71, 71, 68, 69, 67,
+ 51, 47, 47, 48, 54, 60, 61, 64, 68, 70, 71, 73, 72, 72, 70, 71,
+ 52, 48, 47, 48, 54, 61, 63, 66, 71, 73, 75, 77, 75, 73, 74, 71,
+ 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, 78, 76, 74, 75,
+ 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78, 78, 75,
+ 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79,
+ 60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80,
+ 62, 56, 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84,
+ 63, 57, 55, 56, 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84,
+ 64, 59, 56, 57, 61, 68, 71, 75, 83, 87, 90, 93, 92, 90, 89, 89,
+ 66, 60, 58, 58, 62, 69, 72, 76, 84, 88, 91, 94, 95, 93, 91, 89,
+ 67, 61, 59, 58, 63, 68, 71, 78, 83, 86, 93, 96, 96, 96, 94, 94,
+ 68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95, 97, 98, 96, 94,
+ 69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99, 99, 100, 98,
+ 70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98, 100, 101, 99,
+ 71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99, 102, 103, 104,
+ 72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104,
+ 73, 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106,
+ 74, 67, 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98, 103, 105, 106,
+ 75, 68, 68, 65, 65, 71, 71, 78, 78, 87, 87, 96, 96, 105, 105, 109}},
+ {{32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90,
+ 93, 96, 31, 32, 32, 32, 35, 38, 42, 51, 55, 62, 75, 77,
+ 83, 86, 88, 91, 31, 32, 32, 32, 35, 38, 41, 50, 54, 60,
+ 73, 75, 81, 84, 88, 91, 31, 32, 32, 33, 34, 37, 41, 49,
+ 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34, 36, 39,
+ 42, 50, 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34,
+ 37, 40, 42, 49, 53, 58, 70, 71, 77, 80, 83, 85, 32, 33,
+ 34, 35, 38, 40, 42, 49, 52, 58, 69, 70, 76, 78, 82, 86,
+ 34, 34, 35, 37, 42, 45, 48, 54, 57, 63, 73, 75, 79, 79,
+ 81, 83, 34, 34, 36, 37, 44, 47, 50, 56, 59, 65, 75, 77,
+ 81, 83, 84, 84, 36, 34, 37, 38, 48, 51, 54, 60, 63, 68,
+ 78, 80, 85, 85, 86, 89, 39, 37, 39, 40, 50, 54, 58, 65,
+ 68, 73, 84, 85, 88, 89, 90, 89, 40, 38, 40, 41, 51, 55,
+ 59, 67, 70, 75, 85, 87, 91, 92, 92, 95, 44, 41, 42, 43,
+ 53, 58, 63, 71, 74, 79, 90, 91, 97, 94, 97, 95, 47, 44,
+ 45, 46, 56, 61, 66, 75, 79, 85, 95, 97, 99, 101, 98, 102,
+ 49, 46, 46, 47, 57, 62, 67, 77, 81, 86, 97, 99, 104, 102,
+ 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92, 103, 105,
+ 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97,
+ 108, 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87,
+ 91, 98, 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73,
+ 79, 92, 97, 105, 118, 119, 123, 123, 122, 119, 69, 63, 62, 62,
+ 71, 76, 83, 96, 100, 109, 122, 124, 127, 125, 125, 128, 71, 65,
+ 64, 63, 73, 78, 84, 97, 102, 111, 125, 127, 135, 134, 131, 129,
+ 79, 72, 71, 70, 79, 84, 90, 104, 109, 118, 133, 135, 137, 136,
+ 136, 137, 81, 74, 72, 71, 80, 85, 91, 105, 110, 120, 135, 137,
+ 145, 143, 141, 138, 82, 75, 73, 72, 81, 86, 92, 106, 111, 121,
+ 136, 139, 147, 148, 147, 149, 87, 79, 77, 76, 85, 90, 96, 110,
+ 114, 125, 140, 143, 148, 154, 151, 149, 90, 82, 80, 78, 87, 89,
+ 99, 108, 113, 129, 135, 146, 153, 157, 160, 159, 92, 84, 83, 81,
+ 88, 90, 102, 106, 117, 128, 133, 150, 153, 158, 163, 160, 95, 87,
+ 85, 83, 88, 92, 103, 105, 120, 125, 137, 148, 155, 164, 168, 173,
+ 98, 89, 88, 85, 89, 95, 103, 108, 121, 124, 141, 144, 160, 164,
+ 169, 174, 100, 92, 91, 88, 90, 98, 103, 111, 120, 127, 139, 146,
+ 161, 165, 175, 179, 103, 94, 94, 90, 92, 101, 103, 114, 119, 131,
+ 137, 150, 158, 170, 175, 180, 106, 97, 97, 93, 93, 104, 104, 118,
+ 118, 135, 135, 154, 155, 175, 176, 187},
+ {32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69,
+ 31, 31, 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64, 65, 66,
+ 31, 32, 36, 39, 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66,
+ 30, 32, 36, 40, 46, 45, 45, 48, 49, 52, 57, 58, 60, 61, 62, 63,
+ 33, 36, 40, 43, 47, 46, 46, 47, 49, 51, 56, 57, 59, 60, 62, 63,
+ 35, 38, 42, 45, 47, 46, 45, 47, 48, 50, 55, 56, 58, 60, 61, 61,
+ 37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55, 57, 58, 60, 61,
+ 42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58, 59, 59,
+ 44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61, 61, 60,
+ 49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62, 62, 63,
+ 48, 46, 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64, 64, 63,
+ 48, 45, 46, 46, 53, 55, 56, 58, 59, 61, 65, 65, 66, 66, 65, 66,
+ 49, 45, 45, 46, 53, 56, 58, 61, 62, 64, 67, 68, 70, 67, 68, 66,
+ 50, 46, 46, 46, 54, 56, 59, 63, 65, 66, 70, 71, 70, 71, 68, 70,
+ 51, 47, 47, 47, 54, 57, 60, 64, 65, 68, 71, 72, 73, 71, 72, 70,
+ 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, 76, 75, 73, 73,
+ 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77, 76, 74,
+ 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78,
+ 57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78,
+ 59, 54, 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82,
+ 60, 54, 53, 52, 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82,
+ 63, 57, 56, 55, 60, 64, 67, 75, 77, 82, 89, 90, 90, 88, 87, 86,
+ 64, 58, 57, 55, 61, 64, 68, 75, 78, 82, 89, 90, 93, 91, 89, 87,
+ 64, 59, 57, 56, 61, 65, 68, 75, 78, 83, 90, 91, 94, 93, 92, 91,
+ 66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93, 94, 95, 93, 91,
+ 67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97, 97, 95,
+ 68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97, 98, 96,
+ 69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99, 100, 101,
+ 70, 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99, 100, 101,
+ 71, 65, 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99, 102, 103,
+ 72, 65, 65, 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103,
+ 73, 66, 66, 63, 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105}},
+ {{32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88,
+ 90, 93, 31, 32, 32, 32, 35, 35, 42, 45, 51, 59, 62, 75,
+ 78, 83, 86, 88, 31, 32, 32, 32, 34, 35, 41, 45, 50, 58,
+ 61, 74, 76, 82, 85, 88, 31, 32, 32, 33, 34, 34, 41, 44,
+ 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34, 35, 36,
+ 42, 44, 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34,
+ 36, 36, 42, 45, 50, 57, 59, 71, 73, 78, 80, 82, 32, 33,
+ 34, 35, 37, 38, 42, 45, 49, 56, 58, 69, 71, 76, 79, 83,
+ 32, 33, 34, 36, 39, 40, 44, 47, 51, 58, 60, 71, 73, 76,
+ 78, 80, 34, 34, 35, 37, 41, 42, 48, 50, 54, 61, 63, 73,
+ 76, 81, 81, 80, 35, 34, 36, 38, 45, 47, 52, 55, 59, 65,
+ 67, 77, 79, 82, 83, 86, 36, 34, 36, 38, 46, 48, 54, 56,
+ 60, 66, 68, 78, 80, 85, 87, 86, 39, 37, 39, 40, 48, 50,
+ 58, 60, 65, 71, 73, 84, 86, 89, 88, 91, 41, 39, 40, 41,
+ 49, 51, 60, 62, 67, 74, 76, 86, 88, 91, 93, 91, 44, 41,
+ 42, 43, 51, 53, 63, 66, 71, 78, 79, 90, 92, 97, 94, 97,
+ 47, 44, 44, 45, 53, 56, 66, 69, 75, 82, 84, 95, 97, 98,
+ 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96,
+ 98, 104, 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90,
+ 92, 103, 106, 107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77,
+ 84, 92, 94, 106, 108, 111, 110, 112, 58, 54, 54, 54, 61, 63,
+ 75, 79, 87, 95, 98, 110, 112, 117, 116, 113, 63, 58, 58, 57,
+ 65, 67, 78, 83, 91, 100, 103, 116, 118, 119, 119, 121, 65, 60,
+ 59, 58, 66, 68, 79, 84, 92, 102, 105, 118, 120, 127, 124, 122,
+ 71, 65, 64, 63, 71, 73, 84, 89, 97, 108, 111, 125, 127, 129,
+ 129, 130, 74, 68, 67, 66, 73, 75, 86, 91, 100, 110, 113, 128,
+ 131, 135, 134, 130, 79, 72, 71, 70, 77, 79, 90, 95, 104, 115,
+ 118, 133, 136, 140, 139, 140, 82, 75, 73, 72, 79, 81, 92, 97,
+ 105, 117, 120, 136, 139, 145, 142, 140, 82, 75, 74, 72, 79, 81,
+ 92, 97, 106, 117, 121, 136, 139, 148, 150, 149, 87, 79, 78, 76,
+ 83, 85, 96, 100, 110, 120, 125, 141, 144, 148, 153, 150, 89, 82,
+ 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145, 153, 157, 161,
+ 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150, 153,
+ 157, 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136,
+ 151, 154, 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123,
+ 128, 140, 147, 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110,
+ 110, 126, 126, 144, 144, 163, 163, 173},
+ {32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31,
+ 31, 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32,
+ 34, 39, 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35,
+ 40, 44, 46, 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42,
+ 46, 47, 45, 46, 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46,
+ 47, 46, 46, 47, 50, 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47,
+ 45, 46, 47, 49, 50, 54, 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47,
+ 47, 48, 50, 51, 55, 56, 57, 58, 59, 42, 43, 44, 47, 49, 50, 49, 50,
+ 50, 53, 53, 57, 58, 60, 60, 59, 47, 46, 46, 48, 51, 52, 53, 53, 53,
+ 55, 56, 60, 61, 61, 61, 62, 49, 46, 47, 48, 52, 53, 53, 54, 54, 56,
+ 57, 60, 61, 63, 63, 62, 48, 46, 46, 47, 51, 53, 56, 56, 57, 59, 60,
+ 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, 57, 57, 59, 61, 61, 65,
+ 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, 61, 63, 64, 67, 68,
+ 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, 66, 70, 71, 70,
+ 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71, 71, 73, 71,
+ 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, 74, 72,
+ 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, 54,
+ 50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51,
+ 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51,
+ 50, 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52,
+ 57, 58, 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58,
+ 59, 66, 69, 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60,
+ 67, 70, 75, 80, 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68,
+ 71, 75, 81, 83, 90, 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71,
+ 75, 81, 83, 90, 91, 94, 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77,
+ 82, 84, 92, 93, 94, 95, 93, 67, 61, 60, 58, 61, 63, 69, 70, 78, 80,
+ 85, 90, 93, 96, 97, 97, 68, 62, 61, 59, 61, 64, 68, 71, 77, 79, 86,
+ 88, 94, 96, 97, 98, 69, 63, 62, 59, 61, 65, 68, 72, 76, 80, 85, 88,
+ 94, 95, 99, 99, 70, 63, 63, 60, 61, 66, 67, 73, 75, 81, 83, 89, 92,
+ 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, 74, 82, 82, 90, 90, 98,
+ 98, 102}},
+ {{32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81,
+ 87, 90, 31, 32, 32, 32, 33, 35, 39, 42, 49, 51, 62, 63,
+ 75, 77, 83, 85, 31, 32, 32, 32, 33, 35, 39, 42, 49, 51,
+ 61, 62, 74, 76, 82, 85, 31, 32, 32, 33, 33, 34, 38, 41,
+ 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33, 34, 35,
+ 38, 41, 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34,
+ 35, 36, 39, 42, 48, 50, 59, 60, 71, 72, 78, 80, 32, 32,
+ 33, 35, 36, 37, 40, 42, 48, 49, 58, 59, 69, 71, 77, 80,
+ 32, 33, 33, 35, 36, 38, 41, 42, 48, 49, 58, 59, 69, 70,
+ 75, 77, 33, 33, 34, 36, 38, 41, 44, 46, 52, 53, 62, 63,
+ 72, 74, 78, 78, 34, 34, 34, 37, 39, 42, 45, 48, 53, 54,
+ 63, 64, 73, 75, 80, 83, 36, 34, 35, 38, 42, 48, 51, 54,
+ 58, 60, 68, 69, 78, 80, 84, 83, 36, 35, 35, 38, 42, 48,
+ 51, 54, 59, 60, 68, 69, 79, 80, 85, 87, 39, 37, 38, 40,
+ 44, 50, 54, 58, 63, 65, 73, 74, 84, 85, 89, 88, 40, 38,
+ 39, 41, 45, 51, 56, 59, 65, 67, 75, 76, 85, 87, 90, 93,
+ 44, 41, 41, 43, 46, 53, 59, 63, 69, 71, 79, 80, 90, 91,
+ 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83,
+ 93, 94, 97, 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76,
+ 85, 86, 96, 98, 103, 100, 52, 48, 48, 49, 52, 59, 65, 70,
+ 78, 80, 90, 91, 101, 103, 105, 107, 53, 49, 49, 50, 53, 60,
+ 66, 71, 79, 82, 92, 93, 103, 105, 111, 107, 58, 53, 53, 53,
+ 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113, 115, 58, 54,
+ 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118, 115,
+ 65, 60, 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119,
+ 122, 123, 66, 61, 60, 59, 63, 69, 75, 80, 90, 93, 106, 107,
+ 119, 121, 126, 123, 71, 65, 65, 63, 67, 73, 79, 84, 94, 97,
+ 111, 112, 125, 127, 131, 132, 74, 68, 67, 66, 69, 75, 81, 86,
+ 97, 100, 113, 115, 128, 130, 134, 132, 79, 72, 72, 70, 73, 79,
+ 85, 90, 101, 104, 118, 119, 133, 135, 141, 140, 81, 74, 73, 71,
+ 75, 80, 86, 91, 102, 105, 120, 121, 135, 137, 143, 140, 82, 75,
+ 74, 72, 75, 81, 87, 92, 103, 106, 121, 122, 136, 139, 147, 151,
+ 86, 78, 78, 75, 78, 84, 90, 95, 106, 109, 124, 125, 140, 142,
+ 147, 151, 88, 81, 80, 77, 80, 86, 90, 98, 105, 112, 122, 127,
+ 140, 144, 152, 155, 91, 83, 82, 79, 80, 88, 90, 100, 103, 114,
+ 119, 130, 137, 148, 151, 155, 93, 85, 85, 81, 81, 90, 90, 102,
+ 103, 117, 117, 134, 134, 151, 152, 160},
+ {32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31,
+ 31, 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31,
+ 33, 38, 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33,
+ 40, 42, 46, 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41,
+ 43, 46, 46, 45, 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44,
+ 47, 46, 46, 47, 47, 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47,
+ 46, 45, 47, 47, 50, 51, 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46,
+ 45, 46, 47, 50, 50, 54, 55, 57, 58, 41, 42, 43, 47, 48, 49, 49, 48,
+ 49, 50, 52, 53, 57, 57, 59, 58, 42, 43, 43, 47, 48, 50, 49, 49, 50,
+ 50, 53, 54, 57, 58, 60, 61, 49, 46, 47, 48, 50, 53, 53, 53, 54, 54,
+ 57, 57, 60, 61, 62, 61, 49, 46, 47, 48, 50, 53, 53, 54, 54, 55, 57,
+ 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, 54, 56, 57, 57, 60, 60,
+ 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, 58, 58, 61, 61, 65,
+ 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, 64, 64, 67, 68,
+ 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, 69, 69, 70,
+ 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, 73, 71,
+ 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, 52,
+ 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50,
+ 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50,
+ 49, 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50,
+ 52, 56, 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53,
+ 57, 61, 64, 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58,
+ 62, 65, 71, 72, 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63,
+ 66, 72, 73, 80, 81, 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67,
+ 73, 75, 82, 82, 89, 90, 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73,
+ 75, 82, 83, 89, 90, 92, 90, 64, 59, 58, 56, 58, 61, 65, 68, 74, 75,
+ 83, 83, 90, 91, 94, 95, 66, 60, 59, 57, 59, 62, 66, 69, 75, 76, 84,
+ 85, 91, 92, 94, 95, 67, 61, 60, 58, 59, 63, 66, 70, 74, 77, 82, 85,
+ 91, 93, 96, 96, 68, 62, 61, 58, 59, 64, 65, 71, 72, 78, 81, 86, 89,
+ 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, 71, 79, 79, 87, 87, 95,
+ 95, 98}},
+ {{32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79,
+ 79, 87, 31, 32, 32, 32, 32, 35, 35, 42, 42, 51, 51, 62,
+ 62, 75, 75, 82, 31, 32, 32, 32, 32, 35, 35, 42, 42, 51,
+ 51, 62, 62, 75, 75, 82, 31, 32, 32, 33, 33, 34, 34, 41,
+ 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33, 33, 34,
+ 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34,
+ 34, 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77, 32, 32,
+ 32, 34, 34, 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77,
+ 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69,
+ 69, 75, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58,
+ 58, 69, 69, 75, 34, 34, 34, 37, 37, 42, 42, 48, 48, 54,
+ 54, 63, 63, 73, 73, 79, 34, 34, 34, 37, 37, 42, 42, 48,
+ 48, 54, 54, 63, 63, 73, 73, 79, 36, 34, 34, 38, 38, 48,
+ 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 36, 34, 34, 38,
+ 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 39, 37,
+ 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84, 84, 89,
+ 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84,
+ 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79,
+ 79, 90, 90, 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71,
+ 71, 79, 79, 90, 90, 95, 48, 45, 45, 46, 46, 56, 56, 67,
+ 67, 76, 76, 85, 85, 96, 96, 102, 48, 45, 45, 46, 46, 56,
+ 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 53, 49, 49, 50,
+ 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 53, 49,
+ 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109,
+ 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98, 98, 110,
+ 110, 116, 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98,
+ 98, 110, 110, 116, 65, 60, 60, 58, 58, 68, 68, 79, 79, 92,
+ 92, 105, 105, 118, 118, 124, 65, 60, 60, 58, 58, 68, 68, 79,
+ 79, 92, 92, 105, 105, 118, 118, 124, 71, 65, 65, 63, 63, 73,
+ 73, 84, 84, 97, 97, 111, 111, 125, 125, 132, 71, 65, 65, 63,
+ 63, 73, 73, 84, 84, 97, 97, 111, 111, 125, 125, 132, 79, 72,
+ 72, 70, 70, 79, 79, 90, 90, 104, 104, 118, 118, 133, 133, 141,
+ 79, 72, 72, 70, 70, 79, 79, 90, 90, 104, 104, 118, 118, 133,
+ 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92, 106, 106, 121,
+ 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92, 92, 106,
+ 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84, 96,
+ 96, 109, 109, 124, 124, 141, 141, 149},
+ {32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31,
+ 31, 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31,
+ 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32,
+ 40, 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40,
+ 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43,
+ 47, 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47,
+ 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47,
+ 45, 45, 47, 47, 50, 50, 54, 54, 57, 37, 40, 40, 47, 47, 47, 47, 45,
+ 45, 47, 47, 50, 50, 54, 54, 57, 42, 43, 43, 47, 47, 50, 50, 49, 49,
+ 50, 50, 53, 53, 57, 57, 60, 42, 43, 43, 47, 47, 50, 50, 49, 49, 50,
+ 50, 53, 53, 57, 57, 60, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54,
+ 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54, 57,
+ 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60,
+ 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60, 64,
+ 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67,
+ 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67, 69,
+ 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 50,
+ 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48,
+ 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48,
+ 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49,
+ 49, 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49,
+ 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56,
+ 56, 64, 64, 70, 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56,
+ 64, 64, 70, 70, 76, 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65,
+ 65, 72, 72, 79, 79, 85, 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65,
+ 72, 72, 79, 79, 85, 85, 88, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75,
+ 75, 82, 82, 89, 89, 92, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75,
+ 82, 82, 89, 89, 92, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83,
+ 83, 90, 90, 93, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83, 83,
+ 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, 69, 77, 77, 84, 84, 92,
+ 92, 95}},
+ {{32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79,
+ 31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75,
+ 31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75,
+ 31, 32, 32, 32, 32, 33, 35, 36, 41, 42, 50, 50, 58, 60, 67, 73,
+ 31, 32, 32, 32, 33, 33, 34, 36, 41, 41, 49, 49, 57, 59, 66, 72,
+ 31, 32, 32, 33, 33, 34, 35, 37, 41, 42, 49, 49, 57, 59, 66, 71,
+ 32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50, 50, 57, 59, 65, 71,
+ 32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49, 49, 56, 59, 65, 70,
+ 32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49, 49, 56, 58, 64, 69,
+ 32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50, 50, 56, 58, 64, 69,
+ 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73,
+ 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73,
+ 35, 34, 34, 37, 38, 42, 47, 48, 52, 53, 59, 59, 65, 67, 73, 77,
+ 36, 35, 34, 37, 38, 43, 48, 49, 54, 54, 60, 60, 66, 68, 74, 78,
+ 38, 36, 36, 38, 40, 44, 49, 51, 56, 57, 63, 63, 69, 71, 77, 81,
+ 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65, 71, 73, 79, 84,
+ 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76, 81, 86,
+ 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90,
+ 44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91,
+ 48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96,
+ 48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96,
+ 53, 49, 49, 49, 49, 54, 59, 62, 71, 71, 81, 81, 89, 91, 98, 103,
+ 53, 50, 49, 50, 50, 54, 60, 63, 71, 72, 82, 82, 90, 92, 99, 103,
+ 57, 53, 52, 52, 52, 57, 62, 65, 74, 75, 85, 85, 94, 96, 103, 108,
+ 58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87, 87, 95, 98, 105, 110,
+ 61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89, 89, 98, 101, 108, 114,
+ 65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92, 92, 102, 105, 112, 118,
+ 67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94, 94, 103, 106, 114, 120,
+ 71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119, 125,
+ 72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125,
+ 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133,
+ 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133},
+ {32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31,
+ 31, 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31,
+ 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32,
+ 37, 39, 42, 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37,
+ 40, 42, 46, 46, 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41,
+ 44, 46, 46, 45, 45, 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45,
+ 47, 46, 46, 46, 47, 47, 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47,
+ 47, 45, 46, 47, 47, 50, 51, 53, 55, 37, 40, 40, 45, 47, 47, 47, 47,
+ 45, 46, 47, 47, 49, 50, 52, 54, 37, 40, 40, 45, 47, 47, 48, 47, 46,
+ 46, 47, 47, 49, 50, 53, 55, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49,
+ 50, 50, 53, 53, 56, 57, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49, 50,
+ 50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50, 52, 52, 53, 53, 53, 53,
+ 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 54, 56,
+ 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, 56, 56, 58, 58,
+ 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, 59, 60, 62,
+ 64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, 63, 65,
+ 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, 49,
+ 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47,
+ 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46,
+ 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47,
+ 47, 50, 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47,
+ 50, 54, 56, 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51,
+ 55, 57, 62, 62, 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55,
+ 57, 62, 63, 68, 68, 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58,
+ 63, 63, 69, 69, 74, 75, 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64,
+ 64, 70, 70, 75, 76, 79, 82, 58, 53, 53, 51, 51, 54, 57, 59, 64, 65,
+ 71, 71, 76, 77, 80, 83, 60, 55, 54, 53, 52, 55, 58, 60, 65, 66, 72,
+ 72, 77, 79, 82, 85, 60, 55, 55, 53, 53, 55, 59, 60, 65, 66, 73, 73,
+ 78, 79, 83, 85, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80,
+ 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80, 82,
+ 86, 89}},
+ {{32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65,
+ 31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63,
+ 31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62,
+ 31, 32, 32, 32, 32, 32, 34, 35, 37, 41, 41, 48, 50, 54, 61, 61,
+ 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59,
+ 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59,
+ 31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48, 49, 53, 59, 59,
+ 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53, 59, 59,
+ 32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53, 58, 58,
+ 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58,
+ 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58,
+ 33, 33, 33, 35, 36, 36, 40, 41, 43, 46, 46, 52, 53, 56, 62, 62,
+ 34, 34, 34, 35, 37, 37, 41, 42, 44, 48, 48, 53, 54, 57, 63, 63,
+ 34, 34, 34, 35, 37, 37, 43, 44, 46, 50, 50, 55, 56, 59, 65, 65,
+ 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68, 68,
+ 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68, 68,
+ 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67, 72, 72,
+ 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73,
+ 41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76,
+ 44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79,
+ 44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79,
+ 47, 44, 44, 44, 45, 45, 53, 56, 59, 66, 66, 73, 75, 78, 84, 84,
+ 48, 45, 45, 45, 46, 46, 54, 56, 60, 67, 67, 74, 76, 79, 85, 85,
+ 50, 47, 46, 47, 47, 47, 55, 58, 61, 68, 68, 76, 78, 82, 88, 88,
+ 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, 92, 92,
+ 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, 92, 92,
+ 57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90, 97, 97,
+ 58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91, 98, 98,
+ 61, 57, 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93, 100, 100,
+ 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105,
+ 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105,
+ 70, 65, 64, 63, 62, 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109},
+ {32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31,
+ 31, 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31,
+ 31, 34, 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32,
+ 34, 39, 39, 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35,
+ 40, 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40,
+ 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42,
+ 46, 47, 46, 45, 45, 47, 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46,
+ 47, 46, 46, 46, 47, 47, 49, 51, 51, 35, 37, 37, 40, 44, 44, 46, 47,
+ 46, 45, 45, 47, 47, 48, 51, 51, 37, 39, 40, 43, 47, 47, 47, 47, 47,
+ 45, 45, 46, 47, 48, 50, 50, 37, 39, 40, 43, 47, 47, 47, 47, 47, 45,
+ 45, 46, 47, 48, 50, 50, 41, 42, 42, 44, 47, 47, 49, 49, 49, 48, 48,
+ 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, 49, 50, 50, 49, 49, 50,
+ 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, 51, 51, 51, 52, 52,
+ 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55,
+ 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55, 57,
+ 57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58, 59, 59,
+ 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, 48,
+ 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46,
+ 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45,
+ 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46,
+ 46, 46, 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46,
+ 46, 52, 54, 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47,
+ 52, 54, 56, 60, 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53,
+ 54, 57, 61, 61, 65, 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54,
+ 57, 61, 61, 65, 66, 68, 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58,
+ 62, 62, 67, 68, 70, 73, 73, 54, 51, 50, 49, 49, 49, 54, 55, 58, 62,
+ 62, 67, 68, 70, 73, 73, 55, 51, 51, 50, 49, 49, 54, 56, 58, 63, 63,
+ 68, 69, 71, 74, 74, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69,
+ 70, 73, 76, 76, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69, 70,
+ 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, 61, 65, 65, 70, 72, 74,
+ 78, 78}},
+ {{32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31,
+ 32, 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32,
+ 32, 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32,
+ 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32,
+ 32, 32, 33, 34, 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32,
+ 33, 33, 34, 34, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33,
+ 33, 35, 35, 36, 41, 41, 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34,
+ 36, 36, 38, 42, 42, 45, 49, 49, 54, 32, 32, 32, 33, 34, 34, 34, 36,
+ 36, 38, 42, 42, 45, 50, 50, 54, 32, 32, 32, 33, 34, 34, 35, 37, 37,
+ 38, 42, 42, 45, 49, 49, 54, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39,
+ 42, 42, 45, 49, 49, 53, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39, 42,
+ 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, 36, 39, 40, 41, 44, 44,
+ 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50,
+ 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50, 54,
+ 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, 53, 57, 57,
+ 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64,
+ 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, 38,
+ 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38,
+ 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37,
+ 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40,
+ 42, 42, 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42,
+ 43, 45, 52, 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43,
+ 45, 52, 54, 56, 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47,
+ 54, 56, 58, 66, 66, 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55,
+ 56, 59, 67, 67, 70, 76, 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57,
+ 60, 67, 67, 71, 77, 77, 81, 53, 50, 49, 49, 49, 49, 51, 58, 59, 62,
+ 71, 71, 74, 81, 81, 86, 53, 51, 49, 49, 50, 50, 51, 59, 60, 63, 71,
+ 71, 75, 82, 82, 87, 55, 52, 51, 51, 51, 51, 53, 60, 61, 64, 72, 72,
+ 76, 83, 83, 88, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79,
+ 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79, 87,
+ 87, 92},
+ {32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31,
+ 31, 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31,
+ 31, 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31,
+ 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32,
+ 38, 39, 40, 45, 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38,
+ 40, 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40,
+ 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43,
+ 46, 47, 46, 45, 45, 46, 47, 47, 49, 33, 35, 36, 36, 41, 43, 44, 46,
+ 47, 46, 46, 46, 46, 47, 47, 49, 34, 36, 37, 37, 42, 44, 45, 47, 47,
+ 47, 45, 45, 46, 47, 47, 49, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47,
+ 45, 45, 46, 47, 47, 48, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47, 45,
+ 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, 47, 48, 48, 48, 47, 47,
+ 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50,
+ 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50, 50,
+ 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, 52, 52, 52,
+ 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55,
+ 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, 48,
+ 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46,
+ 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46,
+ 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45,
+ 46, 46, 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45,
+ 46, 47, 52, 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46,
+ 47, 52, 53, 55, 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48,
+ 53, 54, 55, 59, 59, 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53,
+ 54, 55, 59, 59, 61, 64, 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54,
+ 55, 60, 60, 61, 64, 64, 66, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56,
+ 61, 61, 63, 66, 66, 68, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61,
+ 61, 63, 66, 66, 68, 53, 50, 48, 48, 48, 48, 49, 54, 54, 56, 61, 61,
+ 63, 67, 67, 69, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65,
+ 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65, 68,
+ 68, 71}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31,
+ 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32,
+ 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32,
+ 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32,
+ 32, 32, 32, 33, 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33,
+ 33, 34, 35, 35, 38, 41, 41, 44, 49, 31, 32, 32, 32, 33, 34, 34, 34,
+ 35, 36, 36, 39, 42, 42, 44, 49, 32, 32, 32, 32, 33, 34, 34, 34, 36,
+ 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 34, 34, 34, 36, 36,
+ 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 35, 35, 35, 37, 37, 37,
+ 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41,
+ 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41, 42,
+ 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, 40, 42, 44, 44,
+ 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50,
+ 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50, 54,
+ 34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55, 35,
+ 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35,
+ 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34,
+ 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36,
+ 37, 40, 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39,
+ 40, 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40,
+ 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41,
+ 43, 49, 51, 51, 56, 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45,
+ 51, 53, 53, 59, 63, 63, 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51,
+ 53, 53, 59, 63, 63, 66, 71, 44, 43, 42, 42, 42, 43, 43, 45, 51, 54,
+ 54, 59, 64, 64, 67, 72, 47, 45, 44, 44, 44, 45, 45, 47, 53, 56, 56,
+ 61, 66, 66, 69, 75, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62,
+ 67, 67, 70, 76, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62, 67,
+ 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, 56, 58, 58, 64, 69, 69,
+ 73, 79},
+ {32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31,
+ 31, 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31,
+ 31, 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31,
+ 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32,
+ 34, 39, 39, 40, 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35,
+ 40, 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40,
+ 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40,
+ 41, 45, 46, 46, 45, 45, 45, 46, 48, 33, 34, 35, 35, 37, 42, 42, 43,
+ 46, 47, 47, 46, 45, 45, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46,
+ 47, 47, 46, 46, 46, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46, 47,
+ 47, 46, 46, 46, 46, 47, 35, 37, 38, 38, 41, 45, 45, 46, 47, 47, 47,
+ 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46,
+ 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46, 45,
+ 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, 48, 47, 47, 47,
+ 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50,
+ 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50, 50,
+ 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, 47,
+ 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47,
+ 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46,
+ 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46,
+ 46, 47, 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46,
+ 47, 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47,
+ 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46,
+ 47, 51, 53, 53, 55, 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47,
+ 51, 53, 53, 56, 58, 58, 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51,
+ 53, 53, 56, 58, 58, 59, 61, 49, 47, 45, 45, 45, 46, 46, 47, 52, 53,
+ 53, 56, 58, 58, 60, 62, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54,
+ 57, 59, 59, 61, 63, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57,
+ 59, 59, 61, 64, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59,
+ 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, 52, 54, 54, 58, 60, 60,
+ 62, 65}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 34, 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 33, 33, 33, 33,
+ 34, 35, 35, 35, 38, 41, 41, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35,
+ 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36,
+ 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36,
+ 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 36, 37, 37, 37,
+ 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40,
+ 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42,
+ 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42,
+ 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, 34,
+ 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34,
+ 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34,
+ 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34,
+ 34, 36, 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34,
+ 36, 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36,
+ 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38,
+ 38, 38, 43, 48, 48, 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39,
+ 39, 44, 49, 49, 49, 52, 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40,
+ 45, 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45,
+ 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50,
+ 50, 50, 54, 58, 58, 41, 40, 39, 39, 39, 40, 42, 42, 42, 46, 52, 52,
+ 52, 56, 60, 60, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53,
+ 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53, 58,
+ 63, 63},
+ {32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31,
+ 31, 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31,
+ 31, 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31,
+ 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31,
+ 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32,
+ 35, 39, 39, 39, 42, 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35,
+ 40, 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40,
+ 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40,
+ 40, 42, 46, 46, 46, 45, 45, 45, 32, 33, 34, 34, 34, 37, 41, 41, 41,
+ 44, 46, 46, 46, 46, 45, 45, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45,
+ 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47,
+ 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, 47,
+ 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, 45, 46, 47, 47, 47,
+ 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46,
+ 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45,
+ 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, 45,
+ 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47, 42,
+ 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42,
+ 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43,
+ 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44,
+ 44, 46, 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46,
+ 47, 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47,
+ 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48,
+ 48, 48, 50, 53, 53, 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47,
+ 47, 50, 53, 53, 53, 54, 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47,
+ 50, 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50,
+ 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53,
+ 53, 53, 54, 56, 56, 48, 47, 45, 45, 45, 46, 46, 46, 46, 49, 53, 53,
+ 53, 55, 57, 57, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53,
+ 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53, 56,
+ 58, 58}},
+ {{32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+ 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36,
+ 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36,
+ 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36,
+ 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+ 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, 32,
+ 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32,
+ 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32,
+ 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33,
+ 33, 33, 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33,
+ 33, 35, 36, 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34,
+ 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35,
+ 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36,
+ 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37,
+ 37, 37, 40, 43, 44, 44, 44, 35, 35, 34, 34, 34, 34, 36, 37, 38, 38,
+ 38, 41, 45, 47, 47, 47, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38,
+ 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42,
+ 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46,
+ 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, 39, 39, 39, 42, 46, 49,
+ 49, 49},
+ {32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31,
+ 31, 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31,
+ 31, 31, 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31,
+ 31, 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31,
+ 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31,
+ 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32,
+ 34, 37, 39, 39, 39, 41, 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34,
+ 38, 39, 39, 39, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38,
+ 40, 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40,
+ 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40,
+ 40, 42, 44, 46, 46, 46, 31, 32, 33, 33, 33, 33, 36, 39, 41, 41, 41,
+ 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, 37, 40, 42, 42, 42, 44,
+ 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46,
+ 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47,
+ 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, 47,
+ 47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47, 47, 47,
+ 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, 37,
+ 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38,
+ 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39,
+ 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41,
+ 41, 41, 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42,
+ 42, 44, 46, 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43,
+ 44, 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44,
+ 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46,
+ 47, 47, 47, 48, 49, 50, 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47,
+ 47, 47, 49, 50, 51, 51, 51, 47, 46, 46, 46, 46, 46, 46, 47, 48, 48,
+ 48, 49, 51, 52, 52, 52, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48,
+ 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50,
+ 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52,
+ 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 52, 53,
+ 53, 53}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34,
+ 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33,
+ 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33,
+ 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34,
+ 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34,
+ 35, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36,
+ 36, 36, 36, 36, 38, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
+ 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+ 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37,
+ 38, 39},
+ {32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31,
+ 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31,
+ 31, 31, 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31,
+ 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31,
+ 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31,
+ 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31,
+ 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 32, 32, 32, 32,
+ 34, 36, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 32, 34,
+ 37, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37,
+ 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40,
+ 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40,
+ 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40,
+ 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, 40, 40, 40, 40,
+ 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, 41, 41, 42,
+ 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, 43, 44,
+ 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33,
+ 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34,
+ 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35,
+ 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37,
+ 37, 37, 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38,
+ 38, 38, 39, 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39,
+ 39, 40, 42, 44, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40,
+ 41, 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41,
+ 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43,
+ 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45,
+ 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, 41, 42, 43, 45, 47,
+ 47, 47, 47, 47, 48, 40, 41, 41, 42, 42, 42, 42, 42, 44, 45, 47, 47,
+ 47, 47, 47, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47,
+ 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47,
+ 48, 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33,
+ 34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34,
+ 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 36,
+ 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38,
+ 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40,
+ 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32,
+ 32, 33, 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34,
+ 34, 34, 34, 34, 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35,
+ 35, 35, 35, 35, 36, 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36,
+ 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36,
+ 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36,
+ 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36,
+ 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37,
+ 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38,
+ 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 36, 38, 39, 40,
+ 42, 44}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32}}};
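+// Each 4x4 table below holds 10 = 4 * 5 / 2 values, which suggests that only
+// the upper-triangular half of a symmetric 4x4 quantizer matrix is stored,
+// one table per quantizer level and plane type.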
+constexpr uint8_t
+ kQuantizerMatrix4x4[kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes]
+ [10] = {{{32, 43, 67, 73, 94, 137, 97, 110, 150, 200},
+ {35, 46, 60, 57, 69, 90, 66, 71, 90, 109}},
+ {{32, 41, 63, 69, 88, 127, 92, 103, 140, 184},
+ {33, 45, 58, 56, 66, 86, 64, 69, 87, 105}},
+ {{32, 38, 56, 63, 78, 113, 86, 97, 130, 169},
+ {32, 45, 55, 53, 62, 80, 63, 67, 84, 101}},
+ {{32, 37, 54, 58, 72, 102, 81, 91, 121, 156},
+ {32, 45, 54, 51, 59, 75, 61, 65, 81, 97}},
+ {{32, 34, 49, 53, 64, 91, 75, 81, 112, 140},
+ {32, 46, 53, 49, 55, 70, 58, 62, 78, 91}},
+ {{32, 34, 48, 49, 60, 82, 72, 79, 104, 134},
+ {32, 46, 53, 47, 54, 66, 57, 60, 75, 89}},
+ {{32, 33, 39, 45, 51, 71, 62, 64, 87, 108},
+ {31, 42, 48, 47, 50, 61, 53, 54, 67, 78}},
+ {{32, 33, 38, 42, 46, 63, 55, 57, 75, 92},
+ {31, 41, 48, 46, 48, 58, 51, 51, 62, 71}},
+ {{32, 32, 35, 38, 40, 54, 51, 49, 64, 81},
+ {31, 38, 47, 47, 46, 54, 49, 46, 57, 66}},
+ {{32, 32, 34, 35, 37, 48, 43, 43, 54, 65},
+ {31, 37, 44, 47, 47, 53, 47, 45, 53, 59}},
+ {{32, 32, 33, 34, 35, 39, 38, 39, 45, 54},
+ {31, 34, 39, 42, 45, 48, 47, 46, 49, 54}},
+ {{32, 32, 32, 32, 33, 35, 35, 35, 38, 46},
+ {31, 32, 34, 38, 41, 47, 46, 46, 47, 52}},
+ {{31, 32, 32, 32, 32, 33, 32, 33, 34, 35},
+ {31, 31, 32, 34, 35, 39, 38, 40, 43, 47}},
+ {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33},
+ {31, 31, 31, 31, 31, 32, 34, 35, 35, 39}},
+ {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31}}};
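+// As with the 4x4 tables, 36 = 8 * 9 / 2 values per table: apparently the
+// upper-triangular half of a symmetric 8x8 quantizer matrix.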
+constexpr uint8_t kQuantizerMatrix8x8
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][36] = {
+ {{32, 32, 35, 38, 40, 54, 51, 49, 65, 82, 68, 63,
+ 78, 97, 117, 84, 76, 91, 111, 134, 152, 95, 89, 98,
+ 113, 138, 159, 183, 109, 102, 106, 121, 142, 168, 199, 220},
+ {31, 38, 47, 47, 46, 54, 50, 47, 57, 66, 57, 52,
+ 61, 72, 82, 63, 57, 66, 77, 88, 96, 67, 62, 67,
+ 75, 86, 95, 104, 71, 67, 68, 75, 84, 95, 107, 113}},
+ {{32, 32, 35, 37, 39, 51, 47, 46, 60, 73, 62, 58,
+ 71, 87, 105, 78, 72, 84, 100, 121, 140, 90, 84, 93,
+ 106, 129, 148, 169, 102, 96, 100, 113, 132, 155, 183, 201},
+ {31, 38, 47, 47, 47, 53, 48, 46, 55, 62, 54, 50,
+ 58, 67, 76, 61, 55, 63, 72, 83, 91, 66, 61, 65,
+ 73, 84, 92, 101, 69, 65, 66, 73, 82, 92, 103, 109}},
+ {{32, 32, 34, 35, 37, 48, 46, 45, 56, 70, 57, 54,
+ 64, 80, 93, 76, 70, 79, 96, 111, 134, 85, 79, 87,
+ 100, 121, 138, 156, 96, 90, 93, 105, 122, 144, 168, 184},
+ {31, 36, 43, 47, 47, 53, 48, 46, 54, 61, 52, 49,
+ 55, 65, 71, 60, 55, 60, 70, 78, 89, 64, 59, 63,
+ 71, 81, 89, 97, 67, 63, 64, 71, 79, 89, 99, 104}},
+ {{32, 32, 33, 35, 36, 46, 42, 42, 52, 63, 53, 51,
+ 60, 73, 86, 68, 64, 72, 84, 100, 117, 78, 74, 80,
+ 92, 109, 128, 140, 90, 84, 87, 98, 114, 133, 155, 168},
+ {31, 34, 39, 46, 47, 52, 47, 45, 52, 58, 50, 48,
+ 54, 62, 68, 57, 53, 58, 65, 73, 82, 61, 57, 61,
+ 68, 77, 86, 91, 65, 61, 62, 68, 76, 86, 95, 100}},
+ {{32, 32, 33, 34, 35, 39, 39, 40, 46, 56, 50, 48,
+ 53, 65, 78, 62, 59, 63, 75, 90, 105, 76, 71, 74,
+ 86, 101, 118, 134, 84, 79, 81, 92, 106, 123, 142, 153},
+ {31, 34, 39, 42, 45, 48, 47, 46, 49, 55, 49, 47,
+ 50, 58, 65, 54, 51, 53, 61, 69, 76, 60, 56, 57,
+ 65, 73, 82, 89, 64, 59, 60, 66, 74, 83, 92, 96}},
+ {{32, 32, 33, 34, 35, 39, 38, 39, 45, 54, 46, 45,
+ 51, 61, 71, 56, 54, 58, 69, 80, 92, 68, 64, 68,
+ 78, 90, 103, 117, 78, 74, 76, 86, 99, 113, 128, 140},
+ {31, 34, 39, 42, 45, 48, 47, 46, 49, 54, 48, 46,
+ 50, 56, 61, 52, 49, 52, 58, 65, 71, 57, 53, 55,
+ 61, 68, 75, 82, 61, 57, 58, 64, 71, 79, 86, 91}},
+ {{31, 32, 32, 32, 33, 35, 35, 35, 38, 48, 42, 41,
+ 43, 54, 63, 51, 49, 49, 59, 71, 81, 59, 56, 56,
+ 66, 77, 89, 98, 69, 65, 64, 73, 85, 97, 108, 119},
+ {31, 32, 35, 38, 42, 47, 48, 47, 48, 53, 47, 45,
+ 45, 53, 58, 50, 47, 47, 54, 61, 66, 53, 50, 49,
+ 56, 63, 69, 73, 57, 54, 52, 58, 65, 72, 77, 82}},
+ {{31, 32, 32, 32, 32, 35, 34, 34, 37, 42, 38, 37,
+ 40, 47, 54, 46, 44, 45, 52, 60, 69, 52, 49, 49,
+ 56, 65, 75, 82, 63, 59, 58, 65, 73, 84, 92, 105},
+ {31, 31, 32, 38, 40, 47, 44, 44, 47, 50, 47, 45,
+ 46, 51, 54, 48, 46, 46, 51, 56, 61, 50, 47, 47,
+ 52, 57, 63, 66, 55, 52, 50, 54, 60, 66, 70, 76}},
+ {{31, 32, 32, 32, 32, 34, 34, 33, 35, 39, 35, 34,
+ 37, 42, 48, 41, 40, 41, 47, 53, 60, 47, 44, 45,
+ 51, 57, 65, 71, 53, 50, 51, 55, 61, 70, 77, 85},
+ {31, 31, 32, 35, 36, 41, 42, 42, 45, 48, 48, 46,
+ 47, 50, 53, 47, 45, 45, 49, 53, 57, 49, 46, 46,
+ 50, 54, 59, 61, 51, 48, 48, 51, 54, 60, 64, 68}},
+ {{31, 31, 32, 32, 32, 33, 32, 32, 34, 35, 34, 34,
+ 35, 37, 41, 37, 36, 38, 39, 45, 51, 43, 41, 42,
+ 42, 49, 56, 63, 47, 44, 45, 46, 52, 59, 67, 71},
+ {31, 31, 32, 34, 35, 39, 37, 40, 43, 47, 43, 43,
+ 45, 47, 49, 48, 46, 46, 47, 50, 53, 47, 45, 45,
+ 45, 50, 55, 58, 49, 46, 46, 46, 50, 55, 60, 61}},
+ {{31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 33, 33,
+ 34, 35, 37, 34, 34, 35, 36, 39, 43, 37, 36, 37,
+ 38, 41, 46, 51, 41, 39, 40, 41, 44, 49, 54, 58},
+ {31, 31, 31, 32, 33, 35, 35, 37, 39, 43, 39, 41,
+ 42, 45, 47, 45, 44, 45, 47, 48, 50, 48, 46, 46,
+ 47, 48, 51, 53, 48, 46, 45, 46, 47, 51, 54, 56}},
+ {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33, 32, 32,
+ 32, 34, 35, 32, 33, 33, 34, 35, 36, 34, 34, 33,
+ 35, 36, 38, 39, 35, 35, 34, 36, 38, 40, 42, 48},
+ {31, 31, 31, 30, 31, 32, 34, 34, 35, 39, 36, 37,
+ 39, 42, 46, 39, 40, 41, 44, 47, 47, 42, 42, 42,
+ 45, 47, 48, 48, 48, 47, 46, 47, 47, 49, 50, 53}},
+ {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 33, 32, 32, 32, 32, 33, 34, 32, 32, 32,
+ 32, 34, 34, 35, 33, 33, 33, 33, 35, 35, 36, 38},
+ {31, 31, 31, 31, 31, 31, 30, 31, 31, 32, 34, 34,
+ 35, 35, 39, 35, 35, 36, 36, 40, 41, 37, 38, 39,
+ 40, 43, 44, 47, 40, 41, 41, 42, 44, 45, 47, 48}},
+ {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 33, 33},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31,
+ 31, 31, 32, 31, 32, 32, 32, 32, 33, 33, 34, 34,
+ 35, 35, 36, 39, 33, 34, 34, 35, 35, 36, 39, 39}},
+ {{31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 32, 32, 32, 31, 31, 32, 32, 32, 32, 31, 31, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31}}};
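+// 528 = 32 * 33 / 2 values per table: apparently the upper-triangular half
+// of a symmetric 32x32 quantizer matrix.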
+constexpr uint8_t kQuantizerMatrix32x32
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][528] = {
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 33,
+ 33, 32, 32, 32, 33, 34, 35, 34, 34, 33, 34, 35, 37, 39,
+ 35, 34, 34, 35, 36, 37, 41, 43, 36, 35, 34, 35, 36, 38,
+ 42, 45, 48, 39, 38, 37, 38, 39, 40, 45, 47, 50, 54, 44,
+ 42, 41, 41, 42, 42, 47, 50, 54, 58, 63, 46, 44, 42, 43,
+ 44, 44, 49, 52, 55, 59, 65, 67, 48, 46, 44, 45, 45, 46,
+ 51, 53, 57, 61, 67, 69, 71, 54, 51, 49, 49, 50, 49, 54,
+ 57, 60, 65, 71, 74, 76, 82, 59, 56, 54, 54, 54, 53, 58,
+ 61, 64, 69, 75, 78, 80, 87, 92, 62, 59, 56, 56, 56, 55,
+ 60, 63, 66, 71, 77, 80, 83, 89, 95, 98, 65, 62, 59, 59,
+ 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 71,
+ 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, 97, 103,
+ 107, 111, 117, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90,
+ 93, 96, 104, 110, 114, 118, 125, 134, 81, 77, 73, 73, 72, 70,
+ 75, 77, 80, 85, 91, 94, 97, 105, 111, 115, 119, 126, 135, 137,
+ 83, 78, 75, 74, 74, 72, 76, 79, 81, 86, 92, 95, 99, 106,
+ 113, 117, 121, 128, 137, 138, 140, 88, 84, 80, 79, 78, 76, 80,
+ 82, 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147,
+ 152, 91, 86, 83, 82, 81, 79, 81, 84, 88, 92, 95, 100, 107,
+ 110, 115, 123, 127, 132, 140, 147, 151, 154, 159, 94, 89, 86, 85,
+ 84, 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136,
+ 139, 146, 156, 158, 161, 166, 97, 92, 90, 88, 86, 85, 84, 89,
+ 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, 148, 152, 163,
+ 166, 168, 174, 101, 95, 93, 91, 89, 89, 87, 91, 93, 98, 101,
+ 105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171, 174, 176,
+ 183, 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111,
+ 116, 122, 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191,
+ 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, 113, 120,
+ 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193, 200,
+ 111, 105, 104, 101, 100, 99, 97, 96, 102, 103, 109, 111, 117, 120,
+ 125, 131, 135, 143, 146, 156, 158, 168, 173, 180, 189, 195, 200, 202,
+ 210, 115, 109, 108, 104, 104, 102, 101, 100, 103, 106, 111, 113, 119,
+ 121, 129, 131, 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204,
+ 210, 212, 220, 119, 113, 112, 107, 107, 106, 105, 103, 105, 110, 112,
+ 117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179, 183, 193,
+ 197, 210, 214, 220, 222, 231, 123, 116, 116, 111, 111, 109, 110, 107,
+ 107, 114, 114, 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176,
+ 177, 190, 191, 204, 206, 222, 224, 230, 232, 242},
+ {32, 31, 31, 30, 31, 32, 32, 33, 33, 35, 33, 34, 35, 37,
+ 39, 36, 38, 40, 41, 43, 47, 41, 42, 42, 43, 45, 47, 48,
+ 45, 45, 44, 45, 46, 47, 49, 50, 49, 47, 46, 47, 47, 48,
+ 50, 51, 53, 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 49,
+ 47, 45, 45, 45, 45, 49, 51, 53, 55, 58, 50, 47, 45, 46,
+ 46, 46, 49, 51, 54, 56, 59, 60, 50, 48, 46, 46, 46, 46,
+ 50, 52, 54, 56, 60, 60, 61, 52, 50, 47, 47, 47, 47, 50,
+ 52, 54, 57, 61, 62, 63, 66, 54, 52, 49, 49, 49, 48, 52,
+ 53, 55, 58, 62, 64, 65, 68, 71, 56, 53, 51, 50, 50, 49,
+ 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 57, 54, 52, 51,
+ 51, 50, 53, 55, 56, 60, 63, 65, 67, 70, 73, 75, 76, 60,
+ 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, 75,
+ 77, 79, 82, 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67,
+ 69, 71, 75, 78, 80, 82, 85, 89, 64, 61, 58, 57, 57, 55,
+ 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90,
+ 65, 61, 58, 58, 57, 55, 58, 60, 61, 64, 68, 70, 71, 75,
+ 79, 81, 83, 86, 90, 91, 91, 67, 63, 61, 60, 59, 57, 60,
+ 61, 63, 66, 69, 70, 73, 77, 79, 81, 85, 88, 90, 92, 94,
+ 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74,
+ 75, 78, 82, 84, 86, 90, 93, 94, 96, 98, 69, 65, 63, 62,
+ 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87,
+ 89, 92, 96, 97, 98, 100, 70, 66, 64, 63, 62, 61, 60, 63,
+ 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98,
+ 99, 100, 102, 71, 67, 66, 64, 63, 62, 61, 63, 64, 67, 68,
+ 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, 101, 102,
+ 104, 72, 68, 67, 65, 64, 64, 61, 63, 65, 67, 68, 71, 73,
+ 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, 103, 104, 106,
+ 73, 69, 68, 66, 65, 65, 63, 63, 66, 67, 69, 71, 73, 76,
+ 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, 105, 106, 109,
+ 74, 70, 70, 67, 66, 66, 64, 63, 66, 67, 70, 71, 74, 75,
+ 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, 108, 108,
+ 111, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, 70, 71, 74,
+ 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108,
+ 110, 111, 113, 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70,
+ 72, 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104,
+ 105, 109, 111, 112, 113, 116, 78, 74, 74, 70, 70, 69, 69, 66,
+ 66, 70, 70, 74, 74, 77, 78, 82, 82, 86, 87, 92, 92, 96,
+ 97, 102, 102, 107, 107, 112, 113, 115, 115, 118}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 33, 32, 32, 32, 33, 34, 35, 32, 33, 33, 33, 34, 36, 36,
+ 34, 34, 33, 34, 35, 37, 38, 39, 36, 35, 34, 35, 36, 38,
+ 40, 42, 48, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 39,
+ 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 44, 42, 41, 41,
+ 42, 42, 44, 47, 54, 56, 58, 63, 47, 45, 44, 44, 45, 45,
+ 47, 50, 56, 58, 60, 66, 69, 49, 47, 46, 45, 46, 46, 48,
+ 51, 57, 60, 62, 68, 71, 73, 54, 51, 50, 49, 50, 49, 51,
+ 54, 60, 63, 65, 71, 75, 77, 82, 59, 56, 54, 54, 54, 53,
+ 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 61, 58, 56, 56,
+ 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 65,
+ 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92,
+ 98, 101, 105, 71, 68, 65, 65, 64, 63, 65, 68, 73, 76, 78,
+ 84, 89, 92, 97, 103, 106, 111, 117, 76, 72, 70, 69, 68, 66,
+ 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, 127,
+ 80, 76, 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98,
+ 104, 110, 113, 118, 125, 130, 134, 83, 78, 76, 75, 74, 72, 73,
+ 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, 128, 133, 137,
+ 140, 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100,
+ 103, 109, 116, 119, 124, 131, 136, 140, 144, 147, 89, 85, 82, 81,
+ 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114, 120, 128,
+ 131, 136, 146, 147, 150, 155, 92, 88, 85, 84, 82, 81, 80, 85,
+ 86, 90, 95, 97, 102, 107, 110, 117, 122, 125, 134, 138, 142, 152,
+ 154, 156, 162, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95,
+ 99, 105, 106, 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163,
+ 169, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, 104,
+ 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, 176,
+ 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, 99, 103, 106, 112,
+ 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176, 178, 184,
+ 104, 99, 98, 95, 94, 93, 91, 90, 95, 96, 102, 103, 109, 112,
+ 117, 122, 125, 133, 136, 145, 146, 156, 160, 167, 174, 180, 184, 186,
+ 193, 108, 102, 101, 98, 97, 96, 95, 93, 97, 100, 104, 106, 111,
+ 113, 121, 122, 130, 132, 140, 143, 150, 155, 162, 169, 174, 183, 188,
+ 192, 194, 201, 111, 105, 105, 101, 100, 99, 98, 96, 98, 103, 105,
+ 109, 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178,
+ 181, 193, 196, 201, 202, 210, 114, 109, 109, 104, 104, 102, 102, 99,
+ 100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140, 151, 151, 162,
+ 162, 175, 176, 187, 188, 203, 204, 210, 211, 219},
+ {32, 31, 31, 30, 31, 31, 31, 32, 32, 33, 33, 34, 35, 36, 39,
+ 36, 38, 39, 40, 43, 47, 38, 40, 41, 41, 44, 47, 47, 41, 42,
+ 42, 43, 45, 47, 48, 48, 49, 47, 46, 46, 47, 48, 49, 50, 53,
+ 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 48, 47, 46, 45, 46,
+ 46, 48, 49, 53, 54, 54, 49, 47, 45, 45, 45, 45, 47, 49, 53,
+ 55, 55, 58, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
+ 61, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62,
+ 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66,
+ 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64, 66, 68,
+ 71, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, 59, 63, 65, 66,
+ 69, 72, 73, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63,
+ 66, 67, 70, 73, 74, 76, 60, 57, 55, 54, 53, 52, 53, 55, 58,
+ 60, 61, 65, 68, 69, 72, 75, 77, 79, 82, 62, 59, 57, 56, 55,
+ 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86,
+ 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 63, 67, 70, 71, 75,
+ 78, 79, 82, 85, 87, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61,
+ 63, 64, 68, 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 66, 63,
+ 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, 81,
+ 84, 87, 90, 91, 93, 94, 67, 64, 62, 61, 59, 58, 58, 60, 63,
+ 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, 93, 94, 95, 97,
+ 68, 65, 63, 62, 60, 59, 58, 61, 62, 64, 67, 68, 71, 74, 75,
+ 79, 81, 83, 87, 89, 91, 95, 96, 97, 99, 69, 66, 64, 63, 61,
+ 61, 59, 61, 62, 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88,
+ 91, 92, 97, 98, 98, 101, 70, 67, 65, 63, 62, 62, 60, 61, 63,
+ 65, 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99,
+ 100, 100, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69,
+ 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102,
+ 105, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73,
+ 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107,
+ 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77,
+ 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109,
+ 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76,
+ 78, 80, 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108,
+ 111, 75, 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75,
+ 75, 79, 79, 83, 84, 88, 89, 93, 93, 98, 98, 102, 103, 108, 108,
+ 110, 110, 113}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 33, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 34, 34, 35,
+ 34, 34, 33, 33, 35, 36, 37, 39, 34, 34, 34, 34, 36, 36,
+ 37, 41, 42, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 39,
+ 38, 38, 37, 39, 40, 40, 45, 47, 50, 54, 41, 39, 39, 38,
+ 40, 40, 41, 46, 48, 51, 55, 56, 44, 42, 41, 41, 42, 42,
+ 42, 47, 50, 54, 58, 59, 63, 48, 46, 45, 44, 45, 45, 45,
+ 50, 53, 56, 61, 62, 66, 70, 49, 47, 46, 45, 46, 46, 46,
+ 51, 53, 57, 62, 63, 68, 71, 73, 54, 51, 50, 49, 50, 49,
+ 49, 54, 56, 60, 65, 67, 71, 76, 77, 82, 58, 55, 54, 53,
+ 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 59,
+ 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82,
+ 87, 91, 93, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73,
+ 75, 79, 85, 87, 92, 97, 99, 105, 69, 66, 64, 63, 63, 62,
+ 61, 66, 68, 71, 76, 78, 83, 88, 90, 96, 100, 102, 109, 113,
+ 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90,
+ 92, 97, 102, 104, 111, 115, 117, 80, 76, 73, 72, 71, 70, 69,
+ 74, 76, 79, 84, 86, 90, 96, 98, 104, 109, 111, 118, 123, 125,
+ 134, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91,
+ 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, 83, 78, 76, 75,
+ 74, 73, 72, 76, 78, 81, 86, 88, 92, 98, 100, 106, 111, 113,
+ 121, 126, 128, 137, 139, 140, 87, 83, 81, 79, 78, 77, 75, 80,
+ 82, 85, 90, 91, 96, 101, 103, 110, 114, 117, 125, 129, 133, 142,
+ 143, 145, 150, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, 89,
+ 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151,
+ 156, 93, 88, 86, 84, 83, 82, 80, 82, 85, 89, 90, 96, 98,
+ 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, 156, 157, 163,
+ 95, 90, 89, 86, 85, 85, 83, 83, 88, 89, 93, 97, 99, 105,
+ 106, 113, 116, 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169,
+ 98, 93, 92, 89, 88, 87, 86, 85, 89, 90, 96, 97, 102, 105,
+ 109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169, 170,
+ 176, 101, 96, 95, 91, 91, 90, 89, 87, 90, 93, 97, 99, 104,
+ 105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155, 160, 168, 172,
+ 176, 177, 184, 104, 99, 98, 94, 94, 92, 92, 90, 92, 96, 98,
+ 102, 104, 109, 112, 116, 121, 125, 130, 135, 141, 144, 152, 155, 163,
+ 166, 177, 179, 184, 185, 191, 107, 101, 101, 97, 97, 95, 95, 93,
+ 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, 129, 139, 140, 149,
+ 149, 161, 161, 172, 172, 185, 186, 191, 192, 199},
+ {32, 31, 31, 30, 31, 31, 30, 31, 31, 32, 33, 34, 35, 35, 39,
+ 35, 36, 37, 37, 41, 43, 36, 38, 39, 40, 43, 45, 47, 41, 42,
+ 42, 42, 45, 46, 47, 48, 44, 44, 44, 44, 46, 46, 47, 49, 50,
+ 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 48, 47, 46, 45, 46,
+ 46, 46, 49, 51, 53, 54, 48, 47, 46, 45, 46, 46, 46, 49, 51,
+ 53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56,
+ 58, 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61,
+ 51, 48, 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62,
+ 52, 50, 48, 47, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64,
+ 66, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 64,
+ 65, 68, 70, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60,
+ 62, 65, 66, 68, 70, 71, 57, 54, 53, 52, 51, 50, 50, 53, 54,
+ 56, 60, 61, 63, 66, 67, 70, 73, 73, 76, 59, 56, 54, 53, 53,
+ 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80,
+ 60, 57, 55, 54, 53, 53, 52, 55, 56, 58, 61, 63, 65, 68, 69,
+ 72, 75, 76, 79, 81, 82, 63, 60, 58, 57, 56, 55, 54, 57, 59,
+ 60, 63, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 64, 61,
+ 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78,
+ 79, 82, 85, 86, 89, 90, 65, 61, 60, 58, 57, 56, 55, 58, 59,
+ 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, 86, 90, 91, 91,
+ 67, 63, 61, 60, 59, 58, 57, 60, 61, 63, 65, 66, 69, 72, 73,
+ 77, 79, 80, 84, 86, 88, 92, 93, 93, 95, 68, 64, 63, 61, 60,
+ 59, 58, 60, 61, 63, 65, 67, 70, 71, 74, 76, 78, 81, 83, 86,
+ 88, 89, 94, 94, 95, 97, 68, 65, 64, 62, 61, 60, 58, 59, 61,
+ 64, 64, 68, 69, 71, 74, 75, 79, 80, 83, 86, 87, 91, 92, 95,
+ 96, 97, 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67,
+ 69, 72, 72, 76, 78, 80, 83, 84, 88, 89, 92, 94, 97, 98, 99,
+ 101, 70, 67, 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71,
+ 73, 76, 77, 81, 82, 85, 86, 90, 91, 94, 96, 99, 100, 100, 103,
+ 71, 67, 67, 64, 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74,
+ 74, 78, 79, 83, 84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104,
+ 72, 68, 68, 65, 65, 64, 63, 61, 62, 65, 66, 68, 69, 71, 73,
+ 75, 77, 79, 82, 84, 87, 88, 92, 93, 96, 97, 101, 102, 104, 104,
+ 106, 73, 69, 69, 66, 66, 64, 64, 62, 62, 66, 66, 69, 69, 72,
+ 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, 99, 104, 104,
+ 106, 106, 108}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 33, 31, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 34, 35,
+ 32, 33, 33, 33, 34, 34, 36, 36, 34, 34, 34, 33, 35, 35,
+ 37, 38, 39, 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 36,
+ 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 39, 38, 38, 37,
+ 39, 39, 40, 42, 45, 49, 50, 54, 41, 40, 39, 38, 40, 40,
+ 41, 43, 46, 50, 52, 55, 57, 44, 42, 42, 41, 42, 42, 42,
+ 44, 47, 52, 54, 58, 60, 63, 47, 45, 45, 44, 44, 45, 45,
+ 47, 50, 55, 56, 60, 62, 66, 69, 48, 46, 45, 44, 45, 45,
+ 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 54, 51, 50, 49,
+ 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 56,
+ 53, 52, 51, 51, 51, 51, 53, 56, 60, 61, 66, 69, 73, 77,
+ 78, 84, 86, 59, 56, 55, 54, 54, 54, 53, 55, 58, 62, 64,
+ 69, 71, 75, 79, 80, 87, 89, 92, 64, 61, 60, 58, 58, 58,
+ 57, 59, 62, 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102,
+ 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 75, 79,
+ 84, 85, 92, 94, 98, 103, 105, 71, 68, 67, 65, 64, 64, 63,
+ 65, 68, 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111,
+ 117, 74, 71, 69, 68, 67, 67, 65, 67, 70, 74, 75, 80, 83,
+ 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 80, 76, 74, 72,
+ 71, 71, 69, 71, 74, 78, 79, 84, 86, 90, 95, 96, 104, 106,
+ 110, 116, 118, 125, 128, 134, 82, 78, 76, 74, 73, 73, 71, 73,
+ 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127,
+ 131, 136, 139, 83, 78, 77, 75, 74, 74, 72, 73, 76, 80, 81,
+ 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139,
+ 140, 87, 83, 81, 79, 78, 78, 75, 77, 80, 83, 85, 90, 92,
+ 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, 144, 145, 150,
+ 90, 85, 84, 81, 80, 80, 78, 78, 82, 84, 87, 91, 93, 98,
+ 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156,
+ 92, 88, 87, 84, 83, 82, 80, 80, 84, 85, 90, 91, 95, 98,
+ 102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152, 155, 156,
+ 162, 95, 90, 89, 86, 85, 84, 83, 82, 85, 87, 91, 92, 97,
+ 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, 143, 147, 155, 158,
+ 161, 162, 168, 97, 92, 92, 88, 88, 86, 86, 84, 85, 90, 91,
+ 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, 140, 143, 150,
+ 152, 162, 164, 168, 168, 174, 100, 95, 95, 90, 90, 89, 89, 86,
+ 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, 137,
+ 137, 147, 148, 157, 158, 169, 170, 174, 175, 181},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 33, 34, 34, 34, 37,
+ 33, 34, 35, 35, 38, 39, 36, 38, 39, 40, 42, 43, 47, 38, 40,
+ 40, 41, 43, 44, 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48,
+ 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 49, 47, 47, 46, 47,
+ 47, 48, 49, 50, 52, 53, 48, 47, 46, 45, 46, 46, 46, 48, 49,
+ 52, 53, 54, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55,
+ 55, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60, 61,
+ 61, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61,
+ 63, 63, 66, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58,
+ 59, 62, 64, 64, 67, 68, 54, 52, 51, 49, 49, 49, 48, 49, 52,
+ 55, 55, 58, 60, 62, 64, 65, 68, 69, 71, 56, 54, 53, 51, 51,
+ 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75,
+ 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 56, 60, 61, 63, 66,
+ 67, 70, 71, 73, 76, 76, 60, 57, 56, 54, 53, 53, 52, 53, 55,
+ 58, 58, 61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 61, 58,
+ 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, 73,
+ 74, 76, 79, 80, 83, 84, 63, 60, 59, 57, 56, 56, 54, 55, 57,
+ 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89,
+ 64, 61, 60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68, 70,
+ 71, 75, 77, 79, 82, 82, 86, 87, 90, 91, 65, 61, 60, 58, 57,
+ 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82,
+ 83, 86, 88, 90, 91, 91, 67, 63, 62, 60, 59, 59, 57, 58, 60,
+ 62, 63, 66, 67, 69, 72, 73, 77, 78, 80, 83, 84, 88, 89, 92,
+ 93, 93, 95, 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65,
+ 67, 70, 70, 74, 75, 78, 80, 81, 85, 86, 89, 91, 93, 94, 95,
+ 97, 68, 65, 64, 62, 61, 60, 59, 58, 61, 61, 64, 65, 67, 69,
+ 71, 73, 75, 78, 79, 83, 83, 87, 88, 91, 93, 95, 96, 97, 99,
+ 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72,
+ 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, 95, 96, 98, 98, 100,
+ 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, 64, 66, 67, 69, 71,
+ 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, 100, 100,
+ 102, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, 70,
+ 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100,
+ 101, 101, 104}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 32, 31, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 33, 34,
+ 32, 32, 32, 32, 33, 34, 35, 35, 33, 33, 33, 33, 34, 35,
+ 36, 36, 38, 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 36,
+ 35, 35, 34, 35, 36, 37, 38, 42, 42, 48, 36, 35, 35, 34,
+ 35, 36, 38, 38, 42, 43, 48, 49, 39, 38, 38, 37, 38, 39,
+ 40, 40, 44, 45, 50, 51, 54, 41, 39, 39, 38, 39, 40, 40,
+ 41, 45, 46, 51, 52, 55, 56, 44, 42, 42, 41, 41, 42, 42,
+ 42, 46, 47, 54, 54, 58, 59, 63, 46, 44, 44, 42, 43, 44,
+ 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 48, 46, 46, 44,
+ 45, 45, 45, 46, 50, 51, 57, 57, 61, 63, 67, 69, 71, 52,
+ 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, 64, 65, 70,
+ 72, 74, 78, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60,
+ 60, 65, 67, 71, 74, 76, 80, 82, 58, 56, 55, 53, 53, 53,
+ 53, 53, 57, 58, 63, 64, 68, 70, 75, 77, 80, 84, 86, 91,
+ 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70,
+ 75, 78, 80, 85, 87, 91, 92, 65, 62, 61, 59, 59, 59, 58,
+ 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98,
+ 105, 66, 63, 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74,
+ 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 71, 68, 67, 65,
+ 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95,
+ 97, 103, 103, 111, 112, 117, 74, 71, 70, 68, 67, 67, 66, 65,
+ 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, 106, 114,
+ 115, 120, 123, 80, 76, 75, 72, 72, 71, 70, 69, 73, 74, 79,
+ 79, 84, 86, 90, 93, 96, 101, 104, 110, 110, 118, 119, 125, 128,
+ 134, 81, 77, 77, 74, 73, 73, 71, 71, 74, 75, 80, 80, 85,
+ 87, 91, 94, 98, 103, 105, 111, 112, 120, 121, 127, 130, 136, 137,
+ 83, 78, 78, 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88,
+ 92, 95, 99, 104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140,
+ 86, 82, 81, 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91,
+ 95, 98, 101, 106, 109, 115, 116, 124, 125, 131, 135, 140, 142, 144,
+ 147, 89, 84, 84, 80, 80, 79, 78, 77, 79, 81, 85, 86, 91,
+ 92, 97, 98, 104, 106, 112, 114, 119, 123, 128, 132, 135, 142, 145,
+ 148, 149, 153, 91, 86, 86, 82, 82, 81, 80, 79, 80, 84, 85,
+ 88, 91, 94, 97, 100, 104, 107, 112, 115, 120, 123, 129, 132, 138,
+ 140, 148, 150, 153, 154, 159, 93, 88, 88, 84, 84, 83, 83, 80,
+ 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, 119, 126,
+ 126, 135, 136, 144, 144, 155, 155, 159, 159, 164},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 31, 32, 32, 33, 34, 33, 34,
+ 35, 35, 37, 39, 35, 37, 37, 38, 39, 41, 44, 36, 38, 39, 40, 41, 43,
+ 46, 47, 40, 41, 41, 42, 43, 44, 46, 47, 48, 41, 42, 42, 42, 43, 45,
+ 46, 47, 48, 48, 49, 47, 47, 46, 46, 47, 47, 48, 50, 50, 53, 49, 47,
+ 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 48, 47, 47, 45, 46, 46, 46,
+ 46, 49, 49, 53, 53, 54, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53,
+ 53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56,
+ 58, 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56, 57, 59, 60,
+ 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60, 60, 61,
+ 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63,
+ 65, 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62,
+ 63, 65, 66, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 55, 58, 59,
+ 62, 63, 65, 67, 68, 70, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55,
+ 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 57, 54, 54, 52, 51, 51, 50,
+ 50, 52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 57, 55,
+ 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, 67, 70, 71,
+ 73, 74, 77, 77, 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61,
+ 63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82, 61, 58, 57, 55, 55, 54,
+ 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80,
+ 83, 84, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67,
+ 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 64, 61, 60, 58, 57, 57,
+ 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83,
+ 86, 87, 89, 90, 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64,
+ 65, 68, 70, 71, 74, 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 66, 63,
+ 62, 60, 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76,
+ 79, 80, 84, 84, 87, 89, 91, 92, 93, 94, 67, 64, 63, 61, 60, 59, 58,
+ 57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87,
+ 89, 92, 93, 94, 94, 96, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62,
+ 64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94,
+ 96, 96, 98, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68,
+ 68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97,
+ 99}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 32, 31, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 34, 34, 35, 32, 32, 32, 32, 32, 34,
+ 34, 35, 35, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 34,
+ 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 36, 35, 35, 34,
+ 34, 36, 36, 38, 38, 42, 42, 48, 36, 35, 35, 34, 34, 36,
+ 36, 38, 38, 42, 42, 48, 48, 39, 38, 38, 37, 37, 39, 39,
+ 40, 40, 45, 45, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39,
+ 40, 40, 45, 45, 50, 50, 54, 54, 44, 42, 42, 41, 41, 42,
+ 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 44, 42, 42, 41,
+ 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 48,
+ 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61,
+ 67, 67, 71, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51,
+ 57, 57, 61, 61, 67, 67, 71, 71, 54, 51, 51, 49, 49, 50,
+ 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82,
+ 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65,
+ 65, 71, 71, 76, 76, 82, 82, 59, 56, 56, 54, 54, 54, 54,
+ 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87,
+ 92, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64,
+ 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 65, 62, 62, 59,
+ 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85,
+ 85, 92, 92, 98, 98, 105, 65, 62, 62, 59, 59, 59, 59, 58,
+ 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98,
+ 98, 105, 105, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68,
+ 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111,
+ 117, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73,
+ 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 117,
+ 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84,
+ 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134,
+ 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84,
+ 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134,
+ 134, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81,
+ 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128,
+ 137, 137, 140, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76,
+ 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121,
+ 128, 128, 137, 137, 140, 140, 87, 83, 83, 79, 79, 77, 77, 75,
+ 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109, 109, 116,
+ 116, 124, 124, 132, 132, 141, 141, 144, 144, 149},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 30, 31, 31, 32, 32, 33, 34,
+ 34, 35, 35, 39, 33, 34, 34, 35, 35, 39, 39, 36, 38, 38, 40, 40, 43,
+ 43, 47, 36, 38, 38, 40, 40, 43, 43, 47, 47, 41, 42, 42, 42, 42, 45,
+ 45, 47, 47, 48, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 49, 47,
+ 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 49, 47, 47, 46, 46, 47, 47,
+ 48, 48, 50, 50, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49,
+ 53, 53, 54, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54,
+ 54, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58,
+ 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58,
+ 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60,
+ 61, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60,
+ 60, 61, 61, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57,
+ 57, 61, 61, 63, 63, 66, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50,
+ 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 54, 52, 52, 49, 49, 49, 49,
+ 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 54, 52,
+ 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65,
+ 68, 68, 71, 71, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56,
+ 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 57, 54, 54, 52, 52, 51,
+ 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73,
+ 76, 76, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61,
+ 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 60, 57, 57, 54, 54, 53,
+ 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75,
+ 79, 79, 82, 82, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60,
+ 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 63, 60,
+ 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71,
+ 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 65, 61, 61, 58, 58, 57, 57,
+ 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83,
+ 83, 86, 86, 90, 90, 91, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58,
+ 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90,
+ 90, 91, 91, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66,
+ 66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93,
+ 95}},
+ {{32, 31, 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 32, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 33, 33, 34, 32, 32, 32, 32, 32, 33,
+ 34, 34, 35, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 34,
+ 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 34, 34, 34, 33,
+ 33, 34, 35, 35, 37, 37, 39, 39, 35, 35, 35, 34, 34, 35,
+ 36, 36, 38, 38, 42, 42, 46, 36, 35, 35, 34, 34, 35, 36,
+ 37, 38, 38, 42, 42, 47, 48, 38, 37, 37, 36, 36, 37, 38,
+ 38, 39, 40, 44, 44, 48, 50, 51, 39, 38, 38, 38, 37, 38,
+ 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 41, 40, 40, 39,
+ 38, 39, 40, 40, 41, 41, 46, 46, 50, 52, 54, 55, 57, 44,
+ 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, 56,
+ 58, 60, 63, 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48,
+ 48, 53, 54, 57, 58, 60, 64, 65, 48, 46, 46, 45, 44, 45,
+ 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71,
+ 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57,
+ 59, 61, 63, 67, 68, 71, 71, 53, 51, 51, 49, 49, 49, 49,
+ 49, 49, 49, 54, 54, 58, 59, 62, 64, 67, 71, 72, 75, 75,
+ 81, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59,
+ 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 57, 55, 55, 53,
+ 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74,
+ 75, 79, 79, 85, 85, 89, 59, 56, 56, 54, 54, 54, 54, 54,
+ 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, 86,
+ 87, 90, 92, 62, 59, 59, 57, 56, 56, 56, 56, 55, 56, 60,
+ 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, 95,
+ 98, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67,
+ 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105,
+ 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, 64, 64, 68, 69,
+ 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, 108,
+ 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, 72, 73,
+ 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113,
+ 117, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72,
+ 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111,
+ 113, 118, 119, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74,
+ 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110,
+ 114, 118, 120, 125, 126, 134, 80, 76, 76, 73, 72, 72, 71, 70,
+ 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103,
+ 104, 108, 110, 114, 118, 120, 125, 126, 134, 134},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 31, 30, 31, 31, 31, 32, 32, 32,
+ 33, 33, 33, 35, 33, 34, 34, 35, 35, 37, 39, 34, 35, 35, 36, 36, 38,
+ 40, 41, 36, 38, 38, 39, 40, 41, 43, 44, 47, 37, 38, 39, 40, 40, 42,
+ 43, 44, 47, 47, 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 41, 42,
+ 42, 42, 42, 43, 45, 45, 47, 47, 48, 48, 47, 46, 46, 46, 45, 46, 47,
+ 47, 47, 48, 50, 50, 52, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50,
+ 50, 52, 53, 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53,
+ 53, 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52, 53, 54, 54,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54, 55, 55,
+ 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57,
+ 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56,
+ 57, 59, 59, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54,
+ 55, 56, 58, 60, 60, 61, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50,
+ 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 52, 50, 49, 48, 47, 47, 47,
+ 47, 46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 52, 50,
+ 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61,
+ 63, 63, 66, 66, 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54,
+ 55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69, 54, 52, 52, 50, 49, 49,
+ 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68,
+ 70, 71, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58,
+ 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 57, 54, 54, 52, 52, 51,
+ 51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63, 64, 67, 67, 70, 70,
+ 72, 73, 75, 76, 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56,
+ 57, 59, 60, 62, 64, 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 60, 57,
+ 57, 55, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66,
+ 68, 68, 72, 72, 74, 75, 77, 79, 80, 82, 60, 57, 57, 55, 54, 54, 54,
+ 53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75,
+ 76, 77, 79, 80, 82, 82, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57,
+ 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83,
+ 85, 85, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60,
+ 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89,
+ 89}},
+ {{32, 31, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 31, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 32,
+ 32, 33, 34, 34, 35, 35, 33, 33, 33, 33, 33, 33, 34, 35, 35,
+ 36, 36, 38, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39,
+ 39, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48,
+ 48, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46,
+ 50, 50, 52, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44,
+ 45, 47, 50, 50, 53, 54, 41, 40, 40, 39, 38, 38, 40, 40, 40,
+ 41, 41, 45, 46, 48, 52, 52, 54, 55, 57, 44, 42, 42, 42, 41,
+ 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63,
+ 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54,
+ 54, 57, 58, 60, 63, 63, 47, 46, 45, 45, 44, 44, 44, 45, 45,
+ 45, 45, 49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 48, 47,
+ 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, 60,
+ 61, 63, 67, 67, 70, 71, 50, 49, 48, 47, 46, 46, 47, 47, 47,
+ 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, 68, 72, 73, 75,
+ 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60,
+ 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 54, 52, 51, 50, 49,
+ 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71,
+ 71, 75, 76, 78, 82, 82, 58, 56, 55, 54, 53, 53, 53, 53, 53,
+ 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82,
+ 86, 86, 90, 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57,
+ 58, 60, 64, 64, 68, 69, 71, 75, 75, 79, 80, 83, 87, 87, 91,
+ 92, 61, 59, 58, 57, 56, 56, 56, 56, 55, 55, 55, 59, 60, 62,
+ 65, 65, 69, 70, 73, 77, 77, 81, 82, 85, 89, 89, 93, 94, 97,
+ 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68,
+ 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105,
+ 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68,
+ 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105,
+ 105, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, 69,
+ 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105,
+ 109, 109, 114},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 32, 30, 31,
+ 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 37, 33, 34, 34, 35, 35, 35,
+ 38, 39, 34, 36, 36, 36, 37, 37, 40, 40, 42, 36, 38, 38, 39, 40, 40,
+ 42, 43, 45, 47, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 40, 41,
+ 41, 41, 42, 42, 44, 44, 45, 47, 47, 48, 41, 42, 42, 42, 42, 42, 44,
+ 45, 46, 47, 47, 48, 48, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47,
+ 49, 49, 50, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51,
+ 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54,
+ 54, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53,
+ 54, 55, 55, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51,
+ 53, 53, 55, 55, 57, 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45,
+ 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 50, 48, 48, 47, 46, 46, 46,
+ 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 50, 49,
+ 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 58,
+ 60, 60, 61, 61, 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50,
+ 51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63, 52, 50, 50, 49, 47, 47,
+ 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63,
+ 65, 66, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54,
+ 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 54, 52, 51, 50, 49, 49,
+ 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65,
+ 66, 68, 68, 70, 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52,
+ 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 55, 53,
+ 53, 52, 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60,
+ 63, 63, 65, 66, 67, 69, 69, 71, 72, 73, 57, 55, 54, 53, 52, 52, 51,
+ 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68,
+ 70, 70, 73, 73, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50,
+ 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73,
+ 74, 76, 76, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56,
+ 58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78,
+ 80}},
+ {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 32, 33, 33, 33, 33, 33, 33,
+ 34, 34, 35, 36, 36, 36, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37,
+ 37, 38, 39, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39,
+ 39, 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39, 41, 41, 43,
+ 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
+ 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
+ 48, 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47,
+ 50, 50, 51, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45,
+ 45, 47, 50, 50, 52, 54, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40,
+ 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 42, 41, 41, 41, 40, 40, 40,
+ 41, 41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 44, 43,
+ 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, 54, 54, 56,
+ 58, 58, 61, 63, 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45,
+ 48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64, 47, 46, 45, 45, 44, 44,
+ 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66,
+ 66, 69, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51,
+ 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 49, 48, 47, 47, 46, 45,
+ 45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57, 60, 62, 62, 66, 68,
+ 68, 71, 72, 73, 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51,
+ 54, 54, 57, 59, 59, 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 54, 52,
+ 51, 51, 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63,
+ 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 55, 53, 53, 52, 51, 50, 50,
+ 51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73,
+ 76, 77, 78, 83, 83, 85, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53,
+ 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86,
+ 87, 88, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58,
+ 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92,
+ 92},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 30, 31,
+ 31, 31, 31, 32, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35,
+ 35, 38, 33, 34, 34, 34, 35, 35, 36, 38, 39, 34, 35, 35, 36, 36, 36,
+ 37, 40, 40, 41, 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 36, 38,
+ 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 38, 39, 40, 40, 41, 41, 41,
+ 43, 44, 45, 47, 47, 47, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47,
+ 47, 48, 48, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48,
+ 48, 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48, 49, 49, 50,
+ 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
+ 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
+ 53, 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51,
+ 53, 53, 53, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49,
+ 49, 51, 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46,
+ 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 49, 47, 47, 47, 45, 45, 45,
+ 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 49, 47,
+ 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55,
+ 55, 55, 57, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47,
+ 49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59, 50, 49, 48, 48, 46, 46,
+ 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59,
+ 59, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50,
+ 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 51, 49, 48, 48, 47, 46,
+ 46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60,
+ 60, 61, 62, 62, 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48,
+ 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 52, 50,
+ 50, 49, 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56,
+ 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 53, 51, 50, 50, 48, 48, 48,
+ 48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62,
+ 63, 64, 64, 67, 67, 68, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48,
+ 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68,
+ 68, 69, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52,
+ 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71,
+ 71}},
+ {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 35, 35, 35, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36,
+ 36, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39,
+ 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39,
+ 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40,
+ 41, 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42,
+ 42, 43, 46, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38,
+ 40, 42, 42, 44, 47, 48, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36,
+ 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 38, 37, 37, 37, 36, 36, 36,
+ 36, 37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 39, 39,
+ 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49,
+ 50, 50, 52, 54, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40,
+ 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 41, 40, 40, 40, 39, 38,
+ 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55,
+ 55, 57, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44,
+ 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 44, 43, 42, 42, 42, 41,
+ 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58,
+ 58, 60, 63, 63, 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43,
+ 43, 45, 48, 48, 49, 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 47, 46,
+ 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55,
+ 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 48, 47, 46, 46, 45, 44, 44,
+ 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61,
+ 63, 67, 67, 68, 70, 71, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45,
+ 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68,
+ 70, 71, 71, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48,
+ 50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74,
+ 77},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31,
+ 31, 31, 31, 32, 30, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 37, 33, 34, 34, 34, 35, 35,
+ 35, 36, 38, 39, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 35, 36,
+ 37, 37, 37, 38, 38, 38, 41, 41, 41, 44, 36, 37, 38, 38, 39, 40, 40,
+ 40, 42, 43, 43, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 46, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47,
+ 47, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48,
+ 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49,
+ 49, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50,
+ 50, 50, 52, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48,
+ 49, 50, 50, 51, 52, 53, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47,
+ 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46,
+ 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 48, 47,
+ 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52,
+ 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46,
+ 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 49, 47, 47, 47, 46, 45,
+ 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55,
+ 55, 55, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47,
+ 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 49, 47, 47, 47, 46, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55,
+ 55, 57, 58, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 47, 49, 49, 50, 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 50, 49,
+ 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53,
+ 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 50, 49, 48, 48, 47, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56,
+ 58, 60, 60, 60, 61, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60,
+ 61, 61, 61, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46,
+ 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63,
+ 64}},
+ {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34,
+ 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36,
+ 37, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37,
+ 37, 38, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36,
+ 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35,
+ 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 35, 34, 34, 34, 34, 34, 34,
+ 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 36, 35,
+ 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42,
+ 42, 42, 45, 48, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36,
+ 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 36, 35, 35, 35, 35, 35,
+ 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48,
+ 48, 48, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39,
+ 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 39, 39, 38, 38, 38, 38,
+ 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50,
+ 50, 50, 52, 54, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39,
+ 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 39, 39,
+ 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45,
+ 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 41, 41, 40, 40, 40, 39, 39,
+ 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52,
+ 52, 54, 56, 56, 56, 58, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42,
+ 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58,
+ 58, 60, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42,
+ 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63,
+ 63},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 32, 30, 31, 31, 31, 31, 31,
+ 32, 32, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 35, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 33, 34,
+ 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 33, 34, 34, 34, 34, 35, 35,
+ 35, 35, 37, 39, 39, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41,
+ 41, 41, 43, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45,
+ 47, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47,
+ 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47,
+ 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, 47, 47, 47,
+ 47, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47,
+ 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46,
+ 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45,
+ 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44,
+ 44, 44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 49, 48,
+ 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50,
+ 50, 50, 51, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47,
+ 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 49, 48, 47, 47, 47, 47,
+ 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53,
+ 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47,
+ 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 48, 48, 47, 47, 47, 46,
+ 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53,
+ 53, 53, 53, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46,
+ 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 48, 48,
+ 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49,
+ 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 49, 48, 47, 47, 47, 46, 45,
+ 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53,
+ 53, 54, 55, 55, 55, 56, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55,
+ 55, 57, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58,
+ 58}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 32, 32, 31, 31,
+ 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+ 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36,
+ 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35,
+ 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34,
+ 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39,
+ 39, 39, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35,
+ 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40,
+ 41, 41, 41, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36,
+ 36, 36, 36, 37, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 36, 35,
+ 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38,
+ 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 36, 35, 35, 35, 35, 35, 35,
+ 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42,
+ 42, 42, 45, 47, 48, 48, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34,
+ 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47,
+ 48, 48, 48, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37,
+ 37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49,
+ 50},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31,
+ 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 32, 30, 30, 31, 31, 31, 31,
+ 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 34, 36, 37, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35,
+ 37, 38, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39,
+ 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39,
+ 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40, 40, 42,
+ 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44,
+ 46, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43,
+ 45, 46, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43,
+ 43, 43, 45, 46, 47, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40,
+ 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40,
+ 41, 41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 40, 41,
+ 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47,
+ 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44,
+ 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48,
+ 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45,
+ 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49,
+ 49, 49, 49, 50, 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46,
+ 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 49, 48,
+ 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48,
+ 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 49, 48, 48, 47, 47, 47, 47,
+ 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50,
+ 50, 50, 51, 52, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46,
+ 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52,
+ 53, 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47,
+ 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53,
+ 53}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 31, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 35, 35, 35, 36, 36, 36, 36, 36, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+ 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33,
+ 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37,
+ 38, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33,
+ 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39,
+ 39},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38,
+ 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37,
+ 38, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
+ 36, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35,
+ 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 34, 35, 35, 35, 35, 35, 35,
+ 36, 36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 35, 36,
+ 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 41, 41, 41,
+ 41, 41, 42, 44, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39,
+ 39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46, 36, 37, 37, 38, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46,
+ 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40,
+ 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 36, 37, 37, 38, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46,
+ 47, 47, 47, 47, 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40,
+ 40, 41, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 38, 39,
+ 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44,
+ 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 40, 40, 40, 41, 41, 41, 41,
+ 41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47,
+ 47, 47, 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47,
+ 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48,
+ 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 32,
+ 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 34, 34, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 33, 33, 33, 33, 33, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36,
+ 37, 37, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37,
+ 37, 38, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 33, 33,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37,
+ 38, 39, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39,
+ 39, 39, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36,
+ 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40,
+ 40}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32}}};
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/quantizer.h"
+
+#include <cstdint>
+
+#include "gtest/gtest.h"
+#include "src/obu_parser.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(QuantizerTest, GetQIndex) {
+ const int kBaseQIndex = 40;
+ const int kDelta = 10;
+ const int kOutOfRangeIndex = 200;
+ Segmentation segmentation = {};
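+  // With segmentation disabled, or enabled without the quantizer feature set
+  // for the segment, GetQIndex() should return the base index unchanged, even
+  // for out-of-range segment ids.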
+
+ EXPECT_EQ(GetQIndex(segmentation, 0, kBaseQIndex), kBaseQIndex);
+ EXPECT_EQ(GetQIndex(segmentation, kOutOfRangeIndex, kBaseQIndex),
+ kBaseQIndex);
+
+ segmentation.enabled = true;
+ EXPECT_EQ(GetQIndex(segmentation, 0, kBaseQIndex), kBaseQIndex);
+ EXPECT_EQ(GetQIndex(segmentation, kOutOfRangeIndex, kBaseQIndex),
+ kBaseQIndex);
+
+ segmentation.feature_enabled[1][kSegmentFeatureQuantizer] = true;
+ segmentation.feature_data[1][kSegmentFeatureQuantizer] = kDelta;
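+  // Segment 1 now carries a quantizer delta of kDelta; other segments (and
+  // out-of-range ids) should still map to the base index.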
+ EXPECT_EQ(GetQIndex(segmentation, 1, kBaseQIndex), kBaseQIndex + kDelta);
+ EXPECT_EQ(GetQIndex(segmentation, kOutOfRangeIndex, kBaseQIndex),
+ kBaseQIndex);
+
+ segmentation.enabled = false;
+ EXPECT_EQ(GetQIndex(segmentation, 1, kBaseQIndex), kBaseQIndex);
+ EXPECT_EQ(GetQIndex(segmentation, kOutOfRangeIndex, kBaseQIndex),
+ kBaseQIndex);
+}
+
+TEST(QuantizerTest, GetDcValue) {
+ QuantizerParameters params = {};
+ params.delta_dc[kPlaneY] = 1;
+ params.delta_dc[kPlaneU] = 2;
+ params.delta_dc[kPlaneV] = 3;
+
+ // Test lookups of Dc_Qlookup[0][0], Dc_Qlookup[0][11], Dc_Qlookup[0][12],
+ // and Dc_Qlookup[0][255] in the spec, including the clipping of qindex.
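+  // The expected relation (a sketch of the spec lookup, not a quote of the
+  // implementation) is Dc_Qlookup[b][Clip3(qindex + delta_dc[plane], 0, 255)],
+  // which is why each plane's qindex below is shifted by its delta_dc.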
+ {
+    Quantizer quantizer(8, &params);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -2), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -1), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 10), 16);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 11), 17);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 254), 1336);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 255), 1336);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -3), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -2), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 9), 16);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 10), 17);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 253), 1336);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 254), 1336);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -4), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -3), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 8), 16);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 9), 17);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 252), 1336);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 253), 1336);
+ }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ // Test lookups of Dc_Qlookup[1][0], Dc_Qlookup[1][11], Dc_Qlookup[1][12],
+ // and Dc_Qlookup[1][255] in the spec, including the clipping of qindex.
+ {
+    Quantizer quantizer(10, &params);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -2), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -1), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 10), 34);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 11), 37);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 254), 5347);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 255), 5347);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -3), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -2), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 9), 34);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 10), 37);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 253), 5347);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 254), 5347);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -4), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -3), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 8), 34);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 9), 37);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 252), 5347);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 253), 5347);
+ }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+ // Test lookups of Dc_Qlookup[2][0], Dc_Qlookup[2][11], Dc_Qlookup[2][12],
+ // and Dc_Qlookup[2][255] in the spec, including the clipping of qindex.
+ {
+    Quantizer quantizer(12, &params);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -2), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -1), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 10), 103);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 11), 115);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 254), 21387);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 255), 21387);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -3), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -2), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 9), 103);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 10), 115);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 253), 21387);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 254), 21387);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -4), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -3), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 8), 103);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 9), 115);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 252), 21387);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 253), 21387);
+ }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+}
+
+TEST(QuantizerTest, GetAcValue) {
+ QuantizerParameters params = {};
+ params.delta_ac[kPlaneU] = 1;
+ params.delta_ac[kPlaneV] = 2;
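+  // delta_ac[kPlaneY] is left at its zero-initialized value, so the Y plane
+  // indexes the AC table directly while U and V are offset by 1 and 2.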
+
+ // Test lookups of Ac_Qlookup[0][0], Ac_Qlookup[0][11], Ac_Qlookup[0][12],
+ // and Ac_Qlookup[0][255] in the spec, including the clipping of qindex.
+ {
+    Quantizer quantizer(8, &params);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, -1), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 0), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 11), 18);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 12), 19);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 255), 1828);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 256), 1828);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -2), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -1), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 10), 18);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 11), 19);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 254), 1828);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 255), 1828);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -3), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -2), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 9), 18);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 10), 19);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 253), 1828);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 254), 1828);
+ }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ // Test lookups of Ac_Qlookup[1][0], Ac_Qlookup[1][11], Ac_Qlookup[1][12],
+ // and Ac_Qlookup[1][255] in the spec, including the clipping of qindex.
+ {
+    Quantizer quantizer(10, &params);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, -1), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 0), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 11), 37);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 12), 40);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 255), 7312);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 256), 7312);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -2), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -1), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 10), 37);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 11), 40);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 254), 7312);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 255), 7312);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -3), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -2), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 9), 37);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 10), 40);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 253), 7312);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 254), 7312);
+ }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+  // Test lookups of Ac_Qlookup[2][0], Ac_Qlookup[2][11], Ac_Qlookup[2][12],
+  // and Ac_Qlookup[2][255] in the spec, including the clipping of qindex.
+ {
+    Quantizer quantizer(12, &params);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, -1), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 0), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 11), 112);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 12), 126);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 255), 29247);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 256), 29247);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -2), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -1), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 10), 112);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 11), 126);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 254), 29247);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 255), 29247);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -3), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -2), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 9), 112);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 10), 126);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 253), 29247);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 254), 29247);
+ }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/reconstruction.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace {
+
+// Maps TransformType to dsp::Transform1d for the row transforms.
+constexpr dsp::Transform1d kRowTransform[kNumTransformTypes] = {
+ dsp::kTransform1dDct, dsp::kTransform1dAdst,
+ dsp::kTransform1dDct, dsp::kTransform1dAdst,
+ dsp::kTransform1dAdst, dsp::kTransform1dDct,
+ dsp::kTransform1dAdst, dsp::kTransform1dAdst,
+ dsp::kTransform1dAdst, dsp::kTransform1dIdentity,
+ dsp::kTransform1dIdentity, dsp::kTransform1dDct,
+ dsp::kTransform1dIdentity, dsp::kTransform1dAdst,
+ dsp::kTransform1dIdentity, dsp::kTransform1dAdst};
+
+// Maps TransformType to dsp::Transform1d for the column transforms.
+constexpr dsp::Transform1d kColumnTransform[kNumTransformTypes] = {
+ dsp::kTransform1dDct, dsp::kTransform1dDct,
+ dsp::kTransform1dAdst, dsp::kTransform1dAdst,
+ dsp::kTransform1dDct, dsp::kTransform1dAdst,
+ dsp::kTransform1dAdst, dsp::kTransform1dAdst,
+ dsp::kTransform1dAdst, dsp::kTransform1dIdentity,
+ dsp::kTransform1dDct, dsp::kTransform1dIdentity,
+ dsp::kTransform1dAdst, dsp::kTransform1dIdentity,
+ dsp::kTransform1dAdst, dsp::kTransform1dIdentity};
+
+dsp::Transform1dSize GetTransform1dSize(int size_log2) {
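+  // This assumes the Transform1dSize enumerators are ordered by increasing
+  // size (4, 8, 16, 32), so size_log2 == 2 (a 4-point transform) maps to the
+  // first enumerator.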
+ return static_cast<dsp::Transform1dSize>(size_log2 - 2);
+}
+
+// Returns the number of rows to process based on |non_zero_coeff_count|. The
+// transform loops process either 4 or a multiple of 8 rows. Use the
+// TransformClass derived from |tx_type| to determine the scan order.
+template <int tx_width>
+int GetNumRows(TransformType tx_type, int tx_height, int non_zero_coeff_count) {
+ const TransformClass tx_class = GetTransformClass(tx_type);
+
+ switch (tx_class) {
+ case kTransformClass2D:
+ if (tx_width == 4) {
+ if (non_zero_coeff_count <= 13) return 4;
+ if (non_zero_coeff_count <= 29) return 8;
+ }
+ if (tx_width == 8) {
+ if (non_zero_coeff_count <= 10) return 4;
+ if ((non_zero_coeff_count <= 14) & (tx_height > 8)) return 4;
+ if (non_zero_coeff_count <= 43) return 8;
+ if ((non_zero_coeff_count <= 107) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 171) & (tx_height > 16)) return 24;
+ }
+ if (tx_width == 16) {
+ if (non_zero_coeff_count <= 10) return 4;
+ if ((non_zero_coeff_count <= 14) & (tx_height > 16)) return 4;
+ if (non_zero_coeff_count <= 36) return 8;
+ if ((non_zero_coeff_count <= 44) & (tx_height > 16)) return 8;
+ if ((non_zero_coeff_count <= 151) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 279) & (tx_height > 16)) return 24;
+ }
+ if (tx_width == 32) {
+ if (non_zero_coeff_count <= 10) return 4;
+ if (non_zero_coeff_count <= 36) return 8;
+ if ((non_zero_coeff_count <= 136) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 300) & (tx_height > 16)) return 24;
+ }
+ break;
+
+ case kTransformClassHorizontal:
+ if (non_zero_coeff_count <= 4) return 4;
+ if (non_zero_coeff_count <= 8) return 8;
+ if ((non_zero_coeff_count <= 16) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 24) & (tx_height > 16)) return 24;
+ break;
+
+ default:
+ assert(tx_class == kTransformClassVertical);
+ if (tx_width == 4) {
+ if (non_zero_coeff_count <= 16) return 4;
+ if (non_zero_coeff_count <= 32) return 8;
+ }
+ if (tx_width == 8) {
+ if (non_zero_coeff_count <= 32) return 4;
+ if (non_zero_coeff_count <= 64) return 8;
+ // There's no need to check tx_height since the maximum values for
+ // smaller sizes are: 8x8: 63, 8x16: 127.
+ if (non_zero_coeff_count <= 128) return 16;
+ if (non_zero_coeff_count <= 192) return 24;
+ }
+ if (tx_width == 16) {
+ if (non_zero_coeff_count <= 64) return 4;
+ if (non_zero_coeff_count <= 128) return 8;
+ // There's no need to check tx_height since the maximum values for
+ // smaller sizes are: 16x8: 127, 16x16: 255.
+ if (non_zero_coeff_count <= 256) return 16;
+ if (non_zero_coeff_count <= 384) return 24;
+ }
+ if (tx_width == 32) {
+ if (non_zero_coeff_count <= 128) return 4;
+ if (non_zero_coeff_count <= 256) return 8;
+ // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 32x8: 255, 32x16: 511.
+        if (non_zero_coeff_count <= 512) return 16;
+        if (non_zero_coeff_count <= 768) return 24;
+ }
+ break;
+ }
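+  // Default when no early-out above applied: wide (>= 16) transforms are
+  // capped at 32 rows, mirroring the 32-row limit asserted by the caller.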
+ return (tx_width >= 16) ? std::min(tx_height, 32) : tx_height;
+}
+
+} // namespace
+
+template <typename Residual, typename Pixel>
+void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless, Residual* const buffer,
+ int start_x, int start_y, Array2DView<Pixel>* frame,
+ int non_zero_coeff_count) {
+ static_assert(sizeof(Residual) == 2 || sizeof(Residual) == 4, "");
+ const int tx_width_log2 = kTransformWidthLog2[tx_size];
+ const int tx_height_log2 = kTransformHeightLog2[tx_size];
+
+ int tx_height = (non_zero_coeff_count == 1) ? 1 : kTransformHeight[tx_size];
+ if (tx_height > 4) {
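+    // Dispatch on tx_width_log2 (2..6): widths 32 and 64 share the
+    // GetNumRows<32> instantiation since the row count is capped at 32 in
+    // either case.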
+ static constexpr int (*kGetNumRows[])(TransformType tx_type, int tx_height,
+ int non_zero_coeff_count) = {
+ &GetNumRows<4>, &GetNumRows<8>, &GetNumRows<16>, &GetNumRows<32>,
+ &GetNumRows<32>};
+ tx_height = kGetNumRows[tx_width_log2 - 2](tx_type, tx_height,
+ non_zero_coeff_count);
+ }
+ assert(tx_height <= 32);
+
+ // Row transform.
+ const dsp::Transform1dSize row_transform_size =
+ GetTransform1dSize(tx_width_log2);
+ const dsp::Transform1d row_transform =
+ lossless ? dsp::kTransform1dWht : kRowTransform[tx_type];
+ const dsp::InverseTransformAddFunc row_transform_func =
+ dsp.inverse_transforms[row_transform][row_transform_size][dsp::kRow];
+ assert(row_transform_func != nullptr);
+
+ row_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+ frame);
+
+ // Column transform.
+ const dsp::Transform1dSize column_transform_size =
+ GetTransform1dSize(tx_height_log2);
+ const dsp::Transform1d column_transform =
+ lossless ? dsp::kTransform1dWht : kColumnTransform[tx_type];
+ const dsp::InverseTransformAddFunc column_transform_func =
+ dsp.inverse_transforms[column_transform][column_transform_size]
+ [dsp::kColumn];
+ assert(column_transform_func != nullptr);
+
+ column_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+ frame);
+}
+
+template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless, int16_t* buffer,
+ int start_x, int start_y, Array2DView<uint8_t>* frame,
+ int non_zero_coeff_count);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless, int32_t* buffer,
+ int start_x, int start_y,
+ Array2DView<uint16_t>* frame,
+ int non_zero_coeff_count);
+#endif
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_RECONSTRUCTION_H_
+#define LIBGAV1_SRC_RECONSTRUCTION_H_
+
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Steps 2 and 3 of section 7.12.3 (contains the implementation of section
+// 7.13.3).
+// Apply the inverse transforms and add the residual to the frame for the
+// transform block size |tx_size| starting at position |start_x| and |start_y|.
+template <typename Residual, typename Pixel>
+void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless, Residual* buffer,
+ int start_x, int start_y, Array2DView<Pixel>* frame,
+ int non_zero_coeff_count);
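+
+// A typical invocation (an illustrative sketch; the variable and buffer names
+// are hypothetical) for an 8-bit lossless 4x4 block:
+//   Reconstruct(dsp, kTransformTypeDctDct, kTransformSize4x4,
+//               /*lossless=*/true, residual, /*start_x=*/0, /*start_y=*/0,
+//               &frame, non_zero_coeff_count);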
+
+extern template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless,
+ int16_t* buffer, int start_x, int start_y,
+ Array2DView<uint8_t>* frame,
+ int non_zero_coeff_count);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+extern template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+ TransformSize tx_size, bool lossless,
+ int32_t* buffer, int start_x, int start_y,
+ Array2DView<uint16_t>* frame,
+ int non_zero_coeff_count);
+#endif
+
+} // namespace libgav1
+#endif // LIBGAV1_SRC_RECONSTRUCTION_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/reconstruction.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include "absl/strings/match.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace {
+
+// Import the scan tables in the anonymous namespace.
+#include "src/scan_tables.inc"
+
+constexpr int kTestTransformSize = 4;
+constexpr int8_t kTestBitdepth = 8;
+
+using testing::ElementsAreArray;
+
+// The 'int' parameter is unused but required to allow for instantiations of C,
+// NEON, etc.
+class ReconstructionTest : public testing::TestWithParam<int> {
+ public:
+ ReconstructionTest() = default;
+ ReconstructionTest(const ReconstructionTest&) = delete;
+ ReconstructionTest& operator=(const ReconstructionTest&) = delete;
+ ~ReconstructionTest() override = default;
+
+ protected:
+ void SetUp() override {
+ test_utils::ResetDspTable(kTestBitdepth);
+ dsp::InverseTransformInit_C();
+ dsp_ = dsp::GetDspTable(kTestBitdepth);
+ ASSERT_NE(dsp_, nullptr);
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ if (test_info->value_param() != nullptr) {
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) == 0) GTEST_SKIP() << "No SSE4.1 support!";
+ dsp::InverseTransformInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ dsp::InverseTransformInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ }
+ InitBuffers();
+ }
+
+ void InitBuffers(int width = kTestTransformSize,
+ int height = kTestTransformSize) {
+ const int size = width * height;
+ buffer_.clear();
+ buffer_.resize(size);
+ residual_buffer_.clear();
+ residual_buffer_.resize(size);
+ for (int i = 0; i < size; ++i) {
+ buffer_[i] = residual_buffer_[i] = i % 256;
+ }
+ frame_buffer_.Reset(height, width, buffer_.data());
+ }
+
+ template <int bitdepth>
+ void TestWht();
+
+ std::vector<uint8_t> buffer_;
+ std::vector<int16_t> residual_buffer_;
+ // |frame_buffer_| is just a 2D array view into |buffer_|.
+ Array2DView<uint8_t> frame_buffer_;
+ const dsp::Dsp* dsp_;
+};
+
+template <int bitdepth>
+void ReconstructionTest::TestWht() {
+ static_assert(bitdepth == kBitdepth8 || bitdepth == kBitdepth10, "");
+ for (const auto transform :
+ dsp_->inverse_transforms[dsp::kTransform1dWht][dsp::kTransform1dSize4]) {
+ if (transform == nullptr) {
+ GTEST_SKIP() << "No function available for dsp::kTransform1dWht";
+ }
+ }
+ constexpr int max = 16 << bitdepth;
+ constexpr int min = -max;
+ static constexpr int16_t residual_inputs[][16]{
+ {64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {69, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, max - 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, min - 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ // Note: these are unrealistic inputs, but they serve to test each position
+ // in the array and match extremes seen in some commercial test vectors.
+ {max, max, max, max, max, max, max, max, max, max, max, max, max, max,
+ max, max},
+ {min, min, min, min, min, min, min, min, min, min, min, min, min, min,
+ min, min}};
+ // Before the Reconstruct() call, the frame buffer is filled with all 127.
+ // After the Reconstruct() call, the frame buffer is expected to have the
+ // following values.
+ static constexpr uint8_t frame_outputs[][16]{
+ {131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131},
+ {132, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131,
+ 131, 131},
+ {255, 255, 0, 0, 255, 255, 0, 0, 0, 0, 255, 255, 0, 0, 255, 255},
+ {0, 0, 255, 255, 0, 0, 255, 255, 255, 255, 0, 0, 255, 255, 0, 0},
+ {255, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+ 127, 127},
+ {0, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+ 127},
+ };
+
+ const TransformSize tx_size = kTransformSize4x4;
+ const TransformType tx_type = kTransformTypeDctDct;
+ const int tx_width = kTransformWidth[tx_size];
+ const int tx_height = kTransformHeight[tx_size];
+ const uint16_t* const scan = kScan[GetTransformClass(tx_type)][tx_size];
+
+ InitBuffers(tx_width, tx_height);
+
+ const int num_tests = sizeof(residual_inputs) / sizeof(residual_inputs[0]);
+ for (int i = 0; i < num_tests; ++i) {
+ int16_t eob; // Also known as non_zero_coeff_count.
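+ // Scan backwards for the last non-zero coefficient; eob is one past its
+ // position in scan order.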
+ for (eob = 15; eob >= 0; --eob) {
+ if (residual_inputs[i][scan[eob]] != 0) break;
+ }
+ ++eob;
+ memcpy(residual_buffer_.data(), residual_inputs[i],
+ sizeof(residual_inputs[i]));
+ memset(buffer_.data(), 127, sizeof(frame_outputs[i]));
+ Reconstruct(*dsp_, tx_type, tx_size, /*lossless=*/true,
+ residual_buffer_.data(), 0, 0, &frame_buffer_, eob);
+
+ EXPECT_TRUE(test_utils::CompareBlocks(buffer_.data(), frame_outputs[i],
+ tx_width, tx_height, tx_width,
+ tx_width, false, true))
+ << "Mismatch WHT test case " << i;
+ }
+}
+
+TEST_P(ReconstructionTest, ReconstructionSimple) {
+ for (const auto transform :
+ dsp_->inverse_transforms[dsp::kTransform1dIdentity]
+ [dsp::kTransform1dSize4]) {
+ if (transform == nullptr) GTEST_SKIP();
+ }
+ Reconstruct(*dsp_, kTransformTypeIdentityIdentity, kTransformSize4x4, false,
+ residual_buffer_.data(), 0, 0, &frame_buffer_, 16);
+ // clang-format off
+ static constexpr uint8_t expected_output_buffer[] = {
+ 0, 1, 2, 3,
+ 5, 6, 7, 8,
+ 9, 10, 11, 12,
+ 14, 15, 16, 17
+ };
+ // clang-format on
+ EXPECT_THAT(buffer_, ElementsAreArray(expected_output_buffer));
+}
+
+TEST_P(ReconstructionTest, ReconstructionFlipY) {
+ for (const auto transform :
+ dsp_->inverse_transforms[dsp::kTransform1dIdentity]
+ [dsp::kTransform1dSize4]) {
+ if (transform == nullptr) GTEST_SKIP();
+ }
+ Reconstruct(*dsp_, kTransformTypeIdentityFlipadst, kTransformSize4x4, false,
+ residual_buffer_.data(), 0, 0, &frame_buffer_, 16);
+ // clang-format off
+ static constexpr uint8_t expected_buffer[] = {
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 7, 8, 9, 10,
+ 14, 15, 16, 17
+ };
+ // clang-format on
+ EXPECT_THAT(buffer_, ElementsAreArray(expected_buffer));
+}
+
+TEST_P(ReconstructionTest, ReconstructionFlipX) {
+ for (const auto transform :
+ dsp_->inverse_transforms[dsp::kTransform1dIdentity]
+ [dsp::kTransform1dSize4]) {
+ if (transform == nullptr) GTEST_SKIP();
+ }
+ Reconstruct(*dsp_, kTransformTypeFlipadstIdentity, kTransformSize4x4, false,
+ residual_buffer_.data(), 0, 0, &frame_buffer_, 16);
+ // clang-format off
+ static constexpr uint8_t expected_buffer[] = {
+ 0, 1, 2, 3,
+ 4, 5, 6, 8,
+ 8, 10, 10, 13,
+ 12, 14, 14, 18
+ };
+ // clang-format on
+ EXPECT_THAT(buffer_, ElementsAreArray(expected_buffer));
+}
+
+TEST_P(ReconstructionTest, ReconstructionFlipXAndFlipY) {
+ for (const auto transform :
+ dsp_->inverse_transforms[dsp::kTransform1dIdentity]
+ [dsp::kTransform1dSize4]) {
+ if (transform == nullptr) GTEST_SKIP();
+ }
+ Reconstruct(*dsp_, kTransformTypeFlipadstFlipadst, kTransformSize4x4, false,
+ residual_buffer_.data(), 0, 0, &frame_buffer_, 16);
+ // clang-format off
+ static constexpr uint8_t expected_buffer[] = {
+ 0, 1, 2, 3,
+ 4, 5, 6, 8,
+ 8, 8, 10, 9,
+ 12, 14, 14, 19
+ };
+ // clang-format on
+ EXPECT_THAT(buffer_, ElementsAreArray(expected_buffer));
+}
+
+TEST_P(ReconstructionTest, ReconstructionNonZeroStart) {
+ uint8_t buffer[64] = {};
+ Array2DView<uint8_t> frame_buffer(8, 8, buffer);
+ int k = 0;
+ for (int i = 0; i < kTestTransformSize; ++i) {
+ for (int j = 0; j < kTestTransformSize; ++j) {
+ frame_buffer[i + 4][j + 4] = k++;
+ }
+ }
+ for (const auto transform :
+ dsp_->inverse_transforms[dsp::kTransform1dIdentity]
+ [dsp::kTransform1dSize4]) {
+ if (transform == nullptr) GTEST_SKIP();
+ }
+ Reconstruct(*dsp_, kTransformTypeIdentityIdentity, kTransformSize4x4, false,
+ residual_buffer_.data(), 4, 4, &frame_buffer, 64);
+ // clang-format off
+ static constexpr uint8_t expected_buffer[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 1, 2, 3,
+ 0, 0, 0, 0, 5, 6, 7, 8,
+ 0, 0, 0, 0, 9, 10, 11, 12,
+ 0, 0, 0, 0, 14, 15, 16, 17
+ };
+ // clang-format on
+ EXPECT_THAT(buffer, ElementsAreArray(expected_buffer));
+}
+
+TEST_P(ReconstructionTest, Wht8bit) { TestWht<kBitdepth8>(); }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+TEST_P(ReconstructionTest, Wht10bit) { TestWht<kBitdepth10>(); }
+#endif
+
+INSTANTIATE_TEST_SUITE_P(C, ReconstructionTest, testing::Values(0));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, ReconstructionTest, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ReconstructionTest, testing::Values(0));
+#endif
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/residual_buffer_pool.h"
+
+#include <mutex> // NOLINT (unapproved c++11 header)
+#include <utility>
+
+namespace libgav1 {
+namespace {
+
+// The maximum queue size is derived using the following formula:
+// ((sb_size * sb_size) / 16) + (2 * (((sb_size / x) * (sb_size / y)) / 16)).
+// Where:
+// sb_size is the superblock size (64 or 128).
+// 16 is 4*4 which is kMinTransformWidth * kMinTransformHeight.
+// x is subsampling_x + 1.
+// y is subsampling_y + 1.
+// The first component is for the Y plane and the second component is for the U
+// and V planes.
+// For example, for 128x128 superblocks with 422 subsampling the size is:
+// ((128 * 128) / 16) + (2 * (((128 / 2) * (128 / 1)) / 16)) = 2048.
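+// Similarly, for 64x64 superblocks with 420 subsampling the size is:
+// ((64 * 64) / 16) + (2 * (((64 / 2) * (64 / 2)) / 16)) = 384.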
+//
+// First dimension: use_128x128_superblock.
+// Second dimension: subsampling_x.
+// Third dimension: subsampling_y.
+constexpr int kMaxQueueSize[2][2][2] = {
+ // 64x64 superblocks.
+ {
+ {768, 512},
+ {512, 384},
+ },
+ // 128x128 superblocks.
+ {
+ {3072, 2048},
+ {2048, 1536},
+ },
+};
+
+} // namespace
+
+ResidualBufferStack::~ResidualBufferStack() {
+ while (top_ != nullptr) {
+ ResidualBuffer* top = top_;
+ top_ = top_->next_;
+ delete top;
+ }
+}
+
+void ResidualBufferStack::Push(std::unique_ptr<ResidualBuffer> buffer) {
+ buffer->next_ = top_;
+ top_ = buffer.release();
+ ++num_buffers_;
+}
+
+std::unique_ptr<ResidualBuffer> ResidualBufferStack::Pop() {
+ std::unique_ptr<ResidualBuffer> top;
+ if (top_ != nullptr) {
+ top.reset(top_);
+ top_ = top_->next_;
+ top->next_ = nullptr;
+ --num_buffers_;
+ }
+ return top;
+}
+
+void ResidualBufferStack::Swap(ResidualBufferStack* other) {
+ std::swap(top_, other->top_);
+ std::swap(num_buffers_, other->num_buffers_);
+}
+
+ResidualBufferPool::ResidualBufferPool(bool use_128x128_superblock,
+ int subsampling_x, int subsampling_y,
+ size_t residual_size)
+ : buffer_size_(GetResidualBufferSize(
+ use_128x128_superblock ? 128 : 64, use_128x128_superblock ? 128 : 64,
+ subsampling_x, subsampling_y, residual_size)),
+ queue_size_(kMaxQueueSize[static_cast<int>(use_128x128_superblock)]
+ [subsampling_x][subsampling_y]) {}
+
+void ResidualBufferPool::Reset(bool use_128x128_superblock, int subsampling_x,
+ int subsampling_y, size_t residual_size) {
+ const size_t buffer_size = GetResidualBufferSize(
+ use_128x128_superblock ? 128 : 64, use_128x128_superblock ? 128 : 64,
+ subsampling_x, subsampling_y, residual_size);
+ const int queue_size = kMaxQueueSize[static_cast<int>(use_128x128_superblock)]
+ [subsampling_x][subsampling_y];
+ if (buffer_size == buffer_size_ && queue_size == queue_size_) {
+ // The existing buffers (if any) are still valid, so don't do anything.
+ return;
+ }
+ buffer_size_ = buffer_size;
+ queue_size_ = queue_size;
+ // The existing buffers (if any) are no longer valid since the buffer size or
+ // the queue size has changed. Clear the stack.
+ ResidualBufferStack buffers;
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ // Move the buffers in the stack to the local variable |buffers| and clear
+ // the stack.
+ buffers.Swap(&buffers_);
+ // Release mutex_ before freeing the buffers.
+ }
+ // As the local variable |buffers| goes out of scope, its destructor frees
+ // the buffers that were in the stack.
+}
+
+std::unique_ptr<ResidualBuffer> ResidualBufferPool::Get() {
+ std::unique_ptr<ResidualBuffer> buffer = nullptr;
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ buffer = buffers_.Pop();
+ }
+ if (buffer == nullptr) {
+ buffer = ResidualBuffer::Create(buffer_size_, queue_size_);
+ }
+ return buffer;
+}
+
+void ResidualBufferPool::Release(std::unique_ptr<ResidualBuffer> buffer) {
+ buffer->transform_parameters()->Clear();
+ buffer->partition_tree_order()->Clear();
+ std::lock_guard<std::mutex> lock(mutex_);
+ buffers_.Push(std::move(buffer));
+}
+
+size_t ResidualBufferPool::Size() const {
+ std::lock_guard<std::mutex> lock(mutex_);
+ return buffers_.Size();
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
+#define LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
+#include <new>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/queue.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// This class is used for parsing and decoding a superblock. Members of this
+// class are populated in the "parse" step and consumed in the "decode" step.
+class ResidualBuffer : public Allocable {
+ public:
+ static std::unique_ptr<ResidualBuffer> Create(size_t buffer_size,
+ int queue_size) {
+ std::unique_ptr<ResidualBuffer> buffer(new (std::nothrow) ResidualBuffer);
+ if (buffer != nullptr) {
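+ // The residual buffer is 32-byte aligned; the assumption is that this
+ // satisfies the widest SIMD loads/stores used by the dsp implementations.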
+ buffer->buffer_ = MakeAlignedUniquePtr<uint8_t>(32, buffer_size);
+ if (buffer->buffer_ == nullptr ||
+ !buffer->transform_parameters_.Init(queue_size) ||
+ !buffer->partition_tree_order_.Init(queue_size)) {
+ buffer = nullptr;
+ }
+ }
+ return buffer;
+ }
+
+ // Move only.
+ ResidualBuffer(ResidualBuffer&& other) = default;
+ ResidualBuffer& operator=(ResidualBuffer&& other) = default;
+
+ // Buffer used to store the residual values.
+ uint8_t* buffer() { return buffer_.get(); }
+ // Queue used to store the transform parameters.
+ Queue<TransformParameters>* transform_parameters() {
+ return &transform_parameters_;
+ }
+ // Queue used to store the block ordering in the partition tree of the
+ // superblocks.
+ Queue<PartitionTreeNode>* partition_tree_order() {
+ return &partition_tree_order_;
+ }
+
+ private:
+ friend class ResidualBufferStack;
+
+ ResidualBuffer() = default;
+
+ AlignedUniquePtr<uint8_t> buffer_;
+ Queue<TransformParameters> transform_parameters_;
+ Queue<PartitionTreeNode> partition_tree_order_;
+ // Used by ResidualBufferStack to form a chain of ResidualBuffers.
+ ResidualBuffer* next_ = nullptr;
+};
+
+// A LIFO stack of ResidualBuffers. Owns the buffers in the stack.
+class ResidualBufferStack {
+ public:
+ ResidualBufferStack() = default;
+
+ // Not copyable or movable.
+ ResidualBufferStack(const ResidualBufferStack&) = delete;
+ ResidualBufferStack& operator=(const ResidualBufferStack&) = delete;
+
+ ~ResidualBufferStack();
+
+ // Pushes |buffer| to the top of the stack.
+ void Push(std::unique_ptr<ResidualBuffer> buffer);
+
+ // If the stack is non-empty, returns the buffer at the top of the stack and
+ // removes it from the stack. If the stack is empty, returns nullptr.
+ std::unique_ptr<ResidualBuffer> Pop();
+
+ // Swaps the contents of this stack and |other|.
+ void Swap(ResidualBufferStack* other);
+
+ // Returns the number of buffers in the stack.
+ size_t Size() const { return num_buffers_; }
+
+ private:
+ // A singly-linked list of ResidualBuffers, chained together using the next_
+ // field of ResidualBuffer.
+ ResidualBuffer* top_ = nullptr;
+ size_t num_buffers_ = 0;
+};
+
+// Utility class used to manage the residual buffers (and the transform
+// parameters) used for multi-threaded decoding. This class uses a stack to
+// store the buffers for better cache locality, since buffers used more
+// recently are more likely to be in the cache. All functions in this class
+// are thread-safe.
+class ResidualBufferPool : public Allocable {
+ public:
+ ResidualBufferPool(bool use_128x128_superblock, int subsampling_x,
+ int subsampling_y, size_t residual_size);
+
+ // Recomputes |buffer_size_| and invalidates the existing buffers if
+ // necessary.
+ void Reset(bool use_128x128_superblock, int subsampling_x, int subsampling_y,
+ size_t residual_size);
+ // Gets a residual buffer. The buffer is guaranteed to be large enough to
+ // store the residual values for one superblock whose parameters match those
+ // passed to the constructor or to the last call to Reset(). If there are
+ // free buffers in the stack, one of them is returned; otherwise a new
+ // buffer is allocated.
+ std::unique_ptr<ResidualBuffer> Get();
+ // Returns |buffer| to the pool (by pushing it onto the stack). Subsequent
+ // calls to Get() may reuse this buffer.
+ void Release(std::unique_ptr<ResidualBuffer> buffer);
+
+ // Used only in the tests. Returns the number of buffers in the stack.
+ size_t Size() const;
+
+ private:
+ mutable std::mutex mutex_;
+ ResidualBufferStack buffers_ LIBGAV1_GUARDED_BY(mutex_);
+ size_t buffer_size_;
+ int queue_size_;
+};
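+
+// A minimal usage sketch (the parameter values are illustrative assumptions;
+// see residual_buffer_pool_test.cc for concrete usage):
+//   ResidualBufferPool pool(/*use_128x128_superblock=*/false,
+//                           /*subsampling_x=*/1, /*subsampling_y=*/1,
+//                           /*residual_size=*/sizeof(int16_t));
+//   std::unique_ptr<ResidualBuffer> buffer = pool.Get();
+//   // ... fill buffer->buffer() and buffer->transform_parameters() ...
+//   pool.Release(std::move(buffer));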
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/residual_buffer_pool.h"
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "src/utils/constants.h"
+#include "src/utils/queue.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(ResidualBufferTest, TestUsage) {
+ ResidualBufferPool pool(true, 1, 1, sizeof(int16_t));
+ EXPECT_EQ(pool.Size(), 0);
+ // Get one buffer.
+ std::unique_ptr<ResidualBuffer> buffer1 = pool.Get();
+ uint8_t* const buffer1_ptr = buffer1->buffer();
+ ASSERT_NE(buffer1_ptr, nullptr);
+ // Get another buffer (while holding on to the first one).
+ std::unique_ptr<ResidualBuffer> buffer2 = pool.Get();
+ uint8_t* const buffer2_ptr = buffer2->buffer();
+ ASSERT_NE(buffer2_ptr, nullptr);
+ EXPECT_NE(buffer1_ptr, buffer2_ptr);
+ // Return the second buffer.
+ pool.Release(std::move(buffer2));
+ EXPECT_EQ(pool.Size(), 1);
+ // Get another buffer (this one should be the same as buffer2).
+ std::unique_ptr<ResidualBuffer> buffer3 = pool.Get();
+ uint8_t* const buffer3_ptr = buffer3->buffer();
+ ASSERT_NE(buffer3_ptr, nullptr);
+ EXPECT_EQ(buffer3_ptr, buffer2_ptr);
+ EXPECT_EQ(pool.Size(), 0);
+ // Get another buffer (this one will be a new buffer).
+ std::unique_ptr<ResidualBuffer> buffer4 = pool.Get();
+ uint8_t* const buffer4_ptr = buffer4->buffer();
+ ASSERT_NE(buffer4_ptr, nullptr);
+ EXPECT_NE(buffer4_ptr, buffer1_ptr);
+ EXPECT_NE(buffer4_ptr, buffer3_ptr);
+ EXPECT_EQ(pool.Size(), 0);
+ // Return all the buffers.
+ pool.Release(std::move(buffer1));
+ EXPECT_EQ(pool.Size(), 1);
+ pool.Release(std::move(buffer3));
+ EXPECT_EQ(pool.Size(), 2);
+ pool.Release(std::move(buffer4));
+ EXPECT_EQ(pool.Size(), 3);
+ // Reset the pool with the same parameters.
+ pool.Reset(true, 1, 1, sizeof(int16_t));
+ EXPECT_EQ(pool.Size(), 3);
+ // Reset the pool with different parameters; the buffer size changes.
+ pool.Reset(true, 0, 1, sizeof(int32_t));
+ // The existing buffers should now have been invalidated.
+ EXPECT_EQ(pool.Size(), 0);
+ // Get and return a buffer.
+ std::unique_ptr<ResidualBuffer> buffer5 = pool.Get();
+ uint8_t* const buffer5_ptr = buffer5->buffer();
+ ASSERT_NE(buffer5_ptr, nullptr);
+ pool.Release(std::move(buffer5));
+ EXPECT_EQ(pool.Size(), 1);
+ // Reset the pool with a different value for use_128x128_superblock.
+ pool.Reset(false, 0, 1, sizeof(int32_t));
+ // The existing buffers should now have been invalidated.
+ EXPECT_EQ(pool.Size(), 0);
+}
+
+TEST(ResidualBufferTest, TestQueue) {
+ ResidualBufferPool pool(true, 1, 1, sizeof(int16_t));
+ EXPECT_EQ(pool.Size(), 0);
+ // Get one buffer.
+ std::unique_ptr<ResidualBuffer> buffer1 = pool.Get();
+ uint8_t* const buffer1_ptr = buffer1->buffer();
+ ASSERT_NE(buffer1_ptr, nullptr);
+ auto* queue1 = buffer1->transform_parameters();
+ queue1->Push(TransformParameters(kTransformTypeAdstAdst, 10));
+ EXPECT_EQ(queue1->Size(), 1);
+ EXPECT_EQ(queue1->Front().type, kTransformTypeAdstAdst);
+ EXPECT_EQ(queue1->Front().non_zero_coeff_count, 10);
+ queue1->Push(TransformParameters(kTransformTypeDctDct, 20));
+ EXPECT_EQ(queue1->Size(), 2);
+ EXPECT_EQ(queue1->Front().type, kTransformTypeAdstAdst);
+ EXPECT_EQ(queue1->Front().non_zero_coeff_count, 10);
+ queue1->Pop();
+ EXPECT_EQ(queue1->Size(), 1);
+ EXPECT_EQ(queue1->Front().type, kTransformTypeDctDct);
+ EXPECT_EQ(queue1->Front().non_zero_coeff_count, 20);
+ // Return the buffer.
+ pool.Release(std::move(buffer1));
+ EXPECT_EQ(pool.Size(), 1);
+ // Get another buffer (should be the same as buffer1).
+ std::unique_ptr<ResidualBuffer> buffer2 = pool.Get();
+ uint8_t* const buffer2_ptr = buffer2->buffer();
+ ASSERT_NE(buffer2_ptr, nullptr);
+ EXPECT_EQ(buffer1_ptr, buffer2_ptr);
+ // Releasing the buffer should've cleared the queue.
+ EXPECT_EQ(buffer2->transform_parameters()->Size(), 0);
+}
+
+TEST(ResidualBufferTest, TestStackPushPop) {
+ ResidualBufferStack buffers;
+ EXPECT_EQ(buffers.Size(), 0);
+ EXPECT_EQ(buffers.Pop(), nullptr);
+
+ std::unique_ptr<ResidualBuffer> buffer0 = ResidualBuffer::Create(128, 128);
+ ResidualBuffer* const buffer0_ptr = buffer0.get();
+ EXPECT_NE(buffer0_ptr, nullptr);
+ std::unique_ptr<ResidualBuffer> buffer1 = ResidualBuffer::Create(128, 128);
+ ResidualBuffer* const buffer1_ptr = buffer1.get();
+ EXPECT_NE(buffer1_ptr, nullptr);
+ std::unique_ptr<ResidualBuffer> buffer2 = ResidualBuffer::Create(128, 128);
+ ResidualBuffer* const buffer2_ptr = buffer2.get();
+ EXPECT_NE(buffer2_ptr, nullptr);
+
+ // Push two buffers onto the stack.
+ buffers.Push(std::move(buffer0));
+ EXPECT_EQ(buffers.Size(), 1);
+ buffers.Push(std::move(buffer1));
+ EXPECT_EQ(buffers.Size(), 2);
+
+ // Pop one buffer off the stack.
+ std::unique_ptr<ResidualBuffer> top = buffers.Pop();
+ EXPECT_EQ(buffers.Size(), 1);
+ EXPECT_EQ(top.get(), buffer1_ptr);
+
+ // Push one buffer onto the stack.
+ buffers.Push(std::move(buffer2));
+ EXPECT_EQ(buffers.Size(), 2);
+
+ // Pop two buffers off the stack.
+ top = buffers.Pop();
+ EXPECT_EQ(buffers.Size(), 1);
+ EXPECT_EQ(top.get(), buffer2_ptr);
+ top = buffers.Pop();
+ EXPECT_EQ(buffers.Size(), 0);
+ EXPECT_EQ(top.get(), buffer0_ptr);
+
+ // Try to pop a buffer off an empty stack.
+ top = buffers.Pop();
+ EXPECT_EQ(buffers.Size(), 0);
+ EXPECT_EQ(top, nullptr);
+}
+
+TEST(ResidualBufferTest, TestStackSwap) {
+ ResidualBufferStack buffers;
+ EXPECT_EQ(buffers.Size(), 0);
+ EXPECT_EQ(buffers.Pop(), nullptr);
+
+ std::unique_ptr<ResidualBuffer> buffer0 = ResidualBuffer::Create(128, 128);
+ ResidualBuffer* const buffer0_ptr = buffer0.get();
+ EXPECT_NE(buffer0_ptr, nullptr);
+ std::unique_ptr<ResidualBuffer> buffer1 = ResidualBuffer::Create(128, 128);
+ ResidualBuffer* const buffer1_ptr = buffer1.get();
+ EXPECT_NE(buffer1_ptr, nullptr);
+ std::unique_ptr<ResidualBuffer> buffer2 = ResidualBuffer::Create(128, 128);
+ ResidualBuffer* const buffer2_ptr = buffer2.get();
+ EXPECT_NE(buffer2_ptr, nullptr);
+
+ // Push three buffers onto the stack.
+ buffers.Push(std::move(buffer0));
+ EXPECT_EQ(buffers.Size(), 1);
+ buffers.Push(std::move(buffer1));
+ EXPECT_EQ(buffers.Size(), 2);
+ buffers.Push(std::move(buffer2));
+ EXPECT_EQ(buffers.Size(), 3);
+
+ // Swap the contents of the stacks.
+ ResidualBufferStack swapped;
+ swapped.Swap(&buffers);
+ EXPECT_EQ(buffers.Size(), 0);
+ EXPECT_EQ(swapped.Size(), 3);
+
+ // Pop three buffers off the swapped stack.
+ std::unique_ptr<ResidualBuffer> top = swapped.Pop();
+ EXPECT_EQ(swapped.Size(), 2);
+ EXPECT_EQ(top.get(), buffer2_ptr);
+ top = swapped.Pop();
+ EXPECT_EQ(swapped.Size(), 1);
+ EXPECT_EQ(top.get(), buffer1_ptr);
+ top = swapped.Pop();
+ EXPECT_EQ(swapped.Size(), 0);
+ EXPECT_EQ(top.get(), buffer0_ptr);
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains all the scan order tables.
+
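+// The default scans traverse the block diagonally in a zig-zag order starting
+// from the DC coefficient; e.g. kDefaultScan4x4 visits raster positions
+// 0, 1, 4, 8, 5, 2, ... The column scans are vertical rasters and the row
+// scans are horizontal rasters.
+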
+constexpr uint16_t kDefaultScan4x4[16] = {0, 1, 4, 8, 5, 2, 3, 6,
+ 9, 12, 13, 10, 7, 11, 14, 15};
+
+constexpr uint16_t kColumnScan4x4[16] = {0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15};
+
+constexpr uint16_t kRowScan4x4[16] = {0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15};
+
+constexpr uint16_t kDefaultScan4x8[32] = {
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31};
+
+constexpr uint16_t kColumnScan4x8[32] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+ 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31};
+
+constexpr uint16_t kRowScan4x8[32] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+constexpr uint16_t kDefaultScan8x4[32] = {
+ 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19,
+ 12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31};
+
+constexpr uint16_t kColumnScan8x4[32] = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31};
+
+constexpr uint16_t kRowScan8x4[32] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+constexpr uint16_t kDefaultScan8x8[64] = {
+ 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63};
+
+constexpr uint16_t kColumnScan8x8[64] = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63};
+
+constexpr uint16_t kRowScan8x8[64] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+
+constexpr uint16_t kDefaultScan8x16[128] = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110,
+ 117, 124, 111, 118, 125, 119, 126, 127};
+
+constexpr uint16_t kColumnScan8x16[128] = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+ 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
+ 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
+ 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
+ 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
+ 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
+ 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
+ 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127};
+
+constexpr uint16_t kRowScan8x16[128] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127};
+
+constexpr uint16_t kDefaultScan16x8[128] = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80,
+ 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67,
+ 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69,
+ 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71,
+ 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73,
+ 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75,
+ 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62,
+ 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127};
+
+constexpr uint16_t kColumnScan16x8[128] = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113,
+ 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115,
+ 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117,
+ 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119,
+ 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121,
+ 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123,
+ 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125,
+ 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127};
+
+constexpr uint16_t kRowScan16x8[128] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127};
+
+constexpr uint16_t kDefaultScan16x16[256] = {
+ 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4,
+ 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22,
+ 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100,
+ 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27,
+ 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46,
+ 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242,
+ 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94,
+ 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185,
+ 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203,
+ 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
+ 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254,
+ 255};
+
+constexpr uint16_t kColumnScan16x16[256] = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+ 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+ 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+ 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+ 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+ 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+ 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+ 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+ 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+ 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+ 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+ 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+ 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255};
+
+constexpr uint16_t kRowScan16x16[256] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255};
+
+constexpr uint16_t kDefaultScan16x32[512] = {
+ 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64,
+ 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22,
+ 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70,
+ 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177,
+ 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226,
+ 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227,
+ 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228,
+ 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229,
+ 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230,
+ 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232,
+ 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233,
+ 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234,
+ 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235,
+ 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236,
+ 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237,
+ 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238,
+ 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239,
+ 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464,
+ 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465,
+ 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466,
+ 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467,
+ 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483,
+ 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335,
+ 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396,
+ 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472,
+ 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
+ 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476,
+ 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495,
+ 510, 511};
+
+constexpr uint16_t kDefaultScan32x16[512] = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4,
+ 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193,
+ 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8,
+ 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196,
+ 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104,
+ 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43,
+ 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13,
+ 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14,
+ 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46,
+ 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78,
+ 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110,
+ 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142,
+ 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174,
+ 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206,
+ 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238,
+ 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270,
+ 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302,
+ 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334,
+ 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366,
+ 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398,
+ 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430,
+ 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462,
+ 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494,
+ 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29,
+ 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61,
+ 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93,
+ 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125,
+ 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126,
+ 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500,
+ 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408,
+ 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285,
+ 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411,
+ 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413,
+ 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510,
+ 479, 511};
+
+constexpr uint16_t kDefaultScan32x32[1024] = {
+ 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 128, 97, 66,
+ 35, 4, 5, 36, 67, 98, 129, 160, 192, 161, 130, 99, 68,
+ 37, 6, 7, 38, 69, 100, 131, 162, 193, 224, 256, 225, 194,
+ 163, 132, 101, 70, 39, 8, 9, 40, 71, 102, 133, 164, 195,
+ 226, 257, 288, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41,
+ 10, 11, 42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352,
+ 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12,
+ 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385,
+ 416, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107,
+ 76, 45, 14, 15, 46, 77, 108, 139, 170, 201, 232, 263, 294,
+ 325, 356, 387, 418, 449, 480, 512, 481, 450, 419, 388, 357, 326,
+ 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 17, 48, 79,
+ 110, 141, 172, 203, 234, 265, 296, 327, 358, 389, 420, 451, 482,
+ 513, 544, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266,
+ 235, 204, 173, 142, 111, 80, 49, 18, 19, 50, 81, 112, 143,
+ 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 515, 546,
+ 577, 608, 640, 609, 578, 547, 516, 485, 454, 423, 392, 361, 330,
+ 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 21, 52, 83,
+ 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486,
+ 517, 548, 579, 610, 641, 672, 704, 673, 642, 611, 580, 549, 518,
+ 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115,
+ 84, 53, 22, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302,
+ 333, 364, 395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705,
+ 736, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427,
+ 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24,
+ 25, 56, 87, 118, 149, 180, 211, 242, 273, 304, 335, 366, 397,
+ 428, 459, 490, 521, 552, 583, 614, 645, 676, 707, 738, 769, 800,
+ 832, 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460,
+ 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57,
+ 26, 27, 58, 89, 120, 151, 182, 213, 244, 275, 306, 337, 368,
+ 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771,
+ 802, 833, 864, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617,
+ 586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214,
+ 183, 152, 121, 90, 59, 28, 29, 60, 91, 122, 153, 184, 215,
+ 246, 277, 308, 339, 370, 401, 432, 463, 494, 525, 556, 587, 618,
+ 649, 680, 711, 742, 773, 804, 835, 866, 897, 928, 960, 929, 898,
+ 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526, 495,
+ 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92,
+ 61, 30, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341,
+ 372, 403, 434, 465, 496, 527, 558, 589, 620, 651, 682, 713, 744,
+ 775, 806, 837, 868, 899, 930, 961, 992, 993, 962, 931, 900, 869,
+ 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, 497, 466,
+ 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63,
+ 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467,
+ 498, 529, 560, 591, 622, 653, 684, 715, 746, 777, 808, 839, 870,
+ 901, 932, 963, 994, 995, 964, 933, 902, 871, 840, 809, 778, 747,
+ 716, 685, 654, 623, 592, 561, 530, 499, 468, 437, 406, 375, 344,
+ 313, 282, 251, 220, 189, 158, 127, 159, 190, 221, 252, 283, 314,
+ 345, 376, 407, 438, 469, 500, 531, 562, 593, 624, 655, 686, 717,
+ 748, 779, 810, 841, 872, 903, 934, 965, 996, 997, 966, 935, 904,
+ 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, 501,
+ 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 223, 254, 285,
+ 316, 347, 378, 409, 440, 471, 502, 533, 564, 595, 626, 657, 688,
+ 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 999, 968, 937,
+ 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, 565, 534,
+ 503, 472, 441, 410, 379, 348, 317, 286, 255, 287, 318, 349, 380,
+ 411, 442, 473, 504, 535, 566, 597, 628, 659, 690, 721, 752, 783,
+ 814, 845, 876, 907, 938, 969, 1000, 1001, 970, 939, 908, 877, 846,
+ 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, 474, 443,
+ 412, 381, 350, 319, 351, 382, 413, 444, 475, 506, 537, 568, 599,
+ 630, 661, 692, 723, 754, 785, 816, 847, 878, 909, 940, 971, 1002,
+ 1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, 662, 631,
+ 600, 569, 538, 507, 476, 445, 414, 383, 415, 446, 477, 508, 539,
+ 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, 911, 942,
+ 973, 1004, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695,
+ 664, 633, 602, 571, 540, 509, 478, 447, 479, 510, 541, 572, 603,
+ 634, 665, 696, 727, 758, 789, 820, 851, 882, 913, 944, 975, 1006,
+ 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, 666, 635,
+ 604, 573, 542, 511, 543, 574, 605, 636, 667, 698, 729, 760, 791,
+ 822, 853, 884, 915, 946, 977, 1008, 1009, 978, 947, 916, 885, 854,
+ 823, 792, 761, 730, 699, 668, 637, 606, 575, 607, 638, 669, 700,
+ 731, 762, 793, 824, 855, 886, 917, 948, 979, 1010, 1011, 980, 949,
+ 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 671, 702, 733,
+ 764, 795, 826, 857, 888, 919, 950, 981, 1012, 1013, 982, 951, 920,
+ 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890,
+ 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767,
+ 799, 830, 861, 892, 923, 954, 985, 1016, 1017, 986, 955, 924, 893,
+ 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895,
+ 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023};
+
+constexpr uint16_t kDefaultScan4x16[64] = {
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
+ 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
+ 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63};
+
+constexpr uint16_t kColumnScan4x16[64] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+ 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+ 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63};
+
+constexpr uint16_t kRowScan4x16[64] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+
+constexpr uint16_t kDefaultScan16x4[64] = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35,
+ 20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39,
+ 24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43,
+ 28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63};
+
+constexpr uint16_t kColumnScan16x4[64] = {
+ 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51,
+ 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55,
+ 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+ 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63};
+
+constexpr uint16_t kRowScan16x4[64] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
+
+constexpr uint16_t kDefaultScan8x32[256] = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116,
+ 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125,
+ 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134,
+ 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143,
+ 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200,
+ 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209,
+ 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218,
+ 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227,
+ 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243,
+ 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254,
+ 255};
+
+constexpr uint16_t kDefaultScan32x8[256] = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4,
+ 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193,
+ 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226,
+ 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10,
+ 228, 197, 166, 135, 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43,
+ 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76,
+ 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109,
+ 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142,
+ 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175,
+ 144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208,
+ 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241,
+ 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25,
+ 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58,
+ 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91,
+ 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124,
+ 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126,
+ 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223,
+ 255};
+
+// 5.11.41 (implemented as a simple lookup keyed on transform class and
+// transform size).
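+// For example, kScan[kTransformClass2D][kTransformSize4x4] is
+// kDefaultScan4x4, whose first entry (scan position 0) is the DC coefficient
+// at raster position 0.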
+const uint16_t* kScan[3][kNumTransformSizes] = {
+ // kTransformClass2D
+ {kDefaultScan4x4, kDefaultScan4x8, kDefaultScan4x16, kDefaultScan8x4,
+ kDefaultScan8x8, kDefaultScan8x16, kDefaultScan8x32, kDefaultScan16x4,
+ kDefaultScan16x8, kDefaultScan16x16, kDefaultScan16x32, kDefaultScan16x32,
+ kDefaultScan32x8, kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32,
+ kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32},
+ // kTransformClassHorizontal
+ {kColumnScan4x4, kColumnScan4x8, kColumnScan4x16, kColumnScan8x4,
+ kColumnScan8x8, kColumnScan8x16, kColumnScan16x4, kColumnScan16x4,
+ kColumnScan16x8, kColumnScan16x16, kColumnScan16x4, kDefaultScan16x32,
+ kColumnScan16x4, kColumnScan16x4, kColumnScan16x4, kDefaultScan32x32,
+ kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32},
+ // kTransformClassVertical
+ {kRowScan4x4, kRowScan4x8, kRowScan4x16, kRowScan8x4, kRowScan8x8,
+ kRowScan8x16, kRowScan16x4, kRowScan16x4, kRowScan16x8, kRowScan16x16,
+ kRowScan16x4, kDefaultScan16x32, kRowScan16x4, kRowScan16x4, kRowScan16x4,
+ kDefaultScan32x32, kDefaultScan32x16, kDefaultScan32x32,
+ kDefaultScan32x32}};
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <tuple>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/scan_tables.inc"
+
+class ScanOrderTest
+ : public testing::TestWithParam<std::tuple<TransformClass, TransformSize>> {
+ public:
+ ScanOrderTest() = default;
+ ScanOrderTest(const ScanOrderTest&) = delete;
+ ScanOrderTest& operator=(const ScanOrderTest&) = delete;
+ ~ScanOrderTest() override = default;
+
+ protected:
+ TransformClass tx_class_ = std::get<0>(GetParam());
+ TransformSize tx_size_ = std::get<1>(GetParam());
+};
+
+TEST_P(ScanOrderTest, AllIndicesAreScannedExactlyOnce) {
+ const int tx_width = kTransformWidth[tx_size_];
+ const int tx_height = kTransformHeight[tx_size_];
+ int num_indices;
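+ // The scans cover at most the top-left 32x32 quadrant of a block, so
+ // 64-point dimensions are clamped to 32. For the horizontal/vertical scan
+ // classes with a largest dimension of 32, only the first 64 coefficients
+ // are scanned.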
+ if (tx_class_ == kTransformClass2D || std::max(tx_width, tx_height) == 64) {
+ const int clamped_tx_width = std::min(32, tx_width);
+ const int clamped_tx_height = std::min(32, tx_height);
+ num_indices = clamped_tx_width * clamped_tx_height;
+ } else {
+ num_indices =
+ (std::max(tx_width, tx_height) > 16) ? 64 : tx_width * tx_height;
+ }
+ const uint16_t* const scan = kScan[tx_class_][tx_size_];
+ ASSERT_NE(scan, nullptr);
+ // Ensure that all the indices are scanned exactly once.
+ std::vector<int> scanned;
+ scanned.resize(num_indices);
+ for (int i = 0; i < num_indices; ++i) {
+ scanned[scan[i]]++;
+ }
+ EXPECT_THAT(scanned, testing::Each(1));
+}
+
+constexpr TransformClass kTestTransformClasses[] = {
+ kTransformClass2D, kTransformClassVertical, kTransformClassHorizontal};
+
+constexpr TransformSize kTestTransformSizes[] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+ kTransformSize64x64};
+
+INSTANTIATE_TEST_SUITE_P(
+ C, ScanOrderTest,
+ testing::Combine(testing::ValuesIn(kTestTransformClasses),
+ testing::ValuesIn(kTestTransformSizes)));
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/status_code.h"
+
+extern "C" {
+
+const char* Libgav1GetErrorString(Libgav1StatusCode status) {
+ switch (status) {
+ case kLibgav1StatusOk:
+ return "Success.";
+ case kLibgav1StatusUnknownError:
+ return "Unknown error.";
+ case kLibgav1StatusInvalidArgument:
+ return "Invalid function argument.";
+ case kLibgav1StatusOutOfMemory:
+ return "Memory allocation failure.";
+ case kLibgav1StatusResourceExhausted:
+ return "Ran out of a resource (other than memory).";
+ case kLibgav1StatusNotInitialized:
+ return "The object is not initialized.";
+ case kLibgav1StatusAlready:
+ return "An operation that can only be performed once has already been "
+ "performed.";
+ case kLibgav1StatusUnimplemented:
+ return "Not implemented.";
+ case kLibgav1StatusInternalError:
+ return "Internal error in libgav1.";
+ case kLibgav1StatusBitstreamError:
+ return "The bitstream is not encoded correctly or violates a bitstream "
+ "conformance requirement.";
+ case kLibgav1StatusTryAgain:
+ return "The operation is not allowed at the moment. Try again later.";
+ case kLibgav1StatusNothingToDequeue:
+ return "There are no enqueued frames, so there is nothing to dequeue. "
+ "Try enqueuing a frame before trying to dequeue again.";
+ // This switch statement does not have a default case. This way the compiler
+ // will warn if we neglect to update this function after adding a new value
+ // to the Libgav1StatusCode enum type.
+ case kLibgav1StatusReservedForFutureExpansionUseDefaultInSwitchInstead_:
+ break;
+ }
+ return "Unrecognized status code.";
+}
+
+} // extern "C"
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/symbol_decoder_context.h"
+
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/symbol_decoder_context_cdfs.inc"
+
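+// Buckets |base_quantizer_index| (0-255) into one of four contexts, used to
+// select among the quantizer-dependent default coefficient CDF tables.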
+uint8_t GetQuantizerContext(int base_quantizer_index) {
+ if (base_quantizer_index <= 20) return 0;
+ if (base_quantizer_index <= 60) return 1;
+ if (base_quantizer_index <= 120) return 2;
+ return 3;
+}
+
+// Reset*Counters() are helper functions to reset the CDF arrays where the
+// counters are not in the last element of the innermost dimension.
+
+void ResetPartitionCounters(SymbolDecoderContext* const context) {
+ int block_size_log2 = k4x4WidthLog2[kBlock8x8];
+ for (auto& d1 : context->partition_cdf) {
+ const int cdf_size =
+ SymbolDecoderContext::PartitionCdfSize(block_size_log2++);
+ for (auto& d2 : d1) {
+ d2[cdf_size] = 0;
+ }
+ }
+}
+
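+// The palette color index CDFs are grouped by palette size: the entry at
+// middle-dimension index i corresponds to a palette of (kMinPaletteSize + i)
+// colors and uses that many symbols, so the counter position grows with i.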
+void ResetPaletteColorIndexCounters(SymbolDecoderContext* const context) {
+ for (auto& d1 : context->palette_color_index_cdf) {
+ int cdf_size = kMinPaletteSize;
+ for (auto& d2 : d1) {
+ for (auto& d3 : d2) {
+ d3[cdf_size] = 0;
+ }
+ ++cdf_size;
+ }
+ }
+}
+
+void ResetTxTypeCounters(SymbolDecoderContext* const context) {
+ int set_index = kTransformSetIntra1;
+ for (auto& d1 : context->intra_tx_type_cdf) {
+ const int cdf_size = kNumTransformTypesInSet[set_index++];
+ for (auto& d2 : d1) {
+ for (auto& d3 : d2) {
+ d3[cdf_size] = 0;
+ }
+ }
+ }
+ for (auto& d1 : context->inter_tx_type_cdf) {
+ const int cdf_size = kNumTransformTypesInSet[set_index++];
+ for (auto& d2 : d1) {
+ d2[cdf_size] = 0;
+ }
+ }
+}
+
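+// The first tx_depth CDF class uses one symbol fewer than the others, hence
+// the initial |delta| of 1 below.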
+void ResetTxDepthCounters(SymbolDecoderContext* const context) {
+ int delta = 1;
+ for (auto& d1 : context->tx_depth_cdf) {
+ const int cdf_size = kMaxTxDepthSymbolCount - delta;
+ delta = 0;
+ for (auto& d2 : d1) {
+ d2[cdf_size] = 0;
+ }
+ }
+}
+
+void ResetUVModeCounters(SymbolDecoderContext* const context) {
+ int cdf_size = kIntraPredictionModesUV - 1;
+ for (auto& d1 : context->uv_mode_cdf) {
+ for (auto& d2 : d1) {
+ d2[cdf_size] = 0;
+ }
+ ++cdf_size;
+ }
+}
+
+} // namespace
+
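+// Copies an entire default CDF table into its context member. The
+// static_assert guarantees at compile time that source and destination have
+// the same byte size, so the memcpy can neither truncate nor overrun.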
+#define CDF_COPY(source, destination) \
+ static_assert(sizeof(source) == sizeof(destination), ""); \
+ memcpy(destination, source, sizeof(source))
+
+void SymbolDecoderContext::Initialize(int base_quantizer_index) {
+ CDF_COPY(kDefaultPartitionCdf, partition_cdf);
+ CDF_COPY(kDefaultSkipCdf, skip_cdf);
+ CDF_COPY(kDefaultSkipModeCdf, skip_mode_cdf);
+ CDF_COPY(kDefaultSegmentIdCdf, segment_id_cdf);
+ CDF_COPY(kDefaultUsePredictedSegmentIdCdf, use_predicted_segment_id_cdf);
+ CDF_COPY(kDefaultDeltaQCdf, delta_q_cdf);
+ CDF_COPY(kDefaultDeltaQCdf, delta_lf_cdf);
+ for (auto& delta_lf_multi_cdf_entry : delta_lf_multi_cdf) {
+ CDF_COPY(kDefaultDeltaQCdf, delta_lf_multi_cdf_entry);
+ }
+ CDF_COPY(kDefaultIntraBlockCopyCdf, intra_block_copy_cdf);
+ CDF_COPY(kDefaultIntraFrameYModeCdf, intra_frame_y_mode_cdf);
+ CDF_COPY(kDefaultYModeCdf, y_mode_cdf);
+ CDF_COPY(kDefaultAngleDeltaCdf, angle_delta_cdf);
+ CDF_COPY(kDefaultUVModeCdf, uv_mode_cdf);
+ CDF_COPY(kDefaultCflAlphaSignsCdf, cfl_alpha_signs_cdf);
+ CDF_COPY(kDefaultCflAlphaCdf, cfl_alpha_cdf);
+ CDF_COPY(kDefaultUseFilterIntraCdf, use_filter_intra_cdf);
+ CDF_COPY(kDefaultFilterIntraModeCdf, filter_intra_mode_cdf);
+ CDF_COPY(kDefaultTxDepthCdf, tx_depth_cdf);
+ CDF_COPY(kDefaultTxSplitCdf, tx_split_cdf);
+ CDF_COPY(kDefaultInterTxTypeCdf, inter_tx_type_cdf);
+ CDF_COPY(kDefaultIntraTxTypeCdf, intra_tx_type_cdf);
+ CDF_COPY(kDefaultRestorationTypeCdf, restoration_type_cdf);
+ CDF_COPY(kDefaultUseWienerCdf, use_wiener_cdf);
+ CDF_COPY(kDefaultUseSgrProjCdf, use_sgrproj_cdf);
+ CDF_COPY(kDefaultHasPaletteYCdf, has_palette_y_cdf);
+ CDF_COPY(kDefaultPaletteYSizeCdf, palette_y_size_cdf);
+ CDF_COPY(kDefaultHasPaletteUVCdf, has_palette_uv_cdf);
+ CDF_COPY(kDefaultPaletteUVSizeCdf, palette_uv_size_cdf);
+ CDF_COPY(kDefaultPaletteColorIndexCdf, palette_color_index_cdf);
+ CDF_COPY(kDefaultIsInterCdf, is_inter_cdf);
+ CDF_COPY(kDefaultUseCompoundReferenceCdf, use_compound_reference_cdf);
+ CDF_COPY(kDefaultCompoundReferenceTypeCdf, compound_reference_type_cdf);
+ CDF_COPY(kDefaultCompoundReferenceCdf, compound_reference_cdf);
+ CDF_COPY(kDefaultCompoundBackwardReferenceCdf,
+ compound_backward_reference_cdf);
+ CDF_COPY(kDefaultSingleReferenceCdf, single_reference_cdf);
+ CDF_COPY(kDefaultCompoundPredictionModeCdf, compound_prediction_mode_cdf);
+ CDF_COPY(kDefaultNewMvCdf, new_mv_cdf);
+ CDF_COPY(kDefaultZeroMvCdf, zero_mv_cdf);
+ CDF_COPY(kDefaultReferenceMvCdf, reference_mv_cdf);
+ CDF_COPY(kDefaultRefMvIndexCdf, ref_mv_index_cdf);
+ CDF_COPY(kDefaultIsInterIntraCdf, is_inter_intra_cdf);
+ CDF_COPY(kDefaultInterIntraModeCdf, inter_intra_mode_cdf);
+ CDF_COPY(kDefaultIsWedgeInterIntraCdf, is_wedge_inter_intra_cdf);
+ CDF_COPY(kDefaultWedgeIndexCdf, wedge_index_cdf);
+ CDF_COPY(kDefaultUseObmcCdf, use_obmc_cdf);
+ CDF_COPY(kDefaultMotionModeCdf, motion_mode_cdf);
+ CDF_COPY(kDefaultIsExplicitCompoundTypeCdf, is_explicit_compound_type_cdf);
+ CDF_COPY(kDefaultIsCompoundTypeAverageCdf, is_compound_type_average_cdf);
+ CDF_COPY(kDefaultCompoundTypeCdf, compound_type_cdf);
+ CDF_COPY(kDefaultInterpolationFilterCdf, interpolation_filter_cdf);
+ for (int i = 0; i < kMvContexts; ++i) {
+ CDF_COPY(kDefaultMvJointCdf, mv_joint_cdf[i]);
+ for (int j = 0; j < kNumMvComponents; ++j) {
+ CDF_COPY(kDefaultMvSignCdf, mv_sign_cdf[i][j]);
+ CDF_COPY(kDefaultMvClassCdf, mv_class_cdf[i][j]);
+ CDF_COPY(kDefaultMvClass0BitCdf, mv_class0_bit_cdf[i][j]);
+ CDF_COPY(kDefaultMvClass0FractionCdf, mv_class0_fraction_cdf[i][j]);
+ CDF_COPY(kDefaultMvClass0HighPrecisionCdf,
+ mv_class0_high_precision_cdf[i][j]);
+ CDF_COPY(kDefaultMvBitCdf, mv_bit_cdf[i][j]);
+ CDF_COPY(kDefaultMvFractionCdf, mv_fraction_cdf[i][j]);
+ CDF_COPY(kDefaultMvHighPrecisionCdf, mv_high_precision_cdf[i][j]);
+ }
+ }
+ const int quantizer_context = GetQuantizerContext(base_quantizer_index);
+ CDF_COPY(kDefaultAllZeroCdf[quantizer_context], all_zero_cdf);
+ CDF_COPY(kDefaultEobPt16Cdf[quantizer_context], eob_pt_16_cdf);
+ CDF_COPY(kDefaultEobPt32Cdf[quantizer_context], eob_pt_32_cdf);
+ CDF_COPY(kDefaultEobPt64Cdf[quantizer_context], eob_pt_64_cdf);
+ CDF_COPY(kDefaultEobPt128Cdf[quantizer_context], eob_pt_128_cdf);
+ CDF_COPY(kDefaultEobPt256Cdf[quantizer_context], eob_pt_256_cdf);
+ CDF_COPY(kDefaultEobPt512Cdf[quantizer_context], eob_pt_512_cdf);
+ CDF_COPY(kDefaultEobPt1024Cdf[quantizer_context], eob_pt_1024_cdf);
+ CDF_COPY(kDefaultEobExtraCdf[quantizer_context], eob_extra_cdf);
+ CDF_COPY(kDefaultCoeffBaseEobCdf[quantizer_context], coeff_base_eob_cdf);
+ CDF_COPY(kDefaultCoeffBaseCdf[quantizer_context], coeff_base_cdf);
+ CDF_COPY(kDefaultCoeffBaseRangeCdf[quantizer_context], coeff_base_range_cdf);
+ CDF_COPY(kDefaultDcSignCdf[quantizer_context], dc_sign_cdf);
+}
+
+void SymbolDecoderContext::ResetIntraFrameYModeCdf() {
+ CDF_COPY(kDefaultIntraFrameYModeCdf, intra_frame_y_mode_cdf);
+}
+
+#undef CDF_COPY
+
+// These macros set the last element (the symbol counter) in the innermost
+// dimension of the array to zero.
+#define RESET_COUNTER_1D(array) \
+ do { \
+ (array)[std::extent<decltype(array), 0>::value - 1] = 0; \
+ } while (false)
+
+#define RESET_COUNTER_2D(array) \
+ do { \
+ for (auto& d1 : (array)) { \
+ d1[std::extent<decltype(array), 1>::value - 1] = 0; \
+ } \
+ } while (false)
+
+#define RESET_COUNTER_3D(array) \
+ do { \
+ for (auto& d1 : (array)) { \
+ for (auto& d2 : d1) { \
+ d2[std::extent<decltype(array), 2>::value - 1] = 0; \
+ } \
+ } \
+ } while (false)
+
+#define RESET_COUNTER_4D(array) \
+ do { \
+ for (auto& d1 : (array)) { \
+ for (auto& d2 : d1) { \
+ for (auto& d3 : d2) { \
+ d3[std::extent<decltype(array), 3>::value - 1] = 0; \
+ } \
+ } \
+ } \
+ } while (false)
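+
+// For example (illustrative expansion), RESET_COUNTER_2D(skip_cdf) zeroes
+// skip_cdf[i][kBooleanFieldCdfSize - 1] for every context i, i.e. the symbol
+// counter kept in the final slot of each boolean CDF.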
+
+void SymbolDecoderContext::ResetCounters() {
+ ResetPartitionCounters(this);
+ RESET_COUNTER_2D(segment_id_cdf);
+ RESET_COUNTER_2D(use_predicted_segment_id_cdf);
+ RESET_COUNTER_2D(skip_cdf);
+ RESET_COUNTER_2D(skip_mode_cdf);
+ RESET_COUNTER_1D(delta_q_cdf);
+ RESET_COUNTER_1D(delta_lf_cdf);
+ RESET_COUNTER_2D(delta_lf_multi_cdf);
+ RESET_COUNTER_1D(intra_block_copy_cdf);
+ RESET_COUNTER_3D(intra_frame_y_mode_cdf);
+ RESET_COUNTER_2D(y_mode_cdf);
+ RESET_COUNTER_2D(angle_delta_cdf);
+ ResetUVModeCounters(this);
+ RESET_COUNTER_1D(cfl_alpha_signs_cdf);
+ RESET_COUNTER_2D(cfl_alpha_cdf);
+ RESET_COUNTER_2D(use_filter_intra_cdf);
+ RESET_COUNTER_1D(filter_intra_mode_cdf);
+ ResetTxDepthCounters(this);
+ RESET_COUNTER_2D(tx_split_cdf);
+ RESET_COUNTER_3D(all_zero_cdf);
+ ResetTxTypeCounters(this);
+ RESET_COUNTER_3D(eob_pt_16_cdf);
+ RESET_COUNTER_3D(eob_pt_32_cdf);
+ RESET_COUNTER_3D(eob_pt_64_cdf);
+ RESET_COUNTER_3D(eob_pt_128_cdf);
+ RESET_COUNTER_3D(eob_pt_256_cdf);
+ RESET_COUNTER_2D(eob_pt_512_cdf);
+ RESET_COUNTER_2D(eob_pt_1024_cdf);
+ RESET_COUNTER_4D(eob_extra_cdf);
+ RESET_COUNTER_4D(coeff_base_eob_cdf);
+ RESET_COUNTER_4D(coeff_base_cdf);
+ RESET_COUNTER_4D(coeff_base_range_cdf);
+ RESET_COUNTER_3D(dc_sign_cdf);
+ RESET_COUNTER_1D(restoration_type_cdf);
+ RESET_COUNTER_1D(use_wiener_cdf);
+ RESET_COUNTER_1D(use_sgrproj_cdf);
+ RESET_COUNTER_3D(has_palette_y_cdf);
+ RESET_COUNTER_2D(palette_y_size_cdf);
+ RESET_COUNTER_2D(has_palette_uv_cdf);
+ RESET_COUNTER_2D(palette_uv_size_cdf);
+ ResetPaletteColorIndexCounters(this);
+ RESET_COUNTER_2D(is_inter_cdf);
+ RESET_COUNTER_2D(use_compound_reference_cdf);
+ RESET_COUNTER_2D(compound_reference_type_cdf);
+ RESET_COUNTER_4D(compound_reference_cdf);
+ RESET_COUNTER_3D(compound_backward_reference_cdf);
+ RESET_COUNTER_3D(single_reference_cdf);
+ RESET_COUNTER_2D(compound_prediction_mode_cdf);
+ RESET_COUNTER_2D(new_mv_cdf);
+ RESET_COUNTER_2D(zero_mv_cdf);
+ RESET_COUNTER_2D(reference_mv_cdf);
+ RESET_COUNTER_2D(ref_mv_index_cdf);
+ RESET_COUNTER_2D(is_inter_intra_cdf);
+ RESET_COUNTER_2D(inter_intra_mode_cdf);
+ RESET_COUNTER_2D(is_wedge_inter_intra_cdf);
+ RESET_COUNTER_2D(wedge_index_cdf);
+ RESET_COUNTER_2D(use_obmc_cdf);
+ RESET_COUNTER_2D(motion_mode_cdf);
+ RESET_COUNTER_2D(is_explicit_compound_type_cdf);
+ RESET_COUNTER_2D(is_compound_type_average_cdf);
+ RESET_COUNTER_2D(compound_type_cdf);
+ RESET_COUNTER_2D(interpolation_filter_cdf);
+ RESET_COUNTER_2D(mv_joint_cdf);
+ RESET_COUNTER_3D(mv_sign_cdf);
+ RESET_COUNTER_3D(mv_class_cdf);
+ RESET_COUNTER_3D(mv_class0_bit_cdf);
+ RESET_COUNTER_4D(mv_class0_fraction_cdf);
+ RESET_COUNTER_3D(mv_class0_high_precision_cdf);
+ RESET_COUNTER_4D(mv_bit_cdf);
+ RESET_COUNTER_3D(mv_fraction_cdf);
+ RESET_COUNTER_3D(mv_high_precision_cdf);
+}
+
+#undef RESET_COUNTER_1D
+#undef RESET_COUNTER_2D
+#undef RESET_COUNTER_3D
+#undef RESET_COUNTER_4D
+
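+// 8x8 blocks (block_size_log2 == 1) only allow the first kPartitionSplit + 1
+// partition types, and 128x128 blocks (block_size_log2 == 5) allow every type
+// except the 4:1 splits; all other sizes use all kMaxPartitionTypes.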
+int SymbolDecoderContext::PartitionCdfSize(int block_size_log2) {
+ assert(block_size_log2 > 0);
+ assert(block_size_log2 < 6);
+
+ switch (block_size_log2) {
+ case 1:
+ return kPartitionSplit + 1;
+ case 5:
+ return kPartitionVerticalWithRightSplit + 1;
+ default:
+ return kMaxPartitionTypes;
+ }
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
+#define LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+enum {
+ kPartitionContexts = 4,
+ kSegmentIdContexts = 3,
+ kUsePredictedSegmentIdContexts = 3,
+ kSkipContexts = 3,
+ kSkipModeContexts = 3,
+ kBooleanFieldCdfSize = 3,
+ kDeltaSymbolCount = 4, // Used for both delta_q and delta_lf.
+ kIntraModeContexts = 5,
+ kYModeContexts = 4,
+ kAngleDeltaSymbolCount = 2 * kMaxAngleDelta + 1,
+ kCflAlphaSignsSymbolCount = 8,
+ kCflAlphaContexts = 6,
+ kCflAlphaSymbolCount = 16,
+ kTxDepthContexts = 3,
+ kMaxTxDepthSymbolCount = 3,
+ kTxSplitContexts = 21,
+ kCoefficientQuantizerContexts = 4,
+ kNumSquareTransformSizes = 5,
+ kAllZeroContexts = 13,
+ kNumExtendedTransformSizes = 4,
+ kEobPtContexts = 2,
+ kEobPt16SymbolCount = 5,
+ kEobPt32SymbolCount = 6,
+ kEobPt64SymbolCount = 7,
+ kEobPt128SymbolCount = 8,
+ kEobPt256SymbolCount = 9,
+ kEobPt512SymbolCount = 10,
+ kEobPt1024SymbolCount = 11,
+ kEobExtraContexts = 9,
+ kCoeffBaseEobContexts = 4,
+ kCoeffBaseEobSymbolCount = 3,
+ kCoeffBaseContexts = 42,
+ kCoeffBaseSymbolCount = 4,
+ kCoeffBaseRangeContexts = 21,
+ kCoeffBaseRangeSymbolCount = 4,
+ kDcSignContexts = 3,
+ kPaletteBlockSizeContexts = 7,
+ kPaletteYModeContexts = 3,
+ kPaletteUVModeContexts = 2,
+ kPaletteSizeSymbolCount = 7,
+ kPaletteColorIndexContexts = 5,
+ kPaletteColorIndexSymbolCount = 8,
+ kIsInterContexts = 4,
+ kUseCompoundReferenceContexts = 5,
+ kCompoundReferenceTypeContexts = 5,
+ kReferenceContexts = 3,
+ kCompoundPredictionModeContexts = 8,
+ kNewMvContexts = 6,
+ kZeroMvContexts = 2,
+ kReferenceMvContexts = 6,
+ kRefMvIndexContexts = 3,
+ kInterIntraContexts = 3,
+ kWedgeIndexSymbolCount = 16,
+ kIsExplicitCompoundTypeContexts = 6,
+ kIsCompoundTypeAverageContexts = 6,
+ kInterpolationFilterContexts = 16,
+ kMvContexts = 2,
+ kMvClassSymbolCount = 11,
+ kMvFractionSymbolCount = 4,
+ kMvBitSymbolCount = 10,
+ kNumMvComponents = 2,
+}; // anonymous enum
+
+struct SymbolDecoderContext {
+ SymbolDecoderContext() = default;
+ explicit SymbolDecoderContext(int base_quantizer_index) {
+ Initialize(base_quantizer_index);
+ }
+
+ void Initialize(int base_quantizer_index);
+
+  // Partition-related variables and functions.
+ static int PartitionCdfSize(int block_size_log2);
+
+ // Returns the cdf array index for inter_tx_type or intra_tx_type based on
+ // |tx_set|.
+ static int TxTypeIndex(TransformSet tx_set) {
+ assert(tx_set != kTransformSetDctOnly);
+ switch (tx_set) {
+ case kTransformSetInter1:
+ case kTransformSetIntra1:
+ return 0;
+ case kTransformSetInter2:
+ case kTransformSetIntra2:
+ return 1;
+ case kTransformSetInter3:
+ return 2;
+ default:
+ return -1;
+ }
+ }
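+
+  // For illustration (hypothetical call site): an inter block coded with
+  // kTransformSetInter2 reads its CDF from
+  // inter_tx_type_cdf[TxTypeIndex(kTransformSetInter2)], i.e. row 1.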
+
+ // Resets the intra_frame_y_mode_cdf array to the default.
+ void ResetIntraFrameYModeCdf();
+
+  // Resets the symbol counters of all the CDF arrays to zero. The symbol
+  // counter is the last used element in the innermost dimension of each CDF
+  // array.
+ void ResetCounters();
+
+  // Note: aligning to kMaxAlignment allows aligned loads/stores to be used in
+  // the copies done in Initialize().
+ alignas(kMaxAlignment) uint16_t
+ partition_cdf[kBlockWidthCount][kPartitionContexts]
+ [kMaxPartitionTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ segment_id_cdf[kSegmentIdContexts][kMaxSegments + 1];
+ alignas(kMaxAlignment) uint16_t
+ use_predicted_segment_id_cdf[kUsePredictedSegmentIdContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t skip_cdf[kSkipContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ skip_mode_cdf[kSkipModeContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t delta_q_cdf[kDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t delta_lf_cdf[kDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ delta_lf_multi_cdf[kFrameLfCount][kDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t intra_block_copy_cdf[kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ intra_frame_y_mode_cdf[kIntraModeContexts][kIntraModeContexts]
+ [kIntraPredictionModesY + 1];
+ alignas(kMaxAlignment) uint16_t
+ y_mode_cdf[kYModeContexts][kIntraPredictionModesY + 1];
+ alignas(kMaxAlignment) uint16_t
+ angle_delta_cdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ uv_mode_cdf[kBooleanSymbolCount][kIntraPredictionModesY]
+ [kIntraPredictionModesUV + 1];
+ alignas(kMaxAlignment) uint16_t
+ cfl_alpha_signs_cdf[kCflAlphaSignsSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ cfl_alpha_cdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ use_filter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ filter_intra_mode_cdf[kNumFilterIntraPredictors + 1];
+ alignas(kMaxAlignment) uint16_t
+ tx_depth_cdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ tx_split_cdf[kTxSplitContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ all_zero_cdf[kNumSquareTransformSizes][kAllZeroContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ inter_tx_type_cdf[3][kNumExtendedTransformSizes][kNumTransformTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ intra_tx_type_cdf[2][kNumExtendedTransformSizes][kIntraPredictionModesY]
+ [kNumTransformTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_16_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt16SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_32_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt32SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_64_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt64SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_128_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt128SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_256_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt256SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_512_cdf[kNumPlaneTypes][kEobPt512SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_1024_cdf[kNumPlaneTypes][kEobPt1024SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_extra_cdf[kNumSquareTransformSizes][kNumPlaneTypes][kEobExtraContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ coeff_base_eob_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseEobContexts][kCoeffBaseEobSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ coeff_base_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ coeff_base_range_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ dc_sign_cdf[kNumPlaneTypes][kDcSignContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ restoration_type_cdf[kRestorationTypeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t use_wiener_cdf[kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t use_sgrproj_cdf[kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ has_palette_y_cdf[kPaletteBlockSizeContexts][kPaletteYModeContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ palette_y_size_cdf[kPaletteBlockSizeContexts]
+ [kPaletteSizeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ has_palette_uv_cdf[kPaletteUVModeContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ palette_uv_size_cdf[kPaletteBlockSizeContexts]
+ [kPaletteSizeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ palette_color_index_cdf[kNumPlaneTypes][kPaletteSizeSymbolCount]
+ [kPaletteColorIndexContexts]
+ [kPaletteColorIndexSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ is_inter_cdf[kIsInterContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ use_compound_reference_cdf[kUseCompoundReferenceContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_reference_type_cdf[kCompoundReferenceTypeContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_reference_cdf[kNumCompoundReferenceTypes][kReferenceContexts][3]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_backward_reference_cdf[kReferenceContexts][2]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ single_reference_cdf[kReferenceContexts][6][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_prediction_mode_cdf[kCompoundPredictionModeContexts]
+ [kNumCompoundInterPredictionModes + 1];
+ alignas(kMaxAlignment) uint16_t
+ new_mv_cdf[kNewMvContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ zero_mv_cdf[kZeroMvContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ reference_mv_cdf[kReferenceMvContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ ref_mv_index_cdf[kRefMvIndexContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ is_inter_intra_cdf[kInterIntraContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ inter_intra_mode_cdf[kInterIntraContexts][kNumInterIntraModes + 1];
+ alignas(kMaxAlignment) uint16_t
+ is_wedge_inter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ wedge_index_cdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ use_obmc_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ motion_mode_cdf[kMaxBlockSizes][kNumMotionModes + 1];
+ alignas(kMaxAlignment) uint16_t
+ is_explicit_compound_type_cdf[kIsExplicitCompoundTypeContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ is_compound_type_average_cdf[kIsCompoundTypeAverageContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_type_cdf[kMaxBlockSizes]
+ [kNumExplicitCompoundPredictionTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ interpolation_filter_cdf[kInterpolationFilterContexts]
+ [kNumExplicitInterpolationFilters + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_joint_cdf[kMvContexts][kNumMvJointTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_sign_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ mv_class_cdf[kMvContexts][kNumMvComponents][kMvClassSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_class0_bit_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ mv_class0_fraction_cdf[kMvContexts][kNumMvComponents][kBooleanSymbolCount]
+ [kMvFractionSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_class0_high_precision_cdf[kMvContexts][kNumMvComponents]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ mv_bit_cdf[kMvContexts][kNumMvComponents][kMvBitSymbolCount]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t mv_fraction_cdf[kMvContexts][kNumMvComponents]
+ [kMvFractionSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_high_precision_cdf[kMvContexts][kNumMvComponents]
+ [kBooleanFieldCdfSize];
+};
+
+} // namespace libgav1
+#endif // LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is a convenience to keep all the CDF constant definitions
+// separate from the symbol decoder context functions. It is #included into
+// the anonymous namespace of symbol_decoder_context.cc.
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultPartitionCdf
+ [kBlockWidthCount][kPartitionContexts][kMaxPartitionTypes + 1] = {
+ // width 8
+ {{13636, 7258, 2376, 0, 0},
+ {18840, 12913, 4228, 0, 0},
+ {20246, 9089, 4139, 0, 0},
+ {22872, 13985, 6915, 0, 0}},
+ // width 16
+ {{17171, 11839, 8197, 6062, 5104, 3947, 3167, 2197, 866, 0, 0},
+ {24843, 21725, 15983, 10298, 8797, 7725, 6117, 4067, 2934, 0, 0},
+ {27354, 19499, 17657, 12280, 10408, 8268, 7231, 6432, 651, 0, 0},
+ {30106, 26406, 24154, 11908, 9715, 7990, 6332, 4939, 1597, 0, 0}},
+ // width 32
+ {{14306, 11848, 9644, 5121, 4541, 3719, 3249, 2590, 1224, 0, 0},
+ {25079, 23708, 20712, 7776, 7108, 6586, 5817, 4727, 3716, 0, 0},
+ {26753, 23759, 22706, 8224, 7359, 6223, 5697, 5242, 721, 0, 0},
+ {31374, 30560, 29972, 4154, 3707, 3302, 2928, 2583, 869, 0, 0}},
+ // width 64
+ {{12631, 11221, 9690, 3202, 2931, 2507, 2244, 1876, 1044, 0, 0},
+ {26036, 25278, 23271, 4824, 4518, 4253, 3799, 3138, 2664, 0, 0},
+ {26823, 25105, 24420, 4085, 3651, 3019, 2704, 2470, 530, 0, 0},
+ {31898, 31556, 31281, 1570, 1374, 1194, 1025, 887, 436, 0, 0}},
+ // width 128
+ {{4869, 4549, 4239, 284, 229, 149, 129, 0, 0},
+ {26161, 25778, 24500, 708, 549, 430, 397, 0, 0},
+ {27339, 26092, 25646, 741, 541, 237, 186, 0, 0},
+ {32057, 31802, 31596, 320, 230, 151, 104, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultSegmentIdCdf[kSegmentIdContexts][kMaxSegments + 1] = {
+ {27146, 24875, 16675, 14535, 4959, 4395, 235, 0, 0},
+ {18494, 14538, 10211, 7833, 2788, 1917, 424, 0, 0},
+ {5241, 4281, 4045, 3878, 371, 121, 89, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUsePredictedSegmentIdCdf[kUsePredictedSegmentIdContexts]
+ [kBooleanFieldCdfSize] = {{16384, 0, 0},
+ {16384, 0, 0},
+ {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultSkipCdf[kSkipContexts][kBooleanFieldCdfSize] = {
+ {1097, 0, 0}, {16253, 0, 0}, {28192, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultSkipModeCdf[kSkipModeContexts][kBooleanFieldCdfSize] = {
+ {147, 0, 0}, {12060, 0, 0}, {24641, 0, 0}};
+
+// This constant is also used for DeltaLf and DeltaLfMulti.
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultDeltaQCdf[kDeltaSymbolCount + 1] = {4608, 648, 91, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIntraBlockCopyCdf[kBooleanFieldCdfSize] = {2237, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIntraFrameYModeCdf[kIntraModeContexts][kIntraModeContexts]
+ [kIntraPredictionModesY + 1] = {
+ {{17180, 15741, 13430, 12550, 12086, 11658,
+ 10943, 9524, 8579, 4603, 3675, 2302, 0, 0},
+ {20752, 14702, 13252, 12465, 12049, 11324,
+ 10880, 9736, 8334, 4110, 2596, 1359, 0, 0},
+ {22716, 21997, 10472, 9980, 9713, 9529, 8635,
+ 7148, 6608, 3432, 2839, 1201, 0, 0},
+ {18677, 17362, 16326, 13960, 13632, 13222,
+ 12770, 10672, 8022, 3183, 1810, 306, 0, 0},
+ {20646, 19503, 17165, 16267, 14159, 12735,
+ 10377, 7185, 6331, 2507, 1695, 293, 0, 0}},
+ {{22745, 13183, 11920, 11328, 10936, 10008,
+ 9679, 8745, 7387, 3754, 2286, 1332, 0, 0},
+ {26785, 8669, 8208, 7882, 7702, 6973, 6855,
+ 6345, 5158, 2863, 1492, 974, 0, 0},
+ {25324, 19987, 12591, 12040, 11691, 11161,
+ 10598, 9363, 8299, 4853, 3678, 2276, 0, 0},
+ {24231, 18079, 17336, 15681, 15360, 14596,
+ 14360, 12943, 8119, 3615, 1672, 558, 0, 0},
+ {25225, 18537, 17272, 16573, 14863, 12051,
+ 10784, 8252, 6767, 3093, 1787, 774, 0, 0}},
+ {{20155, 19177, 11385, 10764, 10456, 10191,
+ 9367, 7713, 7039, 3230, 2463, 691, 0, 0},
+ {23081, 19298, 14262, 13538, 13164, 12621,
+ 12073, 10706, 9549, 5025, 3557, 1861, 0, 0},
+ {26585, 26263, 6744, 6516, 6402, 6334, 5686,
+ 4414, 4213, 2301, 1974, 682, 0, 0},
+ {22050, 21034, 17814, 15544, 15203, 14844,
+ 14207, 11245, 8890, 3793, 2481, 516, 0, 0},
+ {23574, 22910, 16267, 15505, 14344, 13597,
+ 11205, 6807, 6207, 2696, 2031, 305, 0, 0}},
+ {{20166, 18369, 17280, 14387, 13990, 13453,
+ 13044, 11349, 7708, 3072, 1851, 359, 0, 0},
+ {24565, 18947, 18244, 15663, 15329, 14637,
+ 14364, 13300, 7543, 3283, 1610, 426, 0, 0},
+ {24317, 23037, 17764, 15125, 14756, 14343,
+ 13698, 11230, 8163, 3650, 2690, 750, 0, 0},
+ {25054, 23720, 23252, 16101, 15951, 15774,
+ 15615, 14001, 6025, 2379, 1232, 240, 0, 0},
+ {23925, 22488, 21272, 17451, 16116, 14825,
+ 13660, 10050, 6999, 2815, 1785, 283, 0, 0}},
+ {{20190, 19097, 16789, 15934, 13693, 11855,
+ 9779, 7319, 6549, 2554, 1618, 291, 0, 0},
+ {23205, 19142, 17688, 16876, 15012, 11905,
+ 10561, 8532, 7388, 3115, 1625, 491, 0, 0},
+ {24412, 23867, 15152, 14512, 13418, 12662,
+ 10170, 6821, 6302, 2868, 2245, 507, 0, 0},
+ {21933, 20953, 19644, 16726, 15750, 14729,
+ 13821, 10015, 8153, 3279, 1885, 286, 0, 0},
+ {25150, 24480, 22909, 22259, 17382, 14111,
+ 9865, 3992, 3588, 1413, 966, 175, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultYModeCdf[kYModeContexts][kIntraPredictionModesY + 1] = {
+ {9967, 9279, 8475, 8012, 7167, 6645, 6162, 5350, 4823, 3540, 3083, 2419,
+ 0, 0},
+ {14095, 12923, 10137, 9450, 8818, 8119, 7241, 5404, 4616, 3067, 2784,
+ 1916, 0, 0},
+ {12998, 11789, 9372, 8829, 8527, 8114, 7632, 5695, 4938, 3408, 3038,
+ 2109, 0, 0},
+ {12613, 11467, 9930, 9590, 9507, 9235, 9065, 7964, 7416, 6193, 5752,
+ 4719, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultAngleDeltaCdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1] =
+ {{30588, 27736, 25201, 9992, 5779, 2551, 0, 0},
+ {30467, 27160, 23967, 9281, 5794, 2438, 0, 0},
+ {28988, 21750, 19069, 13414, 9685, 1482, 0, 0},
+ {28187, 21542, 17621, 15630, 10934, 4371, 0, 0},
+ {31031, 21841, 18259, 13180, 10023, 3945, 0, 0},
+ {30104, 22592, 20283, 15118, 11168, 2273, 0, 0},
+ {30528, 21672, 17315, 12427, 10207, 3851, 0, 0},
+ {29163, 22340, 20309, 15092, 11524, 2113, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUVModeCdf[kBooleanSymbolCount][kIntraPredictionModesY]
+ [kIntraPredictionModesUV + 1] = {
+ // CFL not allowed.
+ {{10137, 8616, 7390, 7107, 6782, 6248, 5713, 4845,
+ 4524, 2709, 1827, 807, 0, 0},
+ {23255, 5887, 5795, 5722, 5650, 5104, 5029, 4944,
+ 4409, 3263, 2968, 972, 0, 0},
+ {22923, 22853, 4105, 4064, 4011, 3988, 3570, 2946,
+ 2914, 2004, 991, 739, 0, 0},
+ {19129, 18871, 18597, 7437, 7162, 7041, 6815, 5620,
+ 4191, 2156, 1413, 275, 0, 0},
+ {23004, 22933, 22838, 22814, 7382, 5715, 4810, 4620,
+ 4525, 1667, 1024, 405, 0, 0},
+ {20943, 19179, 19091, 19048, 17720, 3555, 3467, 3310,
+ 3057, 1607, 1327, 218, 0, 0},
+ {18593, 18369, 16160, 15947, 15050, 14993, 4217, 2568,
+ 2523, 931, 426, 101, 0, 0},
+ {19883, 19730, 17790, 17178, 17095, 17020, 16592,
+ 3640, 3501, 2125, 807, 307, 0, 0},
+ {20742, 19107, 18894, 17463, 17278, 17042, 16773,
+ 16495, 4325, 2380, 2001, 352, 0, 0},
+ {13716, 12928, 12189, 11852, 11618, 11301, 10883,
+ 10049, 9594, 3907, 2389, 593, 0, 0},
+ {14141, 13119, 11794, 11549, 11276, 10952, 10569,
+ 9649, 9241, 5715, 1371, 620, 0, 0},
+ {15742, 13764, 12771, 12429, 12182, 11665, 11419,
+ 10861, 10286, 6872, 6227, 949, 0, 0},
+ {20644, 19009, 17809, 17776, 17761, 17717, 17690,
+ 17602, 17513, 17015, 16729, 16162, 0, 0}},
+ // CFL allowed.
+ {{22361, 21560, 19868, 19587, 18945, 18593, 17869,
+ 17112, 16782, 12682, 11773, 10313, 8556, 0, 0},
+ {28236, 12988, 12711, 12553, 12340, 11697, 11569,
+ 11317, 10669, 8540, 8075, 5736, 3296, 0, 0},
+ {27495, 27389, 12591, 12498, 12383, 12329, 11819,
+ 11073, 10994, 9630, 8512, 8065, 6089, 0, 0},
+ {26028, 25601, 25106, 18616, 18232, 17983, 17734,
+ 16027, 14397, 11248, 10562, 9379, 8586, 0, 0},
+ {27781, 27400, 26840, 26700, 13654, 12453, 10911,
+ 10515, 10357, 7857, 7388, 6741, 6392, 0, 0},
+ {27398, 25879, 25521, 25375, 23270, 11654, 11366,
+ 11015, 10787, 7988, 7382, 6251, 5592, 0, 0},
+ {27952, 27807, 25564, 25442, 24003, 23838, 12599,
+ 12086, 11965, 9580, 9005, 8313, 7828, 0, 0},
+ {26160, 26028, 24239, 23719, 23511, 23412, 23033,
+ 13941, 13709, 10432, 9564, 8804, 7975, 0, 0},
+ {26770, 25349, 24987, 23835, 23513, 23219, 23015,
+ 22351, 13870, 10274, 9629, 8004, 6779, 0, 0},
+ {22108, 21470, 20218, 19811, 19446, 19144, 18728,
+ 17764, 17234, 12054, 10979, 9325, 7907, 0, 0},
+ {22246, 21238, 20216, 19805, 19390, 18989, 18523,
+ 17533, 16866, 12666, 10072, 8994, 6930, 0, 0},
+ {22669, 22077, 20129, 19719, 19382, 19103, 18643,
+ 17605, 17132, 13092, 12294, 9249, 7560, 0, 0},
+ {29624, 27681, 25386, 25264, 25175, 25078, 24967,
+ 24704, 24536, 23520, 22893, 22247, 3720, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCflAlphaSignsCdf[kCflAlphaSignsSymbolCount + 1] = {
+ 31350, 30645, 19428, 14363, 5796, 4425, 474, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCflAlphaCdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1] = {
+ {25131, 12049, 1367, 287, 111, 80, 76, 72, 68, 64, 60, 56, 52, 48, 44,
+ 0, 0},
+ {18403, 9165, 4633, 1600, 601, 373, 281, 195, 148, 121, 100, 96, 92, 88,
+ 84, 0, 0},
+ {21236, 10388, 4323, 1408, 419, 245, 184, 119, 95, 91, 87, 83, 79, 75,
+ 71, 0, 0},
+ {5778, 1366, 486, 197, 76, 72, 68, 64, 60, 56, 52, 48, 44, 40, 36, 0,
+ 0},
+ {15520, 6710, 3864, 2160, 1463, 891, 642, 447, 374, 304, 252, 208, 192,
+ 175, 146, 0, 0},
+ {18030, 11090, 6989, 4867, 3744, 2466, 1788, 925, 624, 355, 248, 174,
+ 146, 112, 108, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseFilterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
+ {28147, 0, 0}, {26025, 0, 0}, {19998, 0, 0}, {26875, 0, 0},
+ {24902, 0, 0}, {20217, 0, 0}, {12539, 0, 0}, {22400, 0, 0},
+ {23374, 0, 0}, {20360, 0, 0}, {18467, 0, 0}, {16384, 0, 0},
+ {14667, 0, 0}, {20012, 0, 0}, {10425, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultFilterIntraModeCdf[kNumFilterIntraPredictors + 1] = {
+ 23819, 19992, 15557, 3210, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultTxDepthCdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1] = {
+ {{12800, 0, 0}, {12800, 0, 0}, {8448, 0, 0}},
+ {{20496, 2596, 0, 0}, {20496, 2596, 0, 0}, {14091, 1920, 0, 0}},
+ {{19782, 17588, 0, 0}, {19782, 17588, 0, 0}, {8466, 7166, 0, 0}},
+ {{26986, 21293, 0, 0}, {26986, 21293, 0, 0}, {15965, 10009, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultTxSplitCdf[kTxSplitContexts][kBooleanFieldCdfSize] = {
+ {4187, 0, 0}, {8922, 0, 0}, {11921, 0, 0}, {8453, 0, 0},
+ {14572, 0, 0}, {20635, 0, 0}, {13977, 0, 0}, {21881, 0, 0},
+ {21763, 0, 0}, {5589, 0, 0}, {12764, 0, 0}, {21487, 0, 0},
+ {6219, 0, 0}, {13460, 0, 0}, {18544, 0, 0}, {4753, 0, 0},
+ {11222, 0, 0}, {18368, 0, 0}, {4603, 0, 0}, {10367, 0, 0},
+ {16680, 0, 0}};
+
+/* clang-format off */
+alignas(kMaxAlignment) constexpr uint16_t kDefaultAllZeroCdf[kCoefficientQuantizerContexts]
+ [kNumSquareTransformSizes][kAllZeroContexts]
+ [kBooleanFieldCdfSize] = {
+ {
+ {{919, 0, 0}, {26876, 0, 0}, {20656, 0, 0}, {10833, 0, 0}, {12479, 0, 0},
+ {5295, 0, 0}, {281, 0, 0}, {25114, 0, 0}, {13295, 0, 0}, {2784, 0, 0},
+ {22807, 0, 0}, {2526, 0, 0}, {651, 0, 0}},
+ {{1220, 0, 0}, {31219, 0, 0}, {22638, 0, 0}, {16112, 0, 0}, {14177, 0, 0},
+ {6460, 0, 0}, {231, 0, 0}, {27365, 0, 0}, {14672, 0, 0}, {2765, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{2811, 0, 0}, {27377, 0, 0}, {14729, 0, 0}, {9202, 0, 0}, {10337, 0, 0},
+ {6946, 0, 0}, {571, 0, 0}, {28990, 0, 0}, {17432, 0, 0}, {3787, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{14848, 0, 0}, {30950, 0, 0}, {25486, 0, 0}, {7495, 0, 0}, {21845, 0, 0},
+ {1214, 0, 0}, {144, 0, 0}, {31402, 0, 0}, {17140, 0, 0}, {2306, 0, 0},
+ {32622, 0, 0}, {27636, 0, 0}, {1111, 0, 0}},
+ {{26460, 0, 0}, {32651, 0, 0}, {31130, 0, 0}, {30607, 0, 0}, {16384, 0, 0},
+ {21845, 0, 0}, {2521, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+ },
+ {
+ {{2397, 0, 0}, {25198, 0, 0}, {19613, 0, 0}, {12017, 0, 0}, {11799, 0, 0},
+ {5701, 0, 0}, {755, 0, 0}, {27273, 0, 0}, {14826, 0, 0}, {4488, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{986, 0, 0}, {30932, 0, 0}, {22079, 0, 0}, {15164, 0, 0}, {11146, 0, 0},
+ {5250, 0, 0}, {369, 0, 0}, {28349, 0, 0}, {16474, 0, 0}, {4423, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{867, 0, 0}, {22457, 0, 0}, {14721, 0, 0}, {7962, 0, 0}, {9480, 0, 0},
+ {4854, 0, 0}, {472, 0, 0}, {28553, 0, 0}, {17012, 0, 0}, {4427, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{6042, 0, 0}, {31723, 0, 0}, {21065, 0, 0}, {12178, 0, 0}, {14214, 0, 0},
+ {6798, 0, 0}, {830, 0, 0}, {27185, 0, 0}, {11455, 0, 0}, {3378, 0, 0},
+ {32127, 0, 0}, {10503, 0, 0}, {1316, 0, 0}},
+ {{6184, 0, 0}, {32580, 0, 0}, {23921, 0, 0}, {8249, 0, 0}, {9830, 0, 0},
+ {2185, 0, 0}, {160, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+ },
+ {
+ {{3154, 0, 0}, {23700, 0, 0}, {19844, 0, 0}, {13230, 0, 0}, {15031, 0, 0},
+ {8149, 0, 0}, {2126, 0, 0}, {28649, 0, 0}, {16742, 0, 0}, {7111, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{811, 0, 0}, {29538, 0, 0}, {21615, 0, 0}, {14645, 0, 0}, {12625, 0, 0},
+ {6232, 0, 0}, {782, 0, 0}, {29718, 0, 0}, {18165, 0, 0}, {7613, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{405, 0, 0}, {22076, 0, 0}, {13678, 0, 0}, {8411, 0, 0}, {8326, 0, 0},
+ {4456, 0, 0}, {599, 0, 0}, {29120, 0, 0}, {17078, 0, 0}, {5953, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{2099, 0, 0}, {28936, 0, 0}, {21105, 0, 0}, {13879, 0, 0}, {12986, 0, 0},
+ {9455, 0, 0}, {1438, 0, 0}, {27644, 0, 0}, {14049, 0, 0}, {4300, 0, 0},
+ {29686, 0, 0}, {11786, 0, 0}, {3325, 0, 0}},
+ {{4195, 0, 0}, {29585, 0, 0}, {14966, 0, 0}, {6791, 0, 0}, {6091, 0, 0},
+ {4936, 0, 0}, {381, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+ },
+ {
+ {{5881, 0, 0}, {26039, 0, 0}, {22407, 0, 0}, {15326, 0, 0}, {17723, 0, 0},
+ {10290, 0, 0}, {3696, 0, 0}, {30055, 0, 0}, {20907, 0, 0}, {11995, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{865, 0, 0}, {30724, 0, 0}, {25240, 0, 0}, {18150, 0, 0}, {16586, 0, 0},
+ {8600, 0, 0}, {1731, 0, 0}, {29982, 0, 0}, {21574, 0, 0}, {12613, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{258, 0, 0}, {24338, 0, 0}, {15450, 0, 0}, {8614, 0, 0}, {9094, 0, 0},
+ {3979, 0, 0}, {629, 0, 0}, {29328, 0, 0}, {19651, 0, 0}, {10066, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}},
+ {{1097, 0, 0}, {30712, 0, 0}, {21022, 0, 0}, {15916, 0, 0}, {14133, 0, 0},
+ {8053, 0, 0}, {1284, 0, 0}, {28112, 0, 0}, {16694, 0, 0}, {8064, 0, 0},
+ {30962, 0, 0}, {18123, 0, 0}, {7432, 0, 0}},
+ {{1229, 0, 0}, {24335, 0, 0}, {12192, 0, 0}, {4864, 0, 0}, {4916, 0, 0},
+ {2742, 0, 0}, {327, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}
+ }
+};
+/* clang-format on */
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultInterTxTypeCdf[3][kNumExtendedTransformSizes][kNumTransformTypes +
+ 1] = {
+ {{28310, 27208, 25073, 23059, 19438, 17979, 15231, 12502, 11264, 9920,
+ 8834, 7294, 5041, 3853, 2137, 0, 0},
+ {31123, 30195, 27990, 27057, 24961, 24146, 22246, 17411, 15094, 12360,
+ 10251, 7758, 5652, 3912, 2019, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0}},
+ {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ // Only 16x16 is used in this case.
+ {31998, 30347, 27543, 19861, 16949, 13841, 11207, 8679, 6173, 4242,
+ 2239, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
+ {{16384, 0, 0}, {28601, 0, 0}, {30770, 0, 0}, {32020, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultIntraTxTypeCdf
+ [2][kNumExtendedTransformSizes][kIntraPredictionModesY]
+ [kNumTransformTypes + 1] = {
+ {{{31233, 24733, 23307, 20017, 9301, 4943, 0, 0},
+ {32204, 29433, 23059, 21898, 14625, 4674, 0, 0},
+ {32096, 29521, 29092, 20786, 13353, 9641, 0, 0},
+ {27489, 18883, 17281, 14724, 9241, 2516, 0, 0},
+ {28345, 26694, 24783, 22352, 7075, 3470, 0, 0},
+ {31282, 28527, 23308, 22106, 16312, 5074, 0, 0},
+ {32329, 29930, 29246, 26031, 14710, 9014, 0, 0},
+ {31578, 28535, 27913, 21098, 12487, 8391, 0, 0},
+ {31723, 28456, 24121, 22609, 14124, 3433, 0, 0},
+ {32566, 29034, 28021, 25470, 15641, 8752, 0, 0},
+ {32321, 28456, 25949, 23884, 16758, 8910, 0, 0},
+ {32491, 28399, 27513, 23863, 16303, 10497, 0, 0},
+ {29359, 27332, 22169, 17169, 13081, 8728, 0, 0}},
+ {{30898, 19026, 18238, 16270, 8998, 5070, 0, 0},
+ {32442, 23972, 18136, 17689, 13496, 5282, 0, 0},
+ {32284, 25192, 25056, 18325, 13609, 10177, 0, 0},
+ {31642, 17428, 16873, 15745, 11872, 2489, 0, 0},
+ {32113, 27914, 27519, 26855, 10669, 5630, 0, 0},
+ {31469, 26310, 23883, 23478, 17917, 7271, 0, 0},
+ {32457, 27473, 27216, 25883, 16661, 10096, 0, 0},
+ {31885, 24709, 24498, 21510, 15479, 11219, 0, 0},
+ {32027, 25188, 23450, 22423, 16080, 3722, 0, 0},
+ {32658, 25362, 24853, 23573, 16727, 9439, 0, 0},
+ {32405, 24794, 23411, 22095, 17139, 8294, 0, 0},
+ {32615, 25121, 24656, 22832, 17461, 12772, 0, 0},
+ {29257, 26436, 21603, 17433, 13445, 9174, 0, 0}}},
+ {{{26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0}},
+ {{26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0},
+ {26214, 19661, 13107, 6554, 0, 0}},
+ {{31641, 19954, 9996, 5285, 0, 0},
+ {32623, 26007, 20788, 6101, 0, 0},
+ {32406, 26881, 21090, 16043, 0, 0},
+ {32383, 17555, 14181, 2075, 0, 0},
+ {32743, 29854, 9634, 4865, 0, 0},
+ {32708, 28298, 21019, 8777, 0, 0},
+ {32731, 29436, 18257, 11320, 0, 0},
+ {32611, 26448, 19732, 15329, 0, 0},
+ {32649, 26049, 19862, 3372, 0, 0},
+ {32721, 27231, 20192, 11269, 0, 0},
+ {32499, 26692, 21510, 9653, 0, 0},
+ {32685, 27153, 20767, 15540, 0, 0},
+ {30800, 27212, 20745, 14221, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultEobPt16Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kEobPtContexts][kEobPt16SymbolCount + 1] = {
+ {{{31928, 31729, 30788, 27873, 0, 0},
+ {32398, 32097, 30885, 28297, 0, 0}},
+ {{29521, 27818, 23080, 18205, 0, 0},
+ {30864, 29414, 25005, 18121, 0, 0}}},
+ {{{30643, 30217, 27603, 23822, 0, 0},
+ {32255, 32003, 30909, 26429, 0, 0}},
+ {{25131, 23270, 18509, 13660, 0, 0},
+ {30271, 28672, 23902, 15775, 0, 0}}},
+ {{{28752, 27871, 23887, 17800, 0, 0},
+ {32052, 31663, 30122, 22712, 0, 0}},
+ {{21629, 19498, 14527, 9202, 0, 0},
+ {29576, 27736, 22471, 13013, 0, 0}}},
+ {{{26060, 23810, 18022, 10635, 0, 0},
+ {31546, 30694, 27985, 17358, 0, 0}},
+ {{13193, 11002, 6724, 3059, 0, 0},
+ {25471, 22001, 13495, 4574, 0, 0}}}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultEobPt32Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kEobPtContexts][kEobPt32SymbolCount + 1] = {
+ {{{32368, 32248, 31791, 30666, 26226, 0, 0},
+ {32558, 32363, 31453, 29442, 25231, 0, 0}},
+ {{30132, 28495, 25180, 20974, 12367, 0, 0},
+ {30982, 29589, 25866, 21411, 13714, 0, 0}}},
+ {{{31779, 31519, 30749, 28617, 21983, 0, 0},
+ {32455, 32327, 31669, 29851, 24206, 0, 0}},
+ {{24374, 22416, 18836, 13913, 6754, 0, 0},
+ {30190, 28644, 24587, 19098, 8534, 0, 0}}},
+ {{{30253, 29765, 28316, 24606, 16727, 0, 0},
+ {32194, 31947, 30932, 27679, 19640, 0, 0}},
+ {{19300, 16465, 12407, 7663, 3487, 0, 0},
+ {29226, 27266, 22353, 16008, 7124, 0, 0}}},
+ {{{28151, 27059, 24322, 19184, 9633, 0, 0},
+ {31612, 31066, 29093, 23494, 12229, 0, 0}},
+ {{10682, 8486, 5758, 2998, 1025, 0, 0},
+ {25069, 21871, 11877, 5842, 1140, 0, 0}}}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultEobPt64Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kEobPtContexts][kEobPt64SymbolCount + 1] = {
+ {{{32439, 32270, 31667, 30984, 29503, 25010, 0, 0},
+ {32433, 32038, 31309, 27274, 24013, 19771, 0, 0}},
+ {{29263, 27464, 22682, 18954, 15084, 9398, 0, 0},
+ {31205, 30068, 27892, 21857, 18062, 10288, 0, 0}}},
+ {{{31508, 31322, 30515, 29056, 26116, 19399, 0, 0},
+ {32367, 32163, 31739, 30205, 26923, 20142, 0, 0}},
+ {{24159, 22156, 18144, 14054, 10154, 3744, 0, 0},
+ {30845, 29641, 26901, 23065, 18491, 5668, 0, 0}}},
+ {{{30394, 29996, 28185, 25492, 20480, 13062, 0, 0},
+ {32271, 31958, 31453, 29768, 25764, 17127, 0, 0}},
+ {{17718, 15642, 11358, 7882, 4612, 2042, 0, 0},
+ {28734, 26478, 22533, 17786, 11554, 4277, 0, 0}}},
+ {{{26461, 25227, 20708, 16410, 10215, 4903, 0, 0},
+ {31479, 30448, 28797, 24842, 18615, 8477, 0, 0}},
+ {{8556, 7060, 4500, 2733, 1461, 719, 0, 0},
+ {24042, 20390, 13359, 6318, 2730, 306, 0, 0}}}};
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt128Cdf
+ [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts]
+ [kEobPt128SymbolCount + 1] = {
+ {{{32549, 32286, 31628, 30677, 29088, 26740, 20182, 0, 0},
+ {32397, 32069, 31514, 27938, 23289, 20206, 15271, 0, 0}},
+ {{27523, 25312, 19888, 16916, 12735, 8836, 5160, 0, 0},
+ {30714, 29296, 26899, 18536, 14526, 12178, 6016, 0, 0}}},
+ {{{32083, 31835, 31280, 30054, 28002, 24206, 13514, 0, 0},
+ {32551, 32416, 32150, 30465, 27507, 22799, 15296, 0, 0}},
+ {{24723, 21568, 17271, 13173, 8820, 5360, 1830, 0, 0},
+ {30458, 28608, 25297, 17771, 14837, 12000, 2528, 0, 0}}},
+ {{{31402, 31030, 30241, 27752, 23413, 16971, 8125, 0, 0},
+ {32414, 32210, 31824, 30008, 25481, 18731, 10989, 0, 0}},
+ {{19141, 16522, 12595, 8339, 4820, 2353, 905, 0, 0},
+ {26493, 22879, 17999, 9604, 4780, 2275, 496, 0, 0}}},
+ {{{29296, 27883, 25279, 20287, 14251, 8232, 3133, 0, 0},
+ {31882, 31037, 29497, 24299, 17199, 10642, 4385, 0, 0}},
+ {{8455, 6706, 4383, 2661, 1551, 870, 423, 0, 0},
+ {23603, 19486, 11618, 2482, 874, 197, 56, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt256Cdf
+ [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts]
+ [kEobPt256SymbolCount + 1] = {
+ {{{32458, 32184, 30881, 29179, 26600, 24157, 21416, 17116, 0, 0},
+ {31770, 30918, 29770, 27164, 15427, 12880, 9869, 7185, 0, 0}},
+ {{30248, 29528, 26816, 23898, 20191, 15210, 12814, 8600, 0, 0},
+ {30565, 28638, 25333, 22029, 12116, 9087, 7159, 5507, 0, 0}}},
+ {{{31320, 30659, 28617, 26505, 23439, 19508, 14824, 9468, 0, 0},
+ {32369, 31749, 31019, 29730, 22324, 17222, 10029, 5474, 0, 0}},
+ {{26366, 24620, 20145, 17696, 14040, 9921, 6321, 3391, 0, 0},
+ {31094, 29516, 27034, 22609, 10371, 8966, 7947, 1828, 0, 0}}},
+ {{{29679, 28848, 26730, 23308, 18502, 12887, 7002, 3592, 0, 0},
+ {31684, 30410, 29280, 27646, 21285, 14665, 6745, 2969, 0, 0}},
+ {{21254, 18974, 15288, 12014, 8407, 5390, 3276, 1491, 0, 0},
+ {26197, 23158, 17252, 10942, 3676, 1939, 926, 60, 0, 0}}},
+ {{{27420, 25655, 20948, 16844, 10662, 5991, 2434, 1011, 0, 0},
+ {30315, 28294, 26461, 23991, 16294, 9793, 3768, 1221, 0, 0}},
+ {{9658, 8171, 5628, 3874, 2601, 1841, 1376, 674, 0, 0},
+ {22770, 15107, 7590, 4671, 1460, 730, 365, 73, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt512Cdf
+ [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPt512SymbolCount + 1] =
+ {{{32127, 31785, 29061, 27338, 22534, 17810, 13980, 9356, 6707, 0, 0},
+ {27673, 26322, 22772, 19414, 16751, 14782, 11849, 6639, 3628, 0, 0}},
+ {{31538, 30490, 27733, 24992, 20897, 17422, 13178, 8184, 4019, 0, 0},
+ {25503, 22789, 16949, 13518, 10988, 8922, 6290, 4372, 957, 0, 0}},
+ {{30144, 28832, 26288, 23082, 18789, 15042, 9501, 4358, 1690, 0, 0},
+ {20753, 17999, 13180, 10716, 8546, 6956, 5468, 3549, 654, 0, 0}},
+ {{26841, 24959, 21845, 18171, 13329, 8633, 4312, 1626, 708, 0, 0},
+ {11675, 9725, 7026, 5110, 3671, 3052, 2695, 1948, 812, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultEobPt1024Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kEobPt1024SymbolCount + 1] = {
+ {{32375, 32347, 32017, 31145, 29608, 26416, 19423,
+ 14721, 10197, 6938, 0, 0},
+ {30903, 30780, 29838, 28526, 22235, 16230, 11414,
+ 5513, 4222, 984, 0, 0}},
+ {{32072, 31820, 29623, 27066, 23062, 19551, 14917,
+ 10912, 7076, 4734, 0, 0},
+ {30096, 29177, 23438, 15684, 10043, 8484, 6241,
+ 4741, 4391, 1892, 0, 0}},
+ {{29984, 28937, 25727, 22247, 17921, 13924, 9613,
+ 6086, 3539, 1723, 0, 0},
+ {23191, 20302, 15029, 12018, 10707, 9553, 8167,
+ 7285, 6925, 712, 0, 0}},
+ {{26070, 24434, 20807, 17006, 12582, 8906, 5334,
+ 3442, 1686, 718, 0, 0},
+ {12199, 10342, 7199, 5909, 4715, 3855, 3282, 3044,
+ 2961, 198, 0, 0}}};
+
+/* clang-format off */
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobExtraCdf[kCoefficientQuantizerContexts]
+ [kNumSquareTransformSizes][kNumPlaneTypes]
+ [kEobExtraContexts][kBooleanFieldCdfSize] = {
+ {
+ {
+ {{15807, 0, 0}, {15545, 0, 0}, {25147, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{13699, 0, 0}, {10243, 0, 0}, {19391, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12367, 0, 0}, {15743, 0, 0}, {19923, 0, 0}, {19895, 0, 0},
+ {18674, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12087, 0, 0}, {12067, 0, 0}, {17518, 0, 0}, {17751, 0, 0},
+ {17840, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{8863, 0, 0}, {15574, 0, 0}, {16598, 0, 0}, {15073, 0, 0},
+ {18942, 0, 0}, {16958, 0, 0}, {20732, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{8809, 0, 0}, {11969, 0, 0}, {13747, 0, 0}, {16565, 0, 0},
+ {14882, 0, 0}, {18624, 0, 0}, {20758, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{5369, 0, 0}, {16441, 0, 0}, {14697, 0, 0}, {13184, 0, 0},
+ {12047, 0, 0}, {14336, 0, 0}, {13208, 0, 0}, {22618, 0, 0},
+ {23963, 0, 0}},
+ {{7836, 0, 0}, {11935, 0, 0}, {20741, 0, 0}, {16098, 0, 0},
+ {12854, 0, 0}, {17662, 0, 0}, {15106, 0, 0}, {18985, 0, 0},
+ {4012, 0, 0}}
+ },
+ {
+ {{9362, 0, 0}, {10923, 0, 0}, {14336, 0, 0}, {16384, 0, 0},
+ {15672, 0, 0}, {20207, 0, 0}, {15448, 0, 0}, {10373, 0, 0},
+ {11398, 0, 0}},
+ {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ }
+ },
+ {
+ {
+ {{15297, 0, 0}, {12545, 0, 0}, {21411, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12433, 0, 0}, {11101, 0, 0}, {17950, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12338, 0, 0}, {12106, 0, 0}, {17401, 0, 0}, {15798, 0, 0},
+ {18111, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{10651, 0, 0}, {10740, 0, 0}, {14118, 0, 0}, {16726, 0, 0},
+ {16883, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{10359, 0, 0}, {11756, 0, 0}, {17118, 0, 0}, {15373, 0, 0},
+ {17299, 0, 0}, {12563, 0, 0}, {13257, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{8548, 0, 0}, {10288, 0, 0}, {15031, 0, 0}, {13852, 0, 0},
+ {13500, 0, 0}, {14356, 0, 0}, {13924, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{6777, 0, 0}, {12454, 0, 0}, {15037, 0, 0}, {13090, 0, 0},
+ {14119, 0, 0}, {15461, 0, 0}, {10970, 0, 0}, {15219, 0, 0},
+ {17138, 0, 0}},
+ {{6183, 0, 0}, {11299, 0, 0}, {12336, 0, 0}, {15033, 0, 0},
+ {13488, 0, 0}, {17533, 0, 0}, {12471, 0, 0}, {10297, 0, 0},
+ {3771, 0, 0}}
+ },
+ {
+ {{6163, 0, 0}, {21464, 0, 0}, {16042, 0, 0}, {16208, 0, 0},
+ {11902, 0, 0}, {9244, 0, 0}, {12890, 0, 0}, {19299, 0, 0},
+ {9684, 0, 0}},
+ {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ }
+ },
+ {
+ {
+ {{13785, 0, 0}, {12256, 0, 0}, {17883, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12678, 0, 0}, {13324, 0, 0}, {15482, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{13629, 0, 0}, {11281, 0, 0}, {13809, 0, 0}, {11858, 0, 0},
+ {13679, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12232, 0, 0}, {12104, 0, 0}, {12143, 0, 0}, {13645, 0, 0},
+ {17906, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12935, 0, 0}, {11266, 0, 0}, {15283, 0, 0}, {12501, 0, 0},
+ {14415, 0, 0}, {9439, 0, 0}, {11290, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{10727, 0, 0}, {9334, 0, 0}, {12767, 0, 0}, {12214, 0, 0},
+ {11817, 0, 0}, {12623, 0, 0}, {17206, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{9456, 0, 0}, {11161, 0, 0}, {16242, 0, 0}, {13811, 0, 0},
+ {14734, 0, 0}, {13834, 0, 0}, {8521, 0, 0}, {15847, 0, 0},
+ {15688, 0, 0}},
+ {{6189, 0, 0}, {7858, 0, 0}, {14131, 0, 0}, {12968, 0, 0},
+ {12380, 0, 0}, {22881, 0, 0}, {17126, 0, 0}, {2570, 0, 0},
+ {8047, 0, 0}}
+ },
+ {
+ {{5770, 0, 0}, {16031, 0, 0}, {14930, 0, 0}, {13846, 0, 0},
+ {13253, 0, 0}, {14132, 0, 0}, {15435, 0, 0}, {16992, 0, 0},
+ {10110, 0, 0}},
+ {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ }
+ },
+ {
+ {
+ {{12591, 0, 0}, {11979, 0, 0}, {12506, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{11352, 0, 0}, {11913, 0, 0}, {9358, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12530, 0, 0}, {11711, 0, 0}, {13609, 0, 0}, {10431, 0, 0},
+ {12609, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{12643, 0, 0}, {12209, 0, 0}, {11061, 0, 0}, {10472, 0, 0},
+ {15435, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{12827, 0, 0}, {12241, 0, 0}, {11298, 0, 0}, {10281, 0, 0},
+ {13210, 0, 0}, {10414, 0, 0}, {12437, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}},
+ {{10016, 0, 0}, {7762, 0, 0}, {10693, 0, 0}, {11192, 0, 0},
+ {15028, 0, 0}, {11078, 0, 0}, {13557, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ },
+ {
+ {{11326, 0, 0}, {10410, 0, 0}, {14265, 0, 0}, {12477, 0, 0},
+ {12823, 0, 0}, {11474, 0, 0}, {11590, 0, 0}, {13368, 0, 0},
+ {22212, 0, 0}},
+ {{8120, 0, 0}, {7819, 0, 0}, {12060, 0, 0}, {8863, 0, 0},
+ {12267, 0, 0}, {23210, 0, 0}, {23345, 0, 0}, {2403, 0, 0},
+ {13515, 0, 0}}
+ },
+ {
+ {{6704, 0, 0}, {10670, 0, 0}, {13155, 0, 0}, {12243, 0, 0},
+ {15173, 0, 0}, {16150, 0, 0}, {12271, 0, 0}, {13779, 0, 0},
+ {17255, 0, 0}},
+ {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}}
+ }
+ }
+};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseEobCdf[kCoefficientQuantizerContexts]
+ [kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseEobContexts]
+ [kCoeffBaseEobSymbolCount + 1] = {
+ {
+ {
+ {{14931, 3713, 0, 0}, {3168, 1322, 0, 0}, {1924, 890, 0, 0},
+ {7842, 3820, 0, 0}},
+ {{11403, 2742, 0, 0}, {2256, 345, 0, 0}, {1110, 147, 0, 0},
+ {3138, 887, 0, 0}}
+ },
+ {
+ {{27051, 6291, 0, 0}, {2277, 1065, 0, 0}, {1218, 610, 0, 0},
+ {3120, 1277, 0, 0}},
+ {{20160, 4948, 0, 0}, {2088, 543, 0, 0}, {1959, 433, 0, 0},
+ {1469, 345, 0, 0}}
+ },
+ {
+ {{30982, 20156, 0, 0}, {2105, 1143, 0, 0}, {429, 300, 0, 0},
+ {1620, 935, 0, 0}},
+ {{13911, 8903, 0, 0}, {1340, 340, 0, 0}, {1024, 395, 0, 0},
+ {993, 242, 0, 0}}
+ },
+ {
+ {{30981, 30236, 0, 0}, {1936, 1106, 0, 0}, {944, 86, 0, 0},
+ {635, 199, 0, 0}},
+ {{19017, 10533, 0, 0}, {679, 359, 0, 0}, {5684, 4848, 0, 0},
+ {3477, 174, 0, 0}}
+ },
+ {
+ {{31043, 29319, 0, 0}, {1666, 833, 0, 0}, {311, 155, 0, 0},
+ {356, 119, 0, 0}},
+ {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}}
+ }
+ },
+ {
+ {
+ {{15208, 2880, 0, 0}, {3097, 1219, 0, 0}, {1761, 712, 0, 0},
+ {5482, 2762, 0, 0}},
+ {{6174, 1556, 0, 0}, {1560, 186, 0, 0}, {933, 131, 0, 0},
+ {2173, 562, 0, 0}}
+ },
+ {
+ {{17529, 2836, 0, 0}, {1453, 673, 0, 0}, {638, 334, 0, 0},
+ {1904, 772, 0, 0}},
+ {{6489, 1800, 0, 0}, {1626, 273, 0, 0}, {1055, 228, 0, 0},
+ {839, 174, 0, 0}}
+ },
+ {
+ {{30124, 7570, 0, 0}, {730, 317, 0, 0}, {129, 73, 0, 0},
+ {602, 250, 0, 0}},
+ {{15581, 5100, 0, 0}, {1054, 218, 0, 0}, {485, 90, 0, 0},
+ {838, 205, 0, 0}}
+ },
+ {
+ {{31724, 30511, 0, 0}, {2013, 845, 0, 0}, {560, 75, 0, 0},
+ {524, 153, 0, 0}},
+ {{11451, 6561, 0, 0}, {3635, 1900, 0, 0}, {3457, 1537, 0, 0},
+ {3111, 1681, 0, 0}}
+ },
+ {
+ {{32290, 30934, 0, 0}, {1763, 781, 0, 0}, {451, 44, 0, 0},
+ {1903, 120, 0, 0}},
+ {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}}
+ }
+ },
+ {
+ {
+ {{12676, 1994, 0, 0}, {2073, 748, 0, 0}, {1637, 665, 0, 0},
+ {4102, 1898, 0, 0}},
+ {{5510, 1673, 0, 0}, {964, 145, 0, 0}, {1005, 240, 0, 0},
+ {1330, 262, 0, 0}}
+ },
+ {
+ {{14719, 2279, 0, 0}, {1062, 482, 0, 0}, {605, 295, 0, 0},
+ {1218, 584, 0, 0}},
+ {{5652, 1926, 0, 0}, {797, 170, 0, 0}, {680, 192, 0, 0},
+ {701, 104, 0, 0}}
+ },
+ {
+ {{19914, 3675, 0, 0}, {496, 210, 0, 0}, {101, 39, 0, 0},
+ {462, 183, 0, 0}},
+ {{7292, 2402, 0, 0}, {599, 81, 0, 0}, {289, 79, 0, 0},
+ {1095, 134, 0, 0}}
+ },
+ {
+ {{29959, 13467, 0, 0}, {563, 146, 0, 0}, {430, 38, 0, 0},
+ {982, 152, 0, 0}},
+ {{10031, 3663, 0, 0}, {1958, 406, 0, 0}, {2754, 141, 0, 0},
+ {2240, 194, 0, 0}}
+ },
+ {
+ {{31833, 29386, 0, 0}, {1979, 859, 0, 0}, {302, 12, 0, 0},
+ {1908, 255, 0, 0}},
+ {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}}
+ }
+ },
+ {
+ {
+ {{10271, 1570, 0, 0}, {1053, 273, 0, 0}, {1162, 431, 0, 0},
+ {2380, 778, 0, 0}},
+ {{4891, 1184, 0, 0}, {598, 40, 0, 0}, {613, 80, 0, 0},
+ {549, 66, 0, 0}}
+ },
+ {
+ {{11311, 1725, 0, 0}, {817, 285, 0, 0}, {615, 206, 0, 0},
+ {1295, 553, 0, 0}},
+ {{5210, 1617, 0, 0}, {748, 128, 0, 0}, {671, 193, 0, 0},
+ {526, 49, 0, 0}}
+ },
+ {
+ {{12788, 2177, 0, 0}, {549, 171, 0, 0}, {187, 62, 0, 0},
+ {965, 481, 0, 0}},
+ {{6295, 2261, 0, 0}, {337, 45, 0, 0}, {572, 157, 0, 0},
+ {1180, 240, 0, 0}}
+ },
+ {
+ {{8121, 2305, 0, 0}, {356, 73, 0, 0}, {300, 48, 0, 0},
+ {1499, 245, 0, 0}},
+ {{4286, 1263, 0, 0}, {616, 67, 0, 0}, {1036, 170, 0, 0},
+ {1001, 56, 0, 0}}
+ },
+ {
+ {{20410, 7791, 0, 0}, {1437, 383, 0, 0}, {134, 12, 0, 0},
+ {2357, 220, 0, 0}},
+ {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}}
+ }
+ }
+};
+/* clang-format on */
+
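+// Default CDFs for the coeff_base symbol, with the same inverted layout and
+// trailing adaptation-counter slot as the table above. The repeated
+// {24576, 16384, 8192, 0, 0} rows encode a uniform distribution and appear
+// to fill context slots that are unused at a given transform size or plane.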
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseCdf
+ [kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1] = {
+ {{{{28734, 23838, 20041, 0, 0}, {14686, 3027, 891, 0, 0},
+ {20172, 6644, 2275, 0, 0}, {23322, 11650, 5763, 0, 0},
+ {26460, 17627, 11489, 0, 0}, {30305, 26411, 22985, 0, 0},
+ {12101, 2222, 839, 0, 0}, {19725, 6645, 2634, 0, 0},
+ {24617, 14011, 7990, 0, 0}, {27513, 19929, 14136, 0, 0},
+ {29948, 25562, 21607, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {17032, 5215, 2164, 0, 0},
+ {21558, 8974, 3981, 0, 0}, {26821, 18894, 13067, 0, 0},
+ {28553, 23445, 18877, 0, 0}, {29935, 26306, 22709, 0, 0},
+ {13163, 2375, 1186, 0, 0}, {19245, 6516, 2520, 0, 0},
+ {24322, 14146, 8256, 0, 0}, {28950, 22425, 16794, 0, 0},
+ {31287, 28651, 25972, 0, 0}, {10119, 1466, 578, 0, 0},
+ {17939, 5641, 2319, 0, 0}, {24455, 15066, 9464, 0, 0},
+ {29746, 24467, 19982, 0, 0}, {31232, 28356, 25584, 0, 0},
+ {10414, 2994, 1396, 0, 0}, {18045, 7296, 3554, 0, 0},
+ {26095, 19023, 14106, 0, 0}, {30700, 27002, 23446, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{26466, 16324, 11007, 0, 0}, {9728, 1230, 293, 0, 0},
+ {17572, 4316, 1272, 0, 0}, {22748, 9822, 4254, 0, 0},
+ {26235, 15906, 9267, 0, 0}, {29230, 22952, 17692, 0, 0},
+ {8324, 893, 243, 0, 0}, {16887, 3844, 1133, 0, 0},
+ {22846, 9895, 4302, 0, 0}, {26241, 15802, 9077, 0, 0},
+ {28654, 21465, 15548, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {12567, 1998, 559, 0, 0},
+ {18014, 4697, 1510, 0, 0}, {24390, 12582, 6251, 0, 0},
+ {26852, 17469, 10790, 0, 0}, {28500, 21185, 14867, 0, 0},
+ {8407, 743, 187, 0, 0}, {14095, 2663, 825, 0, 0},
+ {22572, 10524, 5192, 0, 0}, {27273, 18419, 12351, 0, 0},
+ {30092, 25353, 21270, 0, 0}, {8090, 810, 183, 0, 0},
+ {14139, 2862, 937, 0, 0}, {23404, 12044, 6453, 0, 0},
+ {28127, 20450, 14674, 0, 0}, {30010, 25381, 21189, 0, 0},
+ {7335, 926, 299, 0, 0}, {13973, 3479, 1357, 0, 0},
+ {25124, 15184, 9176, 0, 0}, {29360, 23754, 17721, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{28232, 22696, 18767, 0, 0}, {7309, 1352, 562, 0, 0},
+ {16163, 4720, 1950, 0, 0}, {21760, 9911, 5049, 0, 0},
+ {25853, 16500, 10453, 0, 0}, {30143, 25956, 22231, 0, 0},
+ {8511, 980, 269, 0, 0}, {15888, 3314, 889, 0, 0},
+ {20810, 7714, 2990, 0, 0}, {24852, 14050, 7684, 0, 0},
+ {29385, 23991, 19322, 0, 0}, {10048, 1165, 375, 0, 0},
+ {17808, 4643, 1433, 0, 0}, {23037, 10558, 4840, 0, 0},
+ {26464, 16936, 10491, 0, 0}, {29858, 24950, 20602, 0, 0},
+ {12393, 2141, 637, 0, 0}, {18864, 5484, 1881, 0, 0},
+ {23400, 11210, 5624, 0, 0}, {26831, 17802, 11649, 0, 0},
+ {30101, 25543, 21449, 0, 0}, {8798, 1298, 390, 0, 0},
+ {15595, 3034, 750, 0, 0}, {19973, 7327, 2803, 0, 0},
+ {23787, 13088, 6875, 0, 0}, {28040, 21396, 15866, 0, 0},
+ {8481, 971, 329, 0, 0}, {16065, 3623, 1072, 0, 0},
+ {21935, 9214, 4043, 0, 0}, {26300, 16202, 9711, 0, 0},
+ {30353, 26206, 22490, 0, 0}, {6158, 373, 109, 0, 0},
+ {14178, 2270, 651, 0, 0}, {20348, 7012, 2818, 0, 0},
+ {25129, 14022, 8058, 0, 0}, {29767, 24682, 20421, 0, 0},
+ {7692, 704, 188, 0, 0}, {14822, 2640, 740, 0, 0},
+ {20744, 7783, 3390, 0, 0}, {25251, 14378, 8464, 0, 0},
+ {29525, 23987, 19437, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{26731, 15997, 10811, 0, 0}, {7994, 1064, 342, 0, 0},
+ {15938, 4179, 1712, 0, 0}, {22166, 9940, 5008, 0, 0},
+ {26035, 15939, 9697, 0, 0}, {29518, 23854, 19212, 0, 0},
+ {7186, 548, 100, 0, 0}, {14109, 2426, 545, 0, 0},
+ {20222, 6619, 2253, 0, 0}, {24348, 12317, 5967, 0, 0},
+ {28132, 20348, 14424, 0, 0}, {5187, 406, 129, 0, 0},
+ {13781, 2685, 790, 0, 0}, {21441, 8520, 3684, 0, 0},
+ {25504, 15049, 8648, 0, 0}, {28773, 22000, 16599, 0, 0},
+ {6875, 937, 281, 0, 0}, {16191, 4181, 1389, 0, 0},
+ {22579, 10020, 4586, 0, 0}, {25936, 15674, 9212, 0, 0},
+ {29060, 22658, 17434, 0, 0}, {6864, 486, 112, 0, 0},
+ {13047, 1976, 492, 0, 0}, {19949, 6525, 2357, 0, 0},
+ {24196, 12154, 5877, 0, 0}, {27404, 18709, 12301, 0, 0},
+ {6188, 330, 91, 0, 0}, {11916, 1543, 428, 0, 0},
+ {20333, 7068, 2801, 0, 0}, {24077, 11943, 5792, 0, 0},
+ {28322, 20559, 15499, 0, 0}, {5418, 339, 72, 0, 0},
+ {11396, 1791, 496, 0, 0}, {20095, 7498, 2915, 0, 0},
+ {23560, 11843, 6128, 0, 0}, {27750, 19417, 14036, 0, 0},
+ {5417, 289, 55, 0, 0}, {11370, 1559, 381, 0, 0},
+ {20606, 7721, 2926, 0, 0}, {24872, 14077, 7449, 0, 0},
+ {28098, 19886, 13887, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{27281, 22308, 19060, 0, 0}, {11171, 4465, 2094, 0, 0},
+ {21731, 10815, 6292, 0, 0}, {24621, 14806, 9816, 0, 0},
+ {27526, 19707, 14236, 0, 0}, {30879, 27560, 24586, 0, 0},
+ {5994, 635, 178, 0, 0}, {14924, 3204, 1001, 0, 0},
+ {21078, 8330, 3597, 0, 0}, {25226, 14553, 8309, 0, 0},
+ {29775, 24718, 20449, 0, 0}, {4745, 440, 177, 0, 0},
+ {14117, 2642, 814, 0, 0}, {20604, 7622, 3179, 0, 0},
+ {25006, 14238, 7997, 0, 0}, {29276, 23585, 18848, 0, 0},
+ {5177, 760, 277, 0, 0}, {15619, 3915, 1258, 0, 0},
+ {21283, 8765, 3908, 0, 0}, {25071, 14682, 8558, 0, 0},
+ {29693, 24769, 20550, 0, 0}, {4500, 286, 114, 0, 0},
+ {13137, 1717, 364, 0, 0}, {18908, 5508, 1748, 0, 0},
+ {23163, 11155, 5174, 0, 0}, {27892, 20606, 14860, 0, 0},
+ {5520, 452, 192, 0, 0}, {13813, 2311, 693, 0, 0},
+ {20944, 8771, 3973, 0, 0}, {25422, 14572, 8121, 0, 0},
+ {29365, 23521, 18657, 0, 0}, {3057, 113, 33, 0, 0},
+ {11599, 1374, 351, 0, 0}, {19281, 5570, 1811, 0, 0},
+ {23940, 11085, 5154, 0, 0}, {28498, 21317, 15730, 0, 0},
+ {4060, 190, 37, 0, 0}, {12648, 1527, 286, 0, 0},
+ {19076, 5218, 1447, 0, 0}, {23350, 10254, 4329, 0, 0},
+ {27769, 19485, 13306, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{27095, 18466, 13057, 0, 0}, {6517, 2067, 934, 0, 0},
+ {19986, 8985, 4965, 0, 0}, {23641, 12111, 6960, 0, 0},
+ {26400, 16560, 11306, 0, 0}, {30303, 25591, 21946, 0, 0},
+ {2807, 205, 49, 0, 0}, {14450, 2877, 819, 0, 0},
+ {21407, 8254, 3411, 0, 0}, {24868, 13165, 7161, 0, 0},
+ {28766, 22178, 17222, 0, 0}, {3131, 458, 173, 0, 0},
+ {14472, 2855, 959, 0, 0}, {22624, 11253, 5897, 0, 0},
+ {27410, 18446, 12374, 0, 0}, {29701, 24406, 19422, 0, 0},
+ {4116, 298, 92, 0, 0}, {15230, 1997, 559, 0, 0},
+ {18844, 5886, 2274, 0, 0}, {22272, 9931, 4899, 0, 0},
+ {25532, 16372, 11147, 0, 0}, {2025, 81, 22, 0, 0},
+ {9762, 1092, 279, 0, 0}, {18274, 4940, 1648, 0, 0},
+ {22594, 9967, 4416, 0, 0}, {26526, 17487, 11725, 0, 0},
+ {6951, 525, 48, 0, 0}, {14150, 1401, 443, 0, 0},
+ {18771, 4450, 890, 0, 0}, {20513, 6234, 1385, 0, 0},
+ {23207, 11180, 4318, 0, 0}, {4580, 133, 44, 0, 0},
+ {10708, 403, 40, 0, 0}, {14666, 2078, 240, 0, 0},
+ {18572, 3904, 769, 0, 0}, {20506, 6976, 1903, 0, 0},
+ {8592, 659, 140, 0, 0}, {14488, 3087, 805, 0, 0},
+ {22563, 9065, 3104, 0, 0}, {24879, 12743, 5092, 0, 0},
+ {26708, 16025, 8798, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{27627, 25672, 24508, 0, 0}, {5582, 3746, 2979, 0, 0},
+ {26100, 20200, 17086, 0, 0}, {30596, 26587, 24130, 0, 0},
+ {31642, 29389, 28237, 0, 0}, {32325, 31407, 30514, 0, 0},
+ {6685, 1615, 332, 0, 0}, {19282, 8165, 4285, 0, 0},
+ {26260, 17928, 12858, 0, 0}, {29382, 23968, 19482, 0, 0},
+ {31238, 28446, 25714, 0, 0}, {3129, 688, 220, 0, 0},
+ {16871, 5216, 2478, 0, 0}, {24180, 12721, 7385, 0, 0},
+ {27879, 19429, 13499, 0, 0}, {30528, 25897, 22270, 0, 0},
+ {4603, 571, 251, 0, 0}, {12033, 2341, 1200, 0, 0},
+ {18443, 8097, 5076, 0, 0}, {27649, 20214, 14963, 0, 0},
+ {30958, 27327, 24507, 0, 0}, {1556, 44, 20, 0, 0},
+ {9416, 1002, 223, 0, 0}, {18099, 5198, 1709, 0, 0},
+ {24276, 11874, 5496, 0, 0}, {29124, 22574, 17564, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{30307, 25755, 23397, 0, 0}, {8019, 3168, 1782, 0, 0},
+ {23302, 13731, 10351, 0, 0}, {29184, 23488, 18368, 0, 0},
+ {31263, 28839, 27335, 0, 0}, {32091, 31268, 30032, 0, 0},
+ {8781, 2066, 651, 0, 0}, {19214, 8197, 3505, 0, 0},
+ {26557, 18212, 11613, 0, 0}, {29633, 21796, 17143, 0, 0},
+ {30333, 25641, 21341, 0, 0}, {1468, 236, 218, 0, 0},
+ {18011, 2403, 814, 0, 0}, {28363, 21156, 14215, 0, 0},
+ {32188, 28636, 25446, 0, 0}, {31073, 22599, 18644, 0, 0},
+ {2760, 486, 177, 0, 0}, {13524, 2660, 1020, 0, 0},
+ {21588, 8610, 3213, 0, 0}, {27118, 17796, 13559, 0, 0},
+ {30654, 27659, 24312, 0, 0}, {912, 52, 20, 0, 0},
+ {9756, 1104, 196, 0, 0}, {19074, 6112, 2132, 0, 0},
+ {24626, 13260, 6675, 0, 0}, {28515, 21813, 16044, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{32167, 31785, 31457, 0, 0}, {14043, 9362, 4681, 0, 0},
+ {27307, 24576, 21845, 0, 0}, {28987, 17644, 11343, 0, 0},
+ {30181, 25007, 20696, 0, 0}, {32662, 32310, 31958, 0, 0},
+ {10486, 3058, 874, 0, 0}, {24260, 11842, 6784, 0, 0},
+ {29042, 20055, 14685, 0, 0}, {31148, 25656, 21875, 0, 0},
+ {32039, 30532, 29273, 0, 0}, {2605, 294, 84, 0, 0},
+ {14464, 2304, 768, 0, 0}, {21325, 6242, 3121, 0, 0},
+ {26761, 17476, 11469, 0, 0}, {30534, 26065, 23831, 0, 0},
+ {1814, 591, 197, 0, 0}, {15405, 3206, 1692, 0, 0},
+ {23082, 10304, 5358, 0, 0}, {24576, 16384, 11378, 0, 0},
+ {31013, 24722, 21504, 0, 0}, {1600, 34, 20, 0, 0},
+ {10282, 1327, 297, 0, 0}, {19935, 7141, 3030, 0, 0},
+ {25788, 15389, 9646, 0, 0}, {29657, 23881, 19289, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}},
+ {{{{26727, 20914, 16841, 0, 0}, {12442, 1863, 517, 0, 0},
+ {18604, 5937, 2043, 0, 0}, {23008, 12121, 6183, 0, 0},
+ {26352, 17815, 11549, 0, 0}, {29802, 25617, 21877, 0, 0},
+ {9201, 1394, 514, 0, 0}, {17790, 5352, 1822, 0, 0},
+ {23334, 12543, 6514, 0, 0}, {26110, 18210, 12233, 0, 0},
+ {28852, 24091, 19779, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {14680, 3223, 1181, 0, 0},
+ {19706, 6925, 2695, 0, 0}, {23828, 15941, 10517, 0, 0},
+ {25114, 19548, 14795, 0, 0}, {27035, 22452, 18312, 0, 0},
+ {9889, 1380, 654, 0, 0}, {17553, 4775, 1813, 0, 0},
+ {23371, 13323, 7790, 0, 0}, {29326, 22955, 17424, 0, 0},
+ {31400, 28832, 26236, 0, 0}, {7274, 735, 362, 0, 0},
+ {15996, 4805, 2050, 0, 0}, {23349, 14603, 9508, 0, 0},
+ {30091, 25267, 20971, 0, 0}, {31252, 28424, 25598, 0, 0},
+ {6212, 1314, 667, 0, 0}, {15640, 5733, 2660, 0, 0},
+ {24444, 17424, 12519, 0, 0}, {30865, 27072, 23299, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24313, 13765, 8400, 0, 0}, {9205, 747, 164, 0, 0},
+ {16531, 3322, 833, 0, 0}, {22044, 8769, 3410, 0, 0},
+ {26043, 15240, 8352, 0, 0}, {28841, 21841, 15943, 0, 0},
+ {6455, 480, 134, 0, 0}, {15338, 2673, 673, 0, 0},
+ {21652, 8162, 3089, 0, 0}, {25573, 14384, 7499, 0, 0},
+ {28042, 19916, 13453, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {9946, 1120, 285, 0, 0},
+ {16044, 3135, 839, 0, 0}, {22507, 9735, 4043, 0, 0},
+ {25739, 14928, 8240, 0, 0}, {27901, 18882, 11266, 0, 0},
+ {7470, 876, 277, 0, 0}, {14959, 3438, 1256, 0, 0},
+ {23100, 11439, 6189, 0, 0}, {27994, 19812, 13792, 0, 0},
+ {30446, 25738, 21228, 0, 0}, {7296, 848, 225, 0, 0},
+ {14811, 3381, 1136, 0, 0}, {23572, 12175, 6368, 0, 0},
+ {28088, 20063, 13566, 0, 0}, {29851, 24312, 19332, 0, 0},
+ {6297, 709, 194, 0, 0}, {14310, 2985, 859, 0, 0},
+ {24368, 13304, 6812, 0, 0}, {28956, 21795, 15562, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{25989, 19025, 15090, 0, 0}, {7962, 971, 311, 0, 0},
+ {15152, 3721, 1396, 0, 0}, {21705, 9593, 4765, 0, 0},
+ {26247, 16658, 10444, 0, 0}, {30004, 25264, 21114, 0, 0},
+ {7502, 401, 131, 0, 0}, {13714, 2215, 593, 0, 0},
+ {20629, 7556, 2961, 0, 0}, {25457, 14606, 8064, 0, 0},
+ {29371, 23604, 18694, 0, 0}, {6780, 560, 246, 0, 0},
+ {16515, 3856, 1242, 0, 0}, {23617, 11381, 5396, 0, 0},
+ {27080, 17853, 11272, 0, 0}, {30051, 25141, 20764, 0, 0},
+ {9624, 913, 325, 0, 0}, {16698, 4277, 1443, 0, 0},
+ {24066, 12301, 6251, 0, 0}, {27525, 18812, 12401, 0, 0},
+ {30147, 25433, 21201, 0, 0}, {6132, 428, 138, 0, 0},
+ {12778, 1718, 427, 0, 0}, {19525, 6663, 2453, 0, 0},
+ {24180, 13247, 6850, 0, 0}, {28051, 21183, 15464, 0, 0},
+ {6924, 476, 186, 0, 0}, {13678, 2133, 671, 0, 0},
+ {20805, 8222, 3829, 0, 0}, {26550, 16681, 10414, 0, 0},
+ {30428, 26160, 22342, 0, 0}, {4722, 192, 74, 0, 0},
+ {11590, 1455, 472, 0, 0}, {19282, 6584, 2898, 0, 0},
+ {25619, 14897, 9045, 0, 0}, {29935, 24810, 20509, 0, 0},
+ {5058, 240, 82, 0, 0}, {12094, 1692, 500, 0, 0},
+ {20355, 7813, 3525, 0, 0}, {26092, 15841, 9671, 0, 0},
+ {29802, 24435, 19849, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24129, 13429, 8339, 0, 0}, {8364, 931, 243, 0, 0},
+ {15771, 3343, 984, 0, 0}, {21515, 8534, 3619, 0, 0},
+ {26017, 15374, 8740, 0, 0}, {29278, 22938, 17577, 0, 0},
+ {6485, 297, 54, 0, 0}, {13169, 1600, 326, 0, 0},
+ {19622, 5814, 1875, 0, 0}, {24554, 12180, 5878, 0, 0},
+ {28069, 19687, 13468, 0, 0}, {4556, 310, 99, 0, 0},
+ {14174, 2452, 668, 0, 0}, {21549, 8360, 3534, 0, 0},
+ {25903, 15112, 8619, 0, 0}, {29090, 22406, 16762, 0, 0},
+ {6943, 632, 152, 0, 0}, {15455, 2915, 747, 0, 0},
+ {21571, 8297, 3296, 0, 0}, {25821, 14987, 8363, 0, 0},
+ {29000, 22108, 16507, 0, 0}, {5416, 268, 62, 0, 0},
+ {11918, 1300, 299, 0, 0}, {18747, 5061, 1635, 0, 0},
+ {23804, 11020, 4930, 0, 0}, {27331, 18103, 11581, 0, 0},
+ {6464, 276, 70, 0, 0}, {12359, 1388, 383, 0, 0},
+ {19086, 5546, 2136, 0, 0}, {23794, 11532, 6083, 0, 0},
+ {28534, 21103, 15834, 0, 0}, {6495, 411, 57, 0, 0},
+ {12096, 1526, 327, 0, 0}, {18596, 5514, 1866, 0, 0},
+ {22898, 10870, 5493, 0, 0}, {27604, 19262, 13498, 0, 0},
+ {6043, 309, 40, 0, 0}, {11777, 1326, 241, 0, 0},
+ {19697, 6334, 1957, 0, 0}, {24584, 12678, 6026, 0, 0},
+ {27965, 19513, 12873, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{25213, 17826, 14267, 0, 0}, {8358, 1590, 481, 0, 0},
+ {18374, 6030, 2515, 0, 0}, {24355, 13214, 7573, 0, 0},
+ {28002, 19844, 13983, 0, 0}, {30739, 26962, 23561, 0, 0},
+ {5992, 404, 105, 0, 0}, {14036, 2801, 837, 0, 0},
+ {21763, 8982, 3916, 0, 0}, {26302, 15859, 9258, 0, 0},
+ {29724, 24130, 19349, 0, 0}, {3560, 186, 64, 0, 0},
+ {12700, 1911, 560, 0, 0}, {20765, 7683, 3173, 0, 0},
+ {25821, 15018, 8579, 0, 0}, {29523, 23665, 18761, 0, 0},
+ {5409, 303, 99, 0, 0}, {13347, 2154, 594, 0, 0},
+ {20853, 7758, 3189, 0, 0}, {25818, 15092, 8694, 0, 0},
+ {29761, 24295, 19672, 0, 0}, {3766, 92, 33, 0, 0},
+ {10666, 919, 192, 0, 0}, {18360, 4759, 1363, 0, 0},
+ {23741, 11089, 4837, 0, 0}, {28074, 20090, 14020, 0, 0},
+ {4552, 240, 86, 0, 0}, {11919, 1504, 450, 0, 0},
+ {20012, 6953, 3017, 0, 0}, {25203, 13967, 7845, 0, 0},
+ {29259, 23235, 18291, 0, 0}, {2635, 81, 29, 0, 0},
+ {9705, 858, 253, 0, 0}, {18180, 4717, 1636, 0, 0},
+ {23683, 11119, 5311, 0, 0}, {28507, 21114, 15504, 0, 0},
+ {3250, 77, 20, 0, 0}, {10317, 809, 155, 0, 0},
+ {17904, 4046, 1068, 0, 0}, {23073, 9804, 4052, 0, 0},
+ {27836, 19410, 13266, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{26303, 15810, 11080, 0, 0}, {7569, 1254, 408, 0, 0},
+ {17994, 5619, 2161, 0, 0}, {23511, 11330, 5796, 0, 0},
+ {27045, 17585, 10886, 0, 0}, {29618, 23889, 19037, 0, 0},
+ {5779, 506, 86, 0, 0}, {15372, 2831, 683, 0, 0},
+ {21381, 7867, 2984, 0, 0}, {25479, 13947, 7220, 0, 0},
+ {29034, 22191, 16682, 0, 0}, {3040, 267, 73, 0, 0},
+ {15337, 3067, 865, 0, 0}, {22847, 9942, 4468, 0, 0},
+ {26872, 17334, 10700, 0, 0}, {29338, 23122, 18011, 0, 0},
+ {4154, 257, 63, 0, 0}, {13404, 2130, 505, 0, 0},
+ {19639, 6514, 2366, 0, 0}, {24014, 12284, 6328, 0, 0},
+ {28390, 21161, 15658, 0, 0}, {2476, 97, 24, 0, 0},
+ {10988, 1165, 267, 0, 0}, {18454, 4939, 1477, 0, 0},
+ {23157, 10441, 4505, 0, 0}, {27878, 19681, 13703, 0, 0},
+ {6906, 201, 35, 0, 0}, {11974, 718, 201, 0, 0},
+ {15525, 2143, 514, 0, 0}, {19485, 5140, 1294, 0, 0},
+ {23099, 10236, 3850, 0, 0}, {5333, 71, 20, 0, 0},
+ {7846, 378, 54, 0, 0}, {11319, 1264, 232, 0, 0},
+ {16376, 3039, 936, 0, 0}, {21076, 7884, 3692, 0, 0},
+ {8575, 478, 33, 0, 0}, {13859, 1664, 205, 0, 0},
+ {20532, 5927, 1365, 0, 0}, {24597, 10928, 3686, 0, 0},
+ {25544, 15488, 7493, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{29690, 25929, 22878, 0, 0}, {18931, 12318, 8289, 0, 0},
+ {26854, 18546, 13440, 0, 0}, {28902, 22501, 18006, 0, 0},
+ {30156, 25560, 21726, 0, 0}, {31701, 29777, 27992, 0, 0},
+ {6951, 1122, 239, 0, 0}, {19060, 6430, 2383, 0, 0},
+ {25440, 14183, 7898, 0, 0}, {28077, 19688, 13492, 0, 0},
+ {30943, 27515, 24416, 0, 0}, {3382, 453, 144, 0, 0},
+ {15608, 3767, 1408, 0, 0}, {23166, 10906, 5372, 0, 0},
+ {26853, 16996, 10620, 0, 0}, {29982, 24989, 20721, 0, 0},
+ {3522, 318, 105, 0, 0}, {14072, 2839, 950, 0, 0},
+ {22258, 9399, 4208, 0, 0}, {26539, 16269, 9643, 0, 0},
+ {30160, 25320, 21063, 0, 0}, {2015, 58, 20, 0, 0},
+ {11130, 1281, 265, 0, 0}, {19831, 5914, 1898, 0, 0},
+ {24586, 12172, 5798, 0, 0}, {29131, 22499, 17271, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{27524, 20618, 15862, 0, 0}, {12282, 5910, 3067, 0, 0},
+ {25012, 14451, 9033, 0, 0}, {29316, 23512, 19622, 0, 0},
+ {30748, 27562, 24539, 0, 0}, {30967, 27775, 24865, 0, 0},
+ {5717, 910, 237, 0, 0}, {16780, 5237, 2149, 0, 0},
+ {23580, 11284, 6049, 0, 0}, {26495, 15582, 8968, 0, 0},
+ {29660, 23413, 18004, 0, 0}, {1692, 248, 88, 0, 0},
+ {14649, 2731, 918, 0, 0}, {22524, 9799, 5296, 0, 0},
+ {28076, 18691, 13495, 0, 0}, {29074, 21091, 15212, 0, 0},
+ {2708, 187, 48, 0, 0}, {11757, 1993, 648, 0, 0},
+ {20837, 7948, 3479, 0, 0}, {25649, 15106, 8412, 0, 0},
+ {28935, 22062, 16464, 0, 0}, {814, 37, 20, 0, 0},
+ {8855, 1044, 279, 0, 0}, {17248, 4708, 1482, 0, 0},
+ {21251, 9760, 4197, 0, 0}, {26575, 18260, 12139, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{31733, 29961, 28612, 0, 0}, {19606, 14630, 11829, 0, 0},
+ {30072, 26135, 24013, 0, 0}, {31395, 28607, 25915, 0, 0},
+ {31669, 30022, 28052, 0, 0}, {32428, 31747, 31169, 0, 0},
+ {9942, 2349, 633, 0, 0}, {22373, 11006, 5826, 0, 0},
+ {28042, 20361, 15407, 0, 0}, {30321, 25688, 22175, 0, 0},
+ {31541, 29051, 26757, 0, 0}, {4612, 1344, 834, 0, 0},
+ {15853, 5014, 2395, 0, 0}, {23620, 11778, 6337, 0, 0},
+ {26818, 17253, 11620, 0, 0}, {30276, 25441, 21242, 0, 0},
+ {2166, 291, 98, 0, 0}, {12742, 2813, 1200, 0, 0},
+ {21548, 9140, 4663, 0, 0}, {26116, 15749, 9795, 0, 0},
+ {29704, 24232, 19725, 0, 0}, {999, 44, 20, 0, 0},
+ {10538, 1881, 395, 0, 0}, {20534, 7689, 3037, 0, 0},
+ {25442, 13952, 7415, 0, 0}, {28835, 21861, 16152, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}},
+ {{{{23872, 16541, 12138, 0, 0}, {9139, 986, 241, 0, 0},
+ {17595, 5013, 1447, 0, 0}, {22610, 11535, 5386, 0, 0},
+ {26348, 17911, 11210, 0, 0}, {29499, 24613, 20122, 0, 0},
+ {7933, 759, 272, 0, 0}, {16259, 4347, 1189, 0, 0},
+ {21811, 11254, 5350, 0, 0}, {24887, 16838, 10672, 0, 0},
+ {27380, 21808, 16850, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {12023, 1995, 675, 0, 0},
+ {17568, 5547, 1907, 0, 0}, {19736, 11895, 7101, 0, 0},
+ {20483, 14105, 9274, 0, 0}, {21205, 15287, 11279, 0, 0},
+ {6508, 786, 448, 0, 0}, {17371, 4685, 1668, 0, 0},
+ {23026, 13551, 7944, 0, 0}, {29507, 23139, 17406, 0, 0},
+ {31288, 28446, 25269, 0, 0}, {5169, 512, 308, 0, 0},
+ {15911, 5109, 1994, 0, 0}, {23217, 14478, 9020, 0, 0},
+ {29716, 23835, 18665, 0, 0}, {30747, 26858, 22981, 0, 0},
+ {3763, 753, 376, 0, 0}, {15091, 5074, 1905, 0, 0},
+ {23564, 15412, 9549, 0, 0}, {30365, 25252, 19954, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{21960, 10712, 5872, 0, 0}, {7029, 455, 92, 0, 0},
+ {15480, 2565, 547, 0, 0}, {21409, 7890, 2872, 0, 0},
+ {25819, 15001, 7875, 0, 0}, {28481, 20972, 14697, 0, 0},
+ {4888, 247, 63, 0, 0}, {13730, 1764, 354, 0, 0},
+ {20204, 6423, 2000, 0, 0}, {24499, 12821, 5989, 0, 0},
+ {27094, 18111, 11094, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {7026, 449, 97, 0, 0},
+ {13211, 1604, 314, 0, 0}, {19387, 6387, 2013, 0, 0},
+ {22667, 11302, 6046, 0, 0}, {23559, 13118, 5943, 0, 0},
+ {5661, 851, 336, 0, 0}, {14712, 3875, 1565, 0, 0},
+ {22568, 11334, 6004, 0, 0}, {28108, 19855, 13266, 0, 0},
+ {30400, 25838, 20264, 0, 0}, {5808, 610, 155, 0, 0},
+ {14140, 2763, 737, 0, 0}, {22535, 10326, 4536, 0, 0},
+ {27297, 18138, 11252, 0, 0}, {29533, 22001, 15659, 0, 0},
+ {5072, 328, 76, 0, 0}, {12736, 1601, 330, 0, 0},
+ {24068, 11427, 4326, 0, 0}, {27106, 17937, 10973, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{23064, 15474, 11636, 0, 0}, {6006, 490, 135, 0, 0},
+ {14386, 3148, 949, 0, 0}, {21877, 9293, 4045, 0, 0},
+ {26410, 16185, 9459, 0, 0}, {29520, 23650, 18627, 0, 0},
+ {5564, 195, 69, 0, 0}, {12950, 1944, 439, 0, 0},
+ {20996, 7648, 2727, 0, 0}, {25773, 14735, 7729, 0, 0},
+ {29016, 22326, 16670, 0, 0}, {5546, 512, 209, 0, 0},
+ {17412, 4369, 1293, 0, 0}, {23947, 12133, 5711, 0, 0},
+ {27257, 18364, 11529, 0, 0}, {29833, 24546, 19717, 0, 0},
+ {7893, 648, 239, 0, 0}, {17535, 4503, 1323, 0, 0},
+ {24163, 12198, 5836, 0, 0}, {27337, 18355, 11572, 0, 0},
+ {29774, 24427, 19545, 0, 0}, {4567, 164, 68, 0, 0},
+ {11727, 1322, 312, 0, 0}, {19547, 6555, 2293, 0, 0},
+ {24513, 13383, 6731, 0, 0}, {27838, 20183, 13938, 0, 0},
+ {4000, 320, 141, 0, 0}, {13063, 2207, 747, 0, 0},
+ {21196, 9179, 4548, 0, 0}, {27236, 17734, 11322, 0, 0},
+ {30308, 25618, 21312, 0, 0}, {2894, 149, 69, 0, 0},
+ {11147, 1697, 567, 0, 0}, {20257, 8021, 3776, 0, 0},
+ {26487, 16373, 10020, 0, 0}, {29522, 23490, 18271, 0, 0},
+ {3053, 143, 56, 0, 0}, {11810, 1757, 485, 0, 0},
+ {21535, 9097, 3962, 0, 0}, {26756, 16640, 9900, 0, 0},
+ {29341, 22917, 17354, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{21752, 10657, 5974, 0, 0}, {6822, 411, 91, 0, 0},
+ {14878, 2316, 516, 0, 0}, {21090, 7626, 2952, 0, 0},
+ {26048, 15234, 8184, 0, 0}, {28538, 21103, 14948, 0, 0},
+ {4368, 145, 21, 0, 0}, {11604, 1100, 193, 0, 0},
+ {19196, 5380, 1586, 0, 0}, {24534, 12018, 5410, 0, 0},
+ {27703, 18713, 11871, 0, 0}, {3787, 221, 63, 0, 0},
+ {14087, 2225, 529, 0, 0}, {21849, 8693, 3482, 0, 0},
+ {26337, 15569, 8691, 0, 0}, {28949, 22304, 16150, 0, 0},
+ {5898, 301, 75, 0, 0}, {13727, 1937, 421, 0, 0},
+ {20974, 7557, 2752, 0, 0}, {25880, 14749, 7798, 0, 0},
+ {28398, 20405, 13776, 0, 0}, {3190, 98, 24, 0, 0},
+ {9609, 761, 155, 0, 0}, {17453, 4099, 1092, 0, 0},
+ {23470, 10161, 3986, 0, 0}, {26624, 16855, 9800, 0, 0},
+ {4658, 269, 99, 0, 0}, {11194, 1831, 753, 0, 0},
+ {20009, 7950, 4041, 0, 0}, {26223, 16007, 9726, 0, 0},
+ {29119, 22171, 15935, 0, 0}, {4605, 216, 40, 0, 0},
+ {10667, 1299, 304, 0, 0}, {19608, 7296, 2625, 0, 0},
+ {25465, 14084, 7300, 0, 0}, {27527, 18793, 11813, 0, 0},
+ {4368, 137, 24, 0, 0}, {10664, 975, 165, 0, 0},
+ {19211, 6197, 1922, 0, 0}, {25019, 12907, 6093, 0, 0},
+ {27895, 18738, 11534, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{22968, 15133, 11695, 0, 0}, {6615, 883, 241, 0, 0},
+ {17730, 4916, 1762, 0, 0}, {24050, 12204, 6282, 0, 0},
+ {27640, 18692, 12254, 0, 0}, {30132, 25202, 20843, 0, 0},
+ {5217, 264, 67, 0, 0}, {14458, 2714, 668, 0, 0},
+ {22557, 9348, 3686, 0, 0}, {26546, 15892, 8852, 0, 0},
+ {29306, 22814, 17270, 0, 0}, {2777, 135, 47, 0, 0},
+ {12885, 2017, 567, 0, 0}, {21627, 8584, 3483, 0, 0},
+ {26348, 15828, 8994, 0, 0}, {29376, 23015, 17650, 0, 0},
+ {4303, 152, 56, 0, 0}, {12918, 2066, 524, 0, 0},
+ {21785, 8744, 3545, 0, 0}, {26474, 15998, 9186, 0, 0},
+ {29524, 23485, 18259, 0, 0}, {2745, 51, 20, 0, 0},
+ {9828, 736, 142, 0, 0}, {18486, 4840, 1295, 0, 0},
+ {24206, 11441, 4854, 0, 0}, {27922, 19375, 12849, 0, 0},
+ {2787, 178, 73, 0, 0}, {12303, 1805, 602, 0, 0},
+ {21289, 9189, 4573, 0, 0}, {26852, 17120, 10695, 0, 0},
+ {29737, 24163, 19370, 0, 0}, {1622, 77, 29, 0, 0},
+ {9662, 1044, 324, 0, 0}, {18985, 6030, 2329, 0, 0},
+ {24916, 13300, 6961, 0, 0}, {28908, 21644, 15915, 0, 0},
+ {1754, 44, 20, 0, 0}, {9139, 659, 140, 0, 0},
+ {18021, 4653, 1365, 0, 0}, {24223, 11526, 5290, 0, 0},
+ {28194, 19987, 13701, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{23583, 13074, 8080, 0, 0}, {6687, 783, 147, 0, 0},
+ {16753, 3768, 981, 0, 0}, {22226, 9078, 3562, 0, 0},
+ {26036, 14823, 8091, 0, 0}, {28852, 21729, 16046, 0, 0},
+ {4544, 202, 24, 0, 0}, {13668, 1630, 283, 0, 0},
+ {20240, 6148, 1889, 0, 0}, {25027, 12491, 5883, 0, 0},
+ {28202, 19923, 13778, 0, 0}, {2835, 175, 50, 0, 0},
+ {15098, 2435, 613, 0, 0}, {22383, 9168, 3859, 0, 0},
+ {26525, 16532, 10361, 0, 0}, {28792, 22379, 16751, 0, 0},
+ {4391, 207, 30, 0, 0}, {13402, 1593, 286, 0, 0},
+ {19441, 5593, 1674, 0, 0}, {24510, 11999, 5625, 0, 0},
+ {28065, 19570, 13241, 0, 0}, {1682, 62, 20, 0, 0},
+ {9915, 866, 185, 0, 0}, {18009, 4582, 1349, 0, 0},
+ {23484, 10386, 4420, 0, 0}, {27183, 17576, 10900, 0, 0},
+ {4477, 116, 22, 0, 0}, {12919, 661, 197, 0, 0},
+ {17934, 5950, 3554, 0, 0}, {22462, 10174, 4096, 0, 0},
+ {26153, 15384, 9384, 0, 0}, {3821, 164, 23, 0, 0},
+ {7143, 479, 122, 0, 0}, {14010, 4096, 1365, 0, 0},
+ {22751, 9338, 4245, 0, 0}, {25906, 17499, 10637, 0, 0},
+ {8835, 259, 29, 0, 0}, {12841, 1273, 137, 0, 0},
+ {20865, 6745, 2147, 0, 0}, {25742, 12674, 5516, 0, 0},
+ {26770, 14662, 8331, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{28312, 21494, 17235, 0, 0}, {11549, 3689, 1152, 0, 0},
+ {21595, 8994, 4201, 0, 0}, {25486, 14475, 8505, 0, 0},
+ {27878, 19482, 13653, 0, 0}, {30878, 27260, 24109, 0, 0},
+ {6117, 632, 121, 0, 0}, {18138, 4514, 1313, 0, 0},
+ {24052, 11481, 5373, 0, 0}, {27153, 17437, 10760, 0, 0},
+ {30093, 25068, 20618, 0, 0}, {2814, 242, 78, 0, 0},
+ {16642, 3786, 1135, 0, 0}, {23738, 11407, 5416, 0, 0},
+ {27357, 17975, 11497, 0, 0}, {29825, 24346, 19605, 0, 0},
+ {3229, 167, 38, 0, 0}, {14643, 2383, 567, 0, 0},
+ {22346, 8678, 3300, 0, 0}, {26300, 15281, 8330, 0, 0},
+ {29798, 24115, 19237, 0, 0}, {1856, 53, 20, 0, 0},
+ {12102, 1395, 271, 0, 0}, {20259, 6128, 1851, 0, 0},
+ {24710, 12139, 5478, 0, 0}, {28537, 20762, 14716, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{22566, 12135, 7284, 0, 0}, {5432, 1323, 416, 0, 0},
+ {20348, 8384, 4216, 0, 0}, {25120, 14653, 8912, 0, 0},
+ {27106, 18427, 12866, 0, 0}, {29157, 22440, 17378, 0, 0},
+ {1823, 152, 32, 0, 0}, {14086, 2263, 515, 0, 0},
+ {21255, 7432, 2565, 0, 0}, {25319, 13316, 6620, 0, 0},
+ {28286, 19717, 13882, 0, 0}, {746, 78, 21, 0, 0},
+ {14190, 2267, 622, 0, 0}, {21519, 9400, 4137, 0, 0},
+ {27123, 15810, 10610, 0, 0}, {27759, 21324, 16131, 0, 0},
+ {1411, 58, 20, 0, 0}, {11216, 1274, 264, 0, 0},
+ {18877, 5091, 1428, 0, 0}, {23717, 10670, 4596, 0, 0},
+ {27578, 19391, 13282, 0, 0}, {404, 28, 20, 0, 0},
+ {7929, 861, 217, 0, 0}, {15608, 3989, 1072, 0, 0},
+ {20316, 8631, 3166, 0, 0}, {26603, 17379, 10291, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{30193, 25487, 21691, 0, 0}, {18766, 11902, 7366, 0, 0},
+ {26425, 17712, 13110, 0, 0}, {28294, 20910, 15727, 0, 0},
+ {29903, 24469, 20234, 0, 0}, {31424, 28819, 26377, 0, 0},
+ {8048, 1529, 309, 0, 0}, {20183, 7412, 2800, 0, 0},
+ {25587, 14522, 8324, 0, 0}, {27743, 19101, 12883, 0, 0},
+ {30247, 25464, 21163, 0, 0}, {2860, 516, 184, 0, 0},
+ {15347, 3612, 1193, 0, 0}, {22879, 10580, 4986, 0, 0},
+ {26890, 17121, 10645, 0, 0}, {29954, 24103, 19445, 0, 0},
+ {2585, 200, 55, 0, 0}, {14240, 2573, 719, 0, 0},
+ {21786, 8162, 3111, 0, 0}, {25811, 14603, 7537, 0, 0},
+ {29260, 22650, 17300, 0, 0}, {1007, 32, 20, 0, 0},
+ {11727, 1440, 222, 0, 0}, {20200, 6036, 1602, 0, 0},
+ {24716, 12048, 5035, 0, 0}, {28432, 20576, 14372, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}},
+ {{{{25706, 16296, 10449, 0, 0}, {8230, 507, 94, 0, 0},
+ {19093, 4727, 989, 0, 0}, {24178, 12094, 5137, 0, 0},
+ {27083, 18093, 10755, 0, 0}, {29113, 22870, 17037, 0, 0},
+ {6275, 350, 110, 0, 0}, {16392, 3426, 678, 0, 0},
+ {22174, 10119, 3798, 0, 0}, {24592, 15598, 8465, 0, 0},
+ {27163, 20074, 13629, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {8880, 866, 226, 0, 0},
+ {14156, 3081, 781, 0, 0}, {16523, 7916, 3519, 0, 0},
+ {17003, 10160, 5209, 0, 0}, {12873, 8069, 5258, 0, 0},
+ {4367, 556, 311, 0, 0}, {17494, 4943, 1788, 0, 0},
+ {23404, 14640, 8436, 0, 0}, {30485, 24575, 17686, 0, 0},
+ {31540, 28796, 24887, 0, 0}, {3313, 299, 148, 0, 0},
+ {14787, 4523, 1380, 0, 0}, {21847, 12670, 6528, 0, 0},
+ {29025, 20939, 14111, 0, 0}, {30394, 23175, 17053, 0, 0},
+ {1700, 302, 133, 0, 0}, {12447, 3196, 797, 0, 0},
+ {21997, 12513, 5649, 0, 0}, {29973, 22358, 15407, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{23448, 10666, 4928, 0, 0}, {5711, 304, 44, 0, 0},
+ {16437, 2500, 459, 0, 0}, {22449, 8833, 3048, 0, 0},
+ {26579, 16320, 8662, 0, 0}, {29179, 21884, 13960, 0, 0},
+ {3742, 144, 20, 0, 0}, {13542, 1261, 181, 0, 0},
+ {20076, 5847, 1565, 0, 0}, {25719, 13236, 5133, 0, 0},
+ {25041, 17099, 9516, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {4712, 143, 20, 0, 0},
+ {10385, 693, 99, 0, 0}, {17351, 5670, 1019, 0, 0},
+ {14641, 6275, 5578, 0, 0}, {27307, 16384, 10923, 0, 0},
+ {4786, 677, 184, 0, 0}, {13723, 2900, 796, 0, 0},
+ {22371, 10502, 4836, 0, 0}, {26778, 19071, 11268, 0, 0},
+ {30976, 25856, 17664, 0, 0}, {4570, 267, 50, 0, 0},
+ {11234, 1247, 199, 0, 0}, {21659, 7551, 2751, 0, 0},
+ {27097, 17644, 6617, 0, 0}, {28087, 18725, 14043, 0, 0},
+ {4080, 188, 27, 0, 0}, {10192, 689, 107, 0, 0},
+ {22141, 10627, 4428, 0, 0}, {23406, 18725, 4681, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{25014, 15820, 10626, 0, 0}, {7098, 438, 77, 0, 0},
+ {17105, 3543, 774, 0, 0}, {22890, 9480, 3610, 0, 0},
+ {26349, 15680, 8432, 0, 0}, {28909, 21765, 15729, 0, 0},
+ {5206, 173, 43, 0, 0}, {15193, 2180, 369, 0, 0},
+ {21949, 7930, 2459, 0, 0}, {25644, 14082, 6852, 0, 0},
+ {28289, 20080, 13428, 0, 0}, {4383, 292, 95, 0, 0},
+ {17462, 3763, 830, 0, 0}, {23831, 11153, 4446, 0, 0},
+ {26786, 17165, 9982, 0, 0}, {29148, 22501, 16632, 0, 0},
+ {5488, 304, 101, 0, 0}, {17161, 3608, 764, 0, 0},
+ {23677, 10633, 4028, 0, 0}, {26536, 16136, 8748, 0, 0},
+ {28721, 21391, 15096, 0, 0}, {3548, 138, 50, 0, 0},
+ {13118, 1548, 306, 0, 0}, {19718, 6456, 1941, 0, 0},
+ {23540, 11898, 5300, 0, 0}, {26622, 17619, 10797, 0, 0},
+ {2599, 287, 145, 0, 0}, {15556, 3457, 1214, 0, 0},
+ {22857, 11457, 5886, 0, 0}, {28281, 19454, 12396, 0, 0},
+ {30198, 24996, 19879, 0, 0}, {1844, 155, 60, 0, 0},
+ {13278, 2562, 661, 0, 0}, {21536, 8770, 3492, 0, 0},
+ {25999, 14813, 7733, 0, 0}, {28370, 20145, 13554, 0, 0},
+ {2159, 141, 46, 0, 0}, {13398, 2186, 481, 0, 0},
+ {22311, 9149, 3359, 0, 0}, {26325, 15131, 7934, 0, 0},
+ {28123, 19532, 12662, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24142, 12497, 6552, 0, 0}, {6061, 362, 57, 0, 0},
+ {15769, 2439, 482, 0, 0}, {21323, 7645, 2482, 0, 0},
+ {26357, 13940, 7167, 0, 0}, {25967, 20310, 12520, 0, 0},
+ {2850, 86, 20, 0, 0}, {12119, 1029, 150, 0, 0},
+ {19889, 4995, 1187, 0, 0}, {24872, 11017, 4524, 0, 0},
+ {27508, 17898, 9070, 0, 0}, {3516, 175, 37, 0, 0},
+ {15696, 2308, 474, 0, 0}, {22115, 8625, 3403, 0, 0},
+ {26232, 15278, 8785, 0, 0}, {27839, 19598, 12683, 0, 0},
+ {4631, 250, 53, 0, 0}, {14597, 1984, 361, 0, 0},
+ {21331, 7332, 2309, 0, 0}, {25516, 14234, 6592, 0, 0},
+ {28642, 19415, 11790, 0, 0}, {1606, 42, 20, 0, 0},
+ {9751, 546, 67, 0, 0}, {17139, 3535, 722, 0, 0},
+ {23381, 10147, 3288, 0, 0}, {25846, 15152, 7758, 0, 0},
+ {3930, 503, 154, 0, 0}, {13067, 2562, 848, 0, 0},
+ {21554, 10358, 4835, 0, 0}, {27448, 18591, 9734, 0, 0},
+ {27719, 19887, 14941, 0, 0}, {5284, 297, 34, 0, 0},
+ {11692, 1242, 207, 0, 0}, {20061, 6465, 1557, 0, 0},
+ {24599, 11046, 4549, 0, 0}, {26723, 13362, 5726, 0, 0},
+ {5015, 196, 23, 0, 0}, {11936, 890, 115, 0, 0},
+ {19518, 5412, 1094, 0, 0}, {25050, 11260, 2910, 0, 0},
+ {25559, 14418, 7209, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{24892, 15867, 11027, 0, 0}, {8767, 870, 143, 0, 0},
+ {18239, 4809, 1317, 0, 0}, {24495, 11950, 5510, 0, 0},
+ {27490, 18095, 11258, 0, 0}, {29785, 23925, 18729, 0, 0},
+ {4752, 194, 36, 0, 0}, {15297, 2462, 467, 0, 0},
+ {22544, 8705, 3040, 0, 0}, {26166, 14814, 7716, 0, 0},
+ {28766, 21183, 15009, 0, 0}, {2578, 134, 29, 0, 0},
+ {15271, 2486, 498, 0, 0}, {22539, 9039, 3230, 0, 0},
+ {26424, 15557, 8328, 0, 0}, {28919, 21579, 15660, 0, 0},
+ {4198, 185, 42, 0, 0}, {15247, 2607, 530, 0, 0},
+ {22615, 9203, 3390, 0, 0}, {26313, 15427, 8325, 0, 0},
+ {28861, 21726, 15744, 0, 0}, {2079, 53, 20, 0, 0},
+ {11222, 928, 158, 0, 0}, {19221, 5187, 1309, 0, 0},
+ {23856, 11011, 4459, 0, 0}, {27220, 17688, 10722, 0, 0},
+ {1985, 228, 83, 0, 0}, {15228, 3240, 1100, 0, 0},
+ {22608, 11300, 5985, 0, 0}, {28044, 19375, 12714, 0, 0},
+ {30066, 24594, 19666, 0, 0}, {1120, 82, 26, 0, 0},
+ {11814, 1674, 431, 0, 0}, {20348, 7070, 2589, 0, 0},
+ {25464, 13448, 6520, 0, 0}, {28402, 20507, 13904, 0, 0},
+ {1187, 45, 20, 0, 0}, {11395, 1182, 243, 0, 0},
+ {20024, 6143, 1883, 0, 0}, {25337, 12446, 5818, 0, 0},
+ {28076, 19445, 12657, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24935, 14399, 8673, 0, 0}, {6118, 495, 66, 0, 0},
+ {16397, 2807, 577, 0, 0}, {21713, 8686, 3139, 0, 0},
+ {25876, 14124, 7368, 0, 0}, {27762, 19711, 13528, 0, 0},
+ {2934, 102, 20, 0, 0}, {13191, 1433, 198, 0, 0},
+ {20515, 6259, 1646, 0, 0}, {24777, 11996, 5057, 0, 0},
+ {27091, 16858, 9709, 0, 0}, {2659, 236, 48, 0, 0},
+ {16021, 2602, 516, 0, 0}, {22634, 9226, 3584, 0, 0},
+ {26977, 16592, 9212, 0, 0}, {28406, 22354, 15484, 0, 0},
+ {3276, 142, 20, 0, 0}, {12874, 1366, 243, 0, 0},
+ {19826, 5697, 1899, 0, 0}, {24422, 11552, 5363, 0, 0},
+ {26196, 15681, 8909, 0, 0}, {733, 33, 20, 0, 0},
+ {9811, 930, 150, 0, 0}, {18044, 4196, 996, 0, 0},
+ {22404, 8769, 3215, 0, 0}, {25764, 14335, 7113, 0, 0},
+ {5240, 491, 87, 0, 0}, {15809, 1597, 672, 0, 0},
+ {22282, 9175, 4806, 0, 0}, {24576, 16384, 9557, 0, 0},
+ {23831, 14895, 11916, 0, 0}, {5053, 766, 153, 0, 0},
+ {17695, 3277, 1092, 0, 0}, {21504, 8192, 4096, 0, 0},
+ {30427, 14043, 9362, 0, 0}, {25486, 14564, 7282, 0, 0},
+ {4221, 555, 111, 0, 0}, {11980, 2995, 529, 0, 0},
+ {25988, 11299, 2260, 0, 0}, {26810, 17873, 8937, 0, 0},
+ {16384, 10923, 5461, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{26776, 18464, 13003, 0, 0}, {10156, 1530, 312, 0, 0},
+ {19312, 5606, 1681, 0, 0}, {24767, 12706, 6264, 0, 0},
+ {27600, 18663, 12004, 0, 0}, {30136, 24997, 20383, 0, 0},
+ {5734, 424, 59, 0, 0}, {16918, 3353, 771, 0, 0},
+ {23274, 9992, 3927, 0, 0}, {26617, 15938, 8799, 0, 0},
+ {29307, 22729, 17046, 0, 0}, {2634, 199, 37, 0, 0},
+ {17130, 3346, 823, 0, 0}, {23618, 10903, 4550, 0, 0},
+ {27121, 17049, 10092, 0, 0}, {29366, 22996, 17291, 0, 0},
+ {4238, 182, 33, 0, 0}, {15629, 2470, 476, 0, 0},
+ {22568, 8729, 3083, 0, 0}, {26349, 15094, 7982, 0, 0},
+ {29224, 22543, 16944, 0, 0}, {1435, 42, 20, 0, 0},
+ {12150, 1281, 224, 0, 0}, {19867, 5551, 1536, 0, 0},
+ {24144, 11034, 4597, 0, 0}, {27664, 18577, 12020, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{21562, 11678, 6207, 0, 0}, {4009, 489, 97, 0, 0},
+ {18597, 4816, 1199, 0, 0}, {23025, 9861, 3627, 0, 0},
+ {25897, 14882, 7900, 0, 0}, {27808, 19616, 13453, 0, 0},
+ {1691, 107, 20, 0, 0}, {13368, 1573, 253, 0, 0},
+ {20016, 5910, 1728, 0, 0}, {24398, 10670, 4177, 0, 0},
+ {27311, 17395, 10470, 0, 0}, {1071, 62, 20, 0, 0},
+ {14908, 2111, 435, 0, 0}, {20258, 7956, 3507, 0, 0},
+ {26588, 13644, 8046, 0, 0}, {27727, 19220, 14809, 0, 0},
+ {1216, 52, 20, 0, 0}, {10860, 999, 145, 0, 0},
+ {18298, 4567, 1203, 0, 0}, {23275, 9786, 4160, 0, 0},
+ {25910, 15528, 8631, 0, 0}, {225, 16, 12, 0, 0},
+ {8482, 671, 102, 0, 0}, {16810, 3551, 744, 0, 0},
+ {22561, 8534, 2810, 0, 0}, {25839, 14463, 7116, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}},
+ {{{28631, 21921, 17086, 0, 0}, {14944, 5767, 2710, 0, 0},
+ {22564, 9972, 4477, 0, 0}, {26692, 16833, 10643, 0, 0},
+ {28916, 21831, 15952, 0, 0}, {30516, 26444, 22637, 0, 0},
+ {6928, 752, 106, 0, 0}, {17659, 4500, 1237, 0, 0},
+ {23383, 10537, 4428, 0, 0}, {26686, 16096, 9289, 0, 0},
+ {29450, 23341, 18087, 0, 0}, {2174, 194, 50, 0, 0},
+ {15932, 3216, 909, 0, 0}, {23212, 10226, 4412, 0, 0},
+ {26463, 16043, 9228, 0, 0}, {29392, 22873, 17584, 0, 0},
+ {3385, 151, 23, 0, 0}, {13877, 1959, 367, 0, 0},
+ {21080, 6826, 2081, 0, 0}, {25300, 13299, 6117, 0, 0},
+ {28859, 21410, 15756, 0, 0}, {1204, 32, 20, 0, 0},
+ {11862, 1157, 168, 0, 0}, {19577, 5147, 1231, 0, 0},
+ {24000, 10739, 4092, 0, 0}, {27689, 18659, 11862, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}}};
+
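+// Default CDFs for the coeff_base_range symbol, indexed by quantizer context,
+// square transform size, plane type, and coeff_base_range context; the
+// inverted layout and trailing adaptation-counter slot match the tables
+// above.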
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseRangeCdf
+ [kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseRangeContexts][kCoeffBaseRangeSymbolCount + 1] = {
+ {{{{18470, 12050, 8594, 0, 0}, {20232, 13167, 8979, 0, 0},
+ {24056, 17717, 13265, 0, 0}, {26598, 21441, 17334, 0, 0},
+ {28026, 23842, 20230, 0, 0}, {28965, 25451, 22222, 0, 0},
+ {31072, 29451, 27897, 0, 0}, {18376, 12817, 10012, 0, 0},
+ {16790, 9550, 5950, 0, 0}, {20581, 13294, 8879, 0, 0},
+ {23592, 17128, 12509, 0, 0}, {25700, 20113, 15740, 0, 0},
+ {27112, 22326, 18296, 0, 0}, {30188, 27776, 25524, 0, 0},
+ {20632, 14719, 11342, 0, 0}, {18984, 12047, 8287, 0, 0},
+ {21932, 15147, 10868, 0, 0}, {24396, 18324, 13921, 0, 0},
+ {26245, 20989, 16768, 0, 0}, {27431, 22870, 19008, 0, 0},
+ {29734, 26908, 24306, 0, 0}},
+ {{16801, 9863, 6482, 0, 0}, {19234, 12114, 8189, 0, 0},
+ {23264, 16676, 12233, 0, 0}, {25793, 20200, 15865, 0, 0},
+ {27404, 22677, 18748, 0, 0}, {28411, 24398, 20911, 0, 0},
+ {30262, 27834, 25550, 0, 0}, {9736, 3953, 1832, 0, 0},
+ {13228, 6064, 3049, 0, 0}, {17610, 9799, 5671, 0, 0},
+ {21360, 13903, 9118, 0, 0}, {23883, 17320, 12518, 0, 0},
+ {25660, 19915, 15352, 0, 0}, {28537, 24727, 21288, 0, 0},
+ {12945, 6278, 3612, 0, 0}, {13878, 6839, 3836, 0, 0},
+ {17108, 9277, 5335, 0, 0}, {20621, 12992, 8280, 0, 0},
+ {23040, 15994, 11119, 0, 0}, {24849, 18491, 13702, 0, 0},
+ {27328, 22598, 18583, 0, 0}}},
+ {{{18362, 11906, 8354, 0, 0}, {20944, 13861, 9659, 0, 0},
+ {24511, 18375, 13965, 0, 0}, {26908, 22021, 17990, 0, 0},
+ {28293, 24282, 20784, 0, 0}, {29162, 25814, 22725, 0, 0},
+ {31032, 29358, 27720, 0, 0}, {18338, 12722, 9886, 0, 0},
+ {17175, 9869, 6059, 0, 0}, {20666, 13400, 8957, 0, 0},
+ {23709, 17184, 12506, 0, 0}, {25769, 20165, 15720, 0, 0},
+ {27084, 22271, 18215, 0, 0}, {29946, 27330, 24906, 0, 0},
+ {16983, 11183, 8409, 0, 0}, {14421, 7539, 4502, 0, 0},
+ {17794, 10281, 6379, 0, 0}, {21345, 14087, 9497, 0, 0},
+ {23905, 17418, 12760, 0, 0}, {25615, 19916, 15490, 0, 0},
+ {29061, 25732, 22786, 0, 0}},
+ {{17308, 11072, 7299, 0, 0}, {20598, 13519, 9577, 0, 0},
+ {24045, 17741, 13436, 0, 0}, {26340, 21064, 16894, 0, 0},
+ {27846, 23476, 19716, 0, 0}, {28629, 25073, 21758, 0, 0},
+ {30477, 28260, 26170, 0, 0}, {12912, 5848, 2940, 0, 0},
+ {14845, 7479, 3976, 0, 0}, {18490, 10800, 6471, 0, 0},
+ {21858, 14632, 9818, 0, 0}, {24345, 17953, 13141, 0, 0},
+ {25997, 20485, 15994, 0, 0}, {28694, 25018, 21687, 0, 0},
+ {12916, 6694, 4096, 0, 0}, {13397, 6658, 3779, 0, 0},
+ {16503, 8895, 5105, 0, 0}, {20010, 12390, 7816, 0, 0},
+ {22673, 15670, 10807, 0, 0}, {24518, 18140, 13317, 0, 0},
+ {27563, 23023, 19146, 0, 0}}},
+ {{{22205, 16535, 13005, 0, 0}, {22974, 16746, 12964, 0, 0},
+ {26018, 20823, 17009, 0, 0}, {27805, 23582, 20016, 0, 0},
+ {28923, 25333, 22141, 0, 0}, {29717, 26683, 23934, 0, 0},
+ {31457, 30172, 28938, 0, 0}, {21522, 16364, 13079, 0, 0},
+ {20453, 13857, 10037, 0, 0}, {22211, 15673, 11479, 0, 0},
+ {24632, 18762, 14519, 0, 0}, {26420, 21294, 17203, 0, 0},
+ {27572, 23113, 19368, 0, 0}, {30419, 28242, 26181, 0, 0},
+ {19431, 14038, 11199, 0, 0}, {13462, 6697, 3886, 0, 0},
+ {16816, 9228, 5514, 0, 0}, {20359, 12834, 8338, 0, 0},
+ {23008, 16062, 11379, 0, 0}, {24764, 18548, 13950, 0, 0},
+ {28630, 24974, 21807, 0, 0}},
+ {{21898, 16084, 11819, 0, 0}, {23104, 17538, 14088, 0, 0},
+ {25882, 20659, 17360, 0, 0}, {27943, 23868, 20463, 0, 0},
+ {29138, 25606, 22454, 0, 0}, {29732, 26339, 23381, 0, 0},
+ {31097, 29472, 27828, 0, 0}, {18949, 13609, 9742, 0, 0},
+ {20784, 13660, 9648, 0, 0}, {22078, 15558, 11105, 0, 0},
+ {24784, 18614, 14435, 0, 0}, {25900, 20474, 16644, 0, 0},
+ {27494, 23774, 19900, 0, 0}, {29780, 26997, 24344, 0, 0},
+ {13032, 6121, 3627, 0, 0}, {13835, 6698, 3784, 0, 0},
+ {16989, 9720, 5568, 0, 0}, {20130, 12707, 8236, 0, 0},
+ {22076, 15223, 10548, 0, 0}, {23551, 17517, 12714, 0, 0},
+ {27690, 23484, 20174, 0, 0}}},
+ {{{30437, 29106, 27524, 0, 0}, {29877, 27997, 26623, 0, 0},
+ {28170, 25145, 23039, 0, 0}, {29248, 25923, 23569, 0, 0},
+ {29351, 26649, 23444, 0, 0}, {30167, 27356, 25383, 0, 0},
+ {32168, 31595, 31024, 0, 0}, {25096, 19482, 15299, 0, 0},
+ {28536, 24976, 21975, 0, 0}, {29853, 27451, 25371, 0, 0},
+ {30450, 28412, 26616, 0, 0}, {30641, 28768, 27214, 0, 0},
+ {30918, 29290, 27493, 0, 0}, {31791, 30835, 29925, 0, 0},
+ {14488, 8381, 4779, 0, 0}, {16916, 10097, 6583, 0, 0},
+ {18923, 11817, 7979, 0, 0}, {21713, 14802, 10639, 0, 0},
+ {23630, 17346, 12967, 0, 0}, {25314, 19623, 15312, 0, 0},
+ {29398, 26375, 23755, 0, 0}},
+ {{26926, 23539, 21930, 0, 0}, {30455, 29277, 28492, 0, 0},
+ {29770, 26664, 25272, 0, 0}, {30348, 25321, 22900, 0, 0},
+ {29734, 24273, 21845, 0, 0}, {28692, 23831, 21793, 0, 0},
+ {31682, 30398, 29469, 0, 0}, {23054, 15514, 12324, 0, 0},
+ {24225, 19070, 15645, 0, 0}, {27850, 23761, 20858, 0, 0},
+ {28639, 25236, 22215, 0, 0}, {30404, 27235, 24710, 0, 0},
+ {30934, 29222, 27205, 0, 0}, {31295, 29860, 28635, 0, 0},
+ {17363, 11575, 7149, 0, 0}, {17077, 10816, 6207, 0, 0},
+ {19806, 13574, 8603, 0, 0}, {22496, 14913, 10639, 0, 0},
+ {24180, 17498, 12050, 0, 0}, {24086, 18099, 13268, 0, 0},
+ {27898, 23132, 19563, 0, 0}}},
+ {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}}}},
+ {{{{17773, 11427, 8019, 0, 0}, {19610, 12479, 8167, 0, 0},
+ {23827, 17442, 12892, 0, 0}, {26471, 21227, 16961, 0, 0},
+ {27951, 23739, 19992, 0, 0}, {29037, 25495, 22141, 0, 0},
+ {30921, 29151, 27414, 0, 0}, {18296, 13109, 10425, 0, 0},
+ {15962, 8606, 5235, 0, 0}, {19868, 12364, 8055, 0, 0},
+ {23357, 16656, 11971, 0, 0}, {25712, 20071, 15620, 0, 0},
+ {27224, 22429, 18308, 0, 0}, {29814, 27064, 24449, 0, 0},
+ {20304, 14697, 11414, 0, 0}, {17286, 10240, 6734, 0, 0},
+ {20698, 13499, 9144, 0, 0}, {23815, 17362, 12662, 0, 0},
+ {25741, 20038, 15548, 0, 0}, {26881, 21855, 17628, 0, 0},
+ {28975, 25490, 22321, 0, 0}},
+ {{17197, 10536, 7019, 0, 0}, {18262, 11193, 7394, 0, 0},
+ {22579, 15679, 11199, 0, 0}, {25452, 19467, 14853, 0, 0},
+ {26985, 21856, 17578, 0, 0}, {28008, 23613, 19680, 0, 0},
+ {29775, 26802, 23994, 0, 0}, {9344, 3865, 1990, 0, 0},
+ {11993, 5102, 2478, 0, 0}, {16294, 8358, 4469, 0, 0},
+ {20297, 12588, 7781, 0, 0}, {23358, 16281, 11329, 0, 0},
+ {25232, 19154, 14239, 0, 0}, {27720, 23182, 19219, 0, 0},
+ {11678, 5478, 3012, 0, 0}, {11972, 5366, 2742, 0, 0},
+ {14949, 7283, 3799, 0, 0}, {18908, 10859, 6306, 0, 0},
+ {21766, 14274, 9239, 0, 0}, {23815, 16839, 11871, 0, 0},
+ {26320, 20850, 16314, 0, 0}}},
+ {{{16769, 10560, 7319, 0, 0}, {19718, 12780, 8646, 0, 0},
+ {24174, 17904, 13390, 0, 0}, {26735, 21689, 17530, 0, 0},
+ {28214, 24085, 20421, 0, 0}, {29096, 25629, 22431, 0, 0},
+ {30868, 28997, 27192, 0, 0}, {16980, 11428, 8819, 0, 0},
+ {15943, 8533, 5010, 0, 0}, {19895, 12366, 7958, 0, 0},
+ {23178, 16405, 11674, 0, 0}, {25416, 19559, 15035, 0, 0},
+ {26808, 21779, 17584, 0, 0}, {29536, 26534, 23761, 0, 0},
+ {17007, 12052, 9544, 0, 0}, {13450, 6779, 4009, 0, 0},
+ {17239, 9674, 5839, 0, 0}, {21106, 13779, 9127, 0, 0},
+ {23813, 17200, 12402, 0, 0}, {25487, 19662, 15060, 0, 0},
+ {28520, 24709, 21328, 0, 0}},
+ {{17869, 11551, 8265, 0, 0}, {19249, 12485, 8721, 0, 0},
+ {23339, 16802, 12403, 0, 0}, {26068, 20413, 16116, 0, 0},
+ {27680, 23064, 19052, 0, 0}, {28525, 24614, 21037, 0, 0},
+ {30066, 27404, 24907, 0, 0}, {10023, 4380, 2314, 0, 0},
+ {12533, 5622, 2846, 0, 0}, {16872, 9053, 5131, 0, 0},
+ {20928, 13418, 8637, 0, 0}, {23646, 16836, 11888, 0, 0},
+ {25280, 19187, 14406, 0, 0}, {27654, 23200, 19398, 0, 0},
+ {11923, 6215, 3836, 0, 0}, {11787, 5396, 2884, 0, 0},
+ {14987, 7433, 3983, 0, 0}, {19008, 11060, 6471, 0, 0},
+ {21793, 14353, 9403, 0, 0}, {23723, 16979, 12082, 0, 0},
+ {26638, 21569, 17345, 0, 0}}},
+ {{{19219, 13044, 9610, 0, 0}, {20924, 14386, 10522, 0, 0},
+ {24849, 19149, 14995, 0, 0}, {27282, 22625, 18822, 0, 0},
+ {28602, 24785, 21444, 0, 0}, {29404, 26262, 23341, 0, 0},
+ {31170, 29608, 28094, 0, 0}, {17487, 11789, 8987, 0, 0},
+ {17829, 10649, 6816, 0, 0}, {21405, 14361, 9956, 0, 0},
+ {24159, 17911, 13398, 0, 0}, {26031, 20584, 16288, 0, 0},
+ {27262, 22505, 18506, 0, 0}, {29778, 26982, 24388, 0, 0},
+ {12519, 7515, 5351, 0, 0}, {11698, 5250, 2767, 0, 0},
+ {15914, 8299, 4694, 0, 0}, {19904, 12282, 7768, 0, 0},
+ {22806, 15790, 10990, 0, 0}, {24694, 18430, 13720, 0, 0},
+ {28274, 24289, 20862, 0, 0}},
+ {{18808, 13151, 9939, 0, 0}, {21618, 15427, 11540, 0, 0},
+ {25618, 19804, 15578, 0, 0}, {27437, 22766, 18901, 0, 0},
+ {28601, 25024, 21711, 0, 0}, {29288, 26139, 23122, 0, 0},
+ {30885, 28984, 27082, 0, 0}, {14016, 7108, 3856, 0, 0},
+ {15800, 8182, 4738, 0, 0}, {19248, 11713, 7455, 0, 0},
+ {22315, 15142, 10488, 0, 0}, {24382, 18263, 13652, 0, 0},
+ {26026, 20173, 15760, 0, 0}, {28495, 24628, 21269, 0, 0},
+ {10648, 4941, 2535, 0, 0}, {12205, 5410, 2873, 0, 0},
+ {15692, 8124, 4615, 0, 0}, {19406, 11826, 7459, 0, 0},
+ {21974, 14803, 10073, 0, 0}, {23754, 17116, 12449, 0, 0},
+ {27060, 22256, 18271, 0, 0}}},
+ {{{27063, 21838, 17043, 0, 0}, {24822, 20003, 16653, 0, 0},
+ {25967, 20645, 16542, 0, 0}, {27306, 22633, 18568, 0, 0},
+ {28579, 24757, 21261, 0, 0}, {29577, 26539, 23360, 0, 0},
+ {31711, 30631, 29556, 0, 0}, {22750, 15701, 11277, 0, 0},
+ {25388, 20186, 16315, 0, 0}, {26700, 21923, 18429, 0, 0},
+ {27670, 23570, 20213, 0, 0}, {28456, 24758, 21649, 0, 0},
+ {29068, 25802, 22987, 0, 0}, {31075, 29442, 27881, 0, 0},
+ {14011, 7838, 4994, 0, 0}, {15120, 8172, 4951, 0, 0},
+ {18061, 10716, 6742, 0, 0}, {21048, 13916, 9476, 0, 0},
+ {23411, 16816, 12243, 0, 0}, {24958, 19015, 14558, 0, 0},
+ {28889, 25435, 22440, 0, 0}},
+ {{24490, 19526, 16846, 0, 0}, {22221, 16901, 13849, 0, 0},
+ {23662, 16926, 12159, 0, 0}, {25935, 19761, 15550, 0, 0},
+ {27957, 23056, 18845, 0, 0}, {28783, 25416, 21640, 0, 0},
+ {31080, 29310, 27506, 0, 0}, {19817, 10907, 6258, 0, 0},
+ {22980, 16724, 12492, 0, 0}, {26459, 21524, 17898, 0, 0},
+ {27585, 23419, 20202, 0, 0}, {28379, 24539, 21276, 0, 0},
+ {29135, 25823, 22148, 0, 0}, {29168, 25921, 22861, 0, 0},
+ {11020, 4631, 2513, 0, 0}, {13332, 6187, 3208, 0, 0},
+ {16409, 8567, 4815, 0, 0}, {18807, 11075, 6897, 0, 0},
+ {21224, 14082, 9446, 0, 0}, {23396, 16306, 11816, 0, 0},
+ {26630, 21558, 17378, 0, 0}}},
+ {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}}}},
+ {{{{16630, 10545, 7259, 0, 0}, {17421, 10338, 6436, 0, 0},
+ {23154, 16032, 11436, 0, 0}, {26168, 20493, 15861, 0, 0},
+ {27957, 23344, 19221, 0, 0}, {29020, 24959, 21348, 0, 0},
+ {30514, 28181, 25878, 0, 0}, {17572, 12484, 9591, 0, 0},
+ {14451, 7299, 4317, 0, 0}, {18850, 11117, 6926, 0, 0},
+ {22716, 15618, 10773, 0, 0}, {25269, 19138, 14181, 0, 0},
+ {26610, 21351, 16765, 0, 0}, {28754, 24983, 21516, 0, 0},
+ {17720, 11701, 8384, 0, 0}, {14566, 7422, 4215, 0, 0},
+ {18466, 10749, 6412, 0, 0}, {21929, 14629, 9602, 0, 0},
+ {24053, 17024, 11962, 0, 0}, {25232, 19192, 14224, 0, 0},
+ {27355, 22433, 18270, 0, 0}},
+ {{15374, 8267, 4873, 0, 0}, {16879, 9348, 5583, 0, 0},
+ {21207, 13635, 8898, 0, 0}, {24483, 17956, 12924, 0, 0},
+ {26272, 20725, 16218, 0, 0}, {27997, 23194, 19091, 0, 0},
+ {29165, 25938, 22624, 0, 0}, {11112, 5064, 2568, 0, 0},
+ {11444, 4853, 2257, 0, 0}, {15441, 7432, 3771, 0, 0},
+ {19351, 11387, 6735, 0, 0}, {22636, 15343, 10430, 0, 0},
+ {24188, 17752, 13135, 0, 0}, {27074, 21291, 16357, 0, 0},
+ {8652, 2988, 1318, 0, 0}, {8915, 3073, 1177, 0, 0},
+ {12683, 5154, 2340, 0, 0}, {17442, 8433, 4193, 0, 0},
+ {20954, 13296, 7958, 0, 0}, {22547, 14157, 8001, 0, 0},
+ {25079, 18210, 12447, 0, 0}}},
+ {{{16554, 10388, 6998, 0, 0}, {18555, 11464, 7473, 0, 0},
+ {23555, 16945, 12313, 0, 0}, {26373, 21010, 16629, 0, 0},
+ {27989, 23581, 19702, 0, 0}, {28947, 25267, 21815, 0, 0},
+ {30475, 28201, 25973, 0, 0}, {16909, 11485, 8948, 0, 0},
+ {14364, 7166, 4042, 0, 0}, {18443, 10788, 6562, 0, 0},
+ {22099, 14831, 10048, 0, 0}, {24471, 18126, 13321, 0, 0},
+ {26022, 20379, 15875, 0, 0}, {28444, 24517, 20998, 0, 0},
+ {16236, 11137, 8293, 0, 0}, {12101, 5618, 3100, 0, 0},
+ {16040, 8258, 4593, 0, 0}, {19907, 12123, 7436, 0, 0},
+ {22692, 15407, 10351, 0, 0}, {24373, 17828, 12805, 0, 0},
+ {27037, 22085, 17856, 0, 0}},
+ {{18335, 11613, 7830, 0, 0}, {18110, 11052, 7223, 0, 0},
+ {22845, 15944, 11211, 0, 0}, {25786, 19716, 15047, 0, 0},
+ {27349, 22265, 17718, 0, 0}, {27916, 23606, 19754, 0, 0},
+ {29497, 26373, 23138, 0, 0}, {10558, 4935, 2659, 0, 0},
+ {12018, 5400, 2947, 0, 0}, {15874, 7940, 4195, 0, 0},
+ {19521, 11492, 7011, 0, 0}, {22730, 15503, 10205, 0, 0},
+ {24181, 17821, 12441, 0, 0}, {27123, 21397, 17516, 0, 0},
+ {10741, 5242, 3054, 0, 0}, {9670, 3622, 1547, 0, 0},
+ {12882, 5427, 2496, 0, 0}, {17159, 9021, 4722, 0, 0},
+ {20775, 12703, 7829, 0, 0}, {23131, 14501, 9097, 0, 0},
+ {25143, 18967, 13624, 0, 0}}},
+ {{{18330, 11970, 8679, 0, 0}, {20147, 13565, 9671, 0, 0},
+ {24591, 18643, 14366, 0, 0}, {27094, 22267, 18312, 0, 0},
+ {28532, 24529, 21035, 0, 0}, {29321, 26018, 22962, 0, 0},
+ {30782, 28818, 26904, 0, 0}, {16560, 10669, 7838, 0, 0},
+ {16231, 8743, 5183, 0, 0}, {19988, 12387, 7901, 0, 0},
+ {23001, 16156, 11352, 0, 0}, {25082, 19030, 14370, 0, 0},
+ {26435, 21154, 16804, 0, 0}, {28827, 25197, 21932, 0, 0},
+ {9949, 5346, 3566, 0, 0}, {10544, 4254, 2047, 0, 0},
+ {15108, 7335, 3855, 0, 0}, {19194, 11286, 6766, 0, 0},
+ {22139, 14791, 9830, 0, 0}, {24156, 17470, 12503, 0, 0},
+ {27161, 22277, 18172, 0, 0}},
+ {{19199, 12968, 9562, 0, 0}, {19640, 12844, 8899, 0, 0},
+ {24439, 17927, 13365, 0, 0}, {26638, 21792, 17711, 0, 0},
+ {28086, 23929, 20250, 0, 0}, {29112, 25359, 22180, 0, 0},
+ {30191, 27669, 25356, 0, 0}, {10341, 4084, 2183, 0, 0},
+ {11855, 5018, 2629, 0, 0}, {16928, 8659, 4934, 0, 0},
+ {20460, 12739, 8199, 0, 0}, {22552, 15983, 11310, 0, 0},
+ {24459, 18565, 13655, 0, 0}, {26725, 21600, 17461, 0, 0},
+ {9602, 3867, 1770, 0, 0}, {10869, 4363, 2017, 0, 0},
+ {14355, 6677, 3325, 0, 0}, {17535, 9654, 5416, 0, 0},
+ {20085, 12296, 7480, 0, 0}, {22066, 14509, 9359, 0, 0},
+ {24643, 18304, 13542, 0, 0}}},
+ {{{23728, 17982, 14408, 0, 0}, {22789, 17050, 13353, 0, 0},
+ {24855, 18850, 14457, 0, 0}, {26909, 21879, 17584, 0, 0},
+ {28175, 24091, 20258, 0, 0}, {28948, 25372, 21977, 0, 0},
+ {31038, 29297, 27576, 0, 0}, {20965, 14403, 10059, 0, 0},
+ {21349, 14710, 10543, 0, 0}, {23350, 16994, 12525, 0, 0},
+ {25229, 19443, 15111, 0, 0}, {26535, 21451, 17384, 0, 0},
+ {27631, 23112, 19223, 0, 0}, {29791, 26994, 24419, 0, 0},
+ {11561, 5522, 3128, 0, 0}, {13221, 6190, 3271, 0, 0},
+ {16599, 8897, 5078, 0, 0}, {19948, 12310, 7750, 0, 0},
+ {22544, 15436, 10554, 0, 0}, {24242, 17720, 12884, 0, 0},
+ {27731, 23358, 19650, 0, 0}},
+ {{20429, 15439, 12628, 0, 0}, {19263, 12873, 9543, 0, 0},
+ {22921, 15824, 11204, 0, 0}, {25488, 19512, 14420, 0, 0},
+ {28056, 22759, 18314, 0, 0}, {28407, 24854, 20291, 0, 0},
+ {29898, 27140, 24773, 0, 0}, {12707, 7264, 4242, 0, 0},
+ {17533, 9890, 6623, 0, 0}, {19783, 12810, 8613, 0, 0},
+ {22986, 16127, 11365, 0, 0}, {23312, 16408, 12008, 0, 0},
+ {25913, 19828, 14211, 0, 0}, {27107, 22204, 17766, 0, 0},
+ {7112, 2166, 874, 0, 0}, {10198, 3661, 1676, 0, 0},
+ {13851, 6345, 3227, 0, 0}, {16828, 9119, 5014, 0, 0},
+ {19965, 12187, 7549, 0, 0}, {21686, 14073, 9392, 0, 0},
+ {24829, 18395, 13763, 0, 0}}},
+ {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}}}},
+ {{{{14453, 8479, 5217, 0, 0}, {15914, 8700, 4933, 0, 0},
+ {22628, 14841, 9595, 0, 0}, {26046, 19786, 14501, 0, 0},
+ {28107, 22942, 18062, 0, 0}, {28936, 24603, 20474, 0, 0},
+ {29973, 26670, 23523, 0, 0}, {15623, 9442, 6096, 0, 0},
+ {12035, 5088, 2460, 0, 0}, {16736, 8307, 4222, 0, 0},
+ {21115, 12675, 7687, 0, 0}, {23478, 16339, 10682, 0, 0},
+ {24972, 18170, 12786, 0, 0}, {26266, 20390, 15327, 0, 0},
+ {11087, 5036, 2448, 0, 0}, {10379, 3724, 1507, 0, 0},
+ {13741, 6037, 2681, 0, 0}, {18029, 9013, 4144, 0, 0},
+ {21410, 11990, 7257, 0, 0}, {21773, 14695, 8578, 0, 0},
+ {23606, 17778, 12151, 0, 0}},
+ {{11343, 4816, 2380, 0, 0}, {14706, 6930, 3734, 0, 0},
+ {20812, 12887, 7960, 0, 0}, {25050, 17768, 11788, 0, 0},
+ {27066, 21514, 16625, 0, 0}, {27870, 23680, 15904, 0, 0},
+ {29089, 25992, 20861, 0, 0}, {9474, 2608, 1105, 0, 0},
+ {8371, 2872, 932, 0, 0}, {13523, 5640, 2175, 0, 0},
+ {19566, 12943, 6364, 0, 0}, {21190, 13471, 8811, 0, 0},
+ {24695, 19471, 11398, 0, 0}, {27307, 21845, 13023, 0, 0},
+ {5401, 2247, 834, 0, 0}, {7864, 2097, 828, 0, 0},
+ {9693, 4308, 1469, 0, 0}, {18368, 9110, 2351, 0, 0},
+ {18883, 8886, 4443, 0, 0}, {18022, 9830, 4915, 0, 0},
+ {27307, 16384, 5461, 0, 0}}},
+ {{{14494, 7955, 4878, 0, 0}, {17231, 9619, 5765, 0, 0},
+ {23319, 16028, 10941, 0, 0}, {26068, 20270, 15507, 0, 0},
+ {27780, 22902, 18570, 0, 0}, {28532, 24621, 20866, 0, 0},
+ {29901, 26908, 24114, 0, 0}, {15644, 9597, 6667, 0, 0},
+ {12372, 5291, 2620, 0, 0}, {16195, 8139, 4276, 0, 0},
+ {20019, 11922, 7094, 0, 0}, {22535, 14890, 9950, 0, 0},
+ {24243, 17436, 12405, 0, 0}, {26485, 21136, 16513, 0, 0},
+ {12302, 6257, 3482, 0, 0}, {9709, 3594, 1577, 0, 0},
+ {13287, 5505, 2527, 0, 0}, {17310, 9137, 4631, 0, 0},
+ {20352, 12160, 7075, 0, 0}, {22507, 14757, 9507, 0, 0},
+ {24752, 18113, 13102, 0, 0}},
+ {{15152, 8182, 4656, 0, 0}, {16959, 9469, 5613, 0, 0},
+ {22001, 13878, 8975, 0, 0}, {25041, 18513, 13903, 0, 0},
+ {26639, 20842, 15886, 0, 0}, {28286, 23064, 17907, 0, 0},
+ {29491, 25316, 21246, 0, 0}, {9812, 4217, 2038, 0, 0},
+ {10044, 3831, 1807, 0, 0}, {14301, 6444, 3188, 0, 0},
+ {19534, 12055, 7119, 0, 0}, {21587, 15176, 10287, 0, 0},
+ {24477, 14410, 8192, 0, 0}, {25200, 20887, 17784, 0, 0},
+ {7820, 3767, 1621, 0, 0}, {7094, 2149, 617, 0, 0},
+ {11927, 5975, 3165, 0, 0}, {18099, 8412, 4102, 0, 0},
+ {21434, 9175, 4549, 0, 0}, {23846, 18006, 9895, 0, 0},
+ {24467, 19224, 12233, 0, 0}}},
+ {{{15655, 9035, 5687, 0, 0}, {18629, 11362, 7316, 0, 0},
+ {24216, 17766, 12992, 0, 0}, {26897, 21648, 17390, 0, 0},
+ {28313, 24152, 20515, 0, 0}, {29299, 25858, 22382, 0, 0},
+ {30513, 28215, 25986, 0, 0}, {14544, 8392, 5715, 0, 0},
+ {13478, 6058, 3154, 0, 0}, {17832, 9777, 5584, 0, 0},
+ {21530, 13817, 9006, 0, 0}, {23982, 17151, 12180, 0, 0},
+ {25451, 19540, 14765, 0, 0}, {27667, 23256, 19275, 0, 0},
+ {10129, 4546, 2558, 0, 0}, {9552, 3437, 1461, 0, 0},
+ {13693, 6006, 2873, 0, 0}, {17754, 9655, 5311, 0, 0},
+ {20830, 12911, 8016, 0, 0}, {22826, 15488, 10486, 0, 0},
+ {25601, 19624, 15016, 0, 0}},
+ {{16948, 10030, 6280, 0, 0}, {19238, 11883, 7552, 0, 0},
+ {24373, 17238, 12316, 0, 0}, {26194, 20447, 16388, 0, 0},
+ {27415, 22349, 18200, 0, 0}, {28155, 24322, 20387, 0, 0},
+ {29328, 25610, 22865, 0, 0}, {8521, 3717, 1544, 0, 0},
+ {10650, 4710, 2399, 0, 0}, {16270, 8000, 4379, 0, 0},
+ {19848, 11593, 6631, 0, 0}, {22038, 14149, 7416, 0, 0},
+ {22581, 16489, 9977, 0, 0}, {23458, 18137, 10641, 0, 0},
+ {7798, 2210, 711, 0, 0}, {7967, 2826, 1070, 0, 0},
+ {10336, 4315, 1913, 0, 0}, {13714, 7088, 3188, 0, 0},
+ {18376, 9732, 4659, 0, 0}, {20273, 11821, 6118, 0, 0},
+ {20326, 12442, 6554, 0, 0}}},
+ {{{20606, 13983, 10120, 0, 0}, {20019, 13071, 8962, 0, 0},
+ {24188, 17471, 12422, 0, 0}, {26599, 21019, 16225, 0, 0},
+ {27932, 23377, 19320, 0, 0}, {28947, 25057, 21155, 0, 0},
+ {30540, 28167, 25698, 0, 0}, {16449, 8043, 4488, 0, 0},
+ {17070, 9491, 5600, 0, 0}, {20042, 12400, 7721, 0, 0},
+ {22856, 15753, 10792, 0, 0}, {24880, 18548, 13589, 0, 0},
+ {25991, 20484, 15750, 0, 0}, {28276, 24178, 20516, 0, 0},
+ {9519, 3864, 1821, 0, 0}, {11718, 4860, 2256, 0, 0},
+ {15328, 7428, 3819, 0, 0}, {18709, 10750, 6227, 0, 0},
+ {21480, 13865, 8870, 0, 0}, {23357, 16426, 11340, 0, 0},
+ {26490, 21180, 16824, 0, 0}},
+ {{18787, 12701, 9542, 0, 0}, {15846, 9188, 5985, 0, 0},
+ {21763, 13729, 8281, 0, 0}, {25379, 18550, 12970, 0, 0},
+ {27170, 21263, 15562, 0, 0}, {26678, 21555, 17109, 0, 0},
+ {28948, 25397, 22649, 0, 0}, {11686, 5843, 3093, 0, 0},
+ {11506, 4141, 1640, 0, 0}, {14376, 6314, 2331, 0, 0},
+ {17898, 9858, 5672, 0, 0}, {20148, 13284, 7860, 0, 0},
+ {23478, 16215, 9966, 0, 0}, {26100, 18480, 12764, 0, 0},
+ {5064, 1713, 819, 0, 0}, {8059, 2790, 980, 0, 0},
+ {11100, 3504, 1111, 0, 0}, {14473, 5800, 2694, 0, 0},
+ {16369, 8346, 3455, 0, 0}, {18421, 9742, 4664, 0, 0},
+ {20398, 12962, 8291, 0, 0}}},
+ {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}},
+ {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
+ {24576, 16384, 8192, 0, 0}}}}};
+
+/* clang-format off */
+alignas(kMaxAlignment) constexpr uint16_t kDefaultDcSignCdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kDcSignContexts][kBooleanFieldCdfSize] = {
+ {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+ {15488, 0, 0}}},
+ {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+ {15488, 0, 0}}},
+ {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+ {15488, 0, 0}}},
+ {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
+ {15488, 0, 0}}}
+};
+/* clang-format on */
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultRestorationTypeCdf[kRestorationTypeSymbolCount + 1] = {23355, 10187,
+ 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseWienerCdf[kBooleanFieldCdfSize] = {21198, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseSgrProjCdf[kBooleanFieldCdfSize] = {15913, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultHasPaletteYCdf[kPaletteBlockSizeContexts][kPaletteYModeContexts]
+ [kBooleanFieldCdfSize] = {
+ {{1092, 0, 0}, {29349, 0, 0}, {31507, 0, 0}},
+ {{856, 0, 0}, {29909, 0, 0}, {31788, 0, 0}},
+ {{945, 0, 0}, {29368, 0, 0}, {31987, 0, 0}},
+ {{738, 0, 0}, {29207, 0, 0}, {31864, 0, 0}},
+ {{459, 0, 0}, {25431, 0, 0}, {31306, 0, 0}},
+ {{503, 0, 0}, {28753, 0, 0}, {31247, 0, 0}},
+ {{318, 0, 0}, {24822, 0, 0}, {32639, 0, 0}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultPaletteYSizeCdf[kPaletteBlockSizeContexts]
+ [kPaletteSizeSymbolCount + 1] = {
+ {24816, 19768, 14619, 11290, 7241, 3527, 0, 0},
+ {25629, 21347, 16573, 13224, 9102, 4695, 0, 0},
+ {24980, 20027, 15443, 12268, 8453, 4238, 0, 0},
+ {24497, 18704, 14522, 11204, 7697, 4235, 0, 0},
+ {20043, 13588, 10905, 7929, 5233, 2648, 0, 0},
+ {23057, 17880, 15845, 11716, 7107, 4893, 0, 0},
+ {17828, 11971, 11090, 8582, 5735, 3769, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultHasPaletteUVCdf[kPaletteUVModeContexts][kBooleanFieldCdfSize] = {
+ {307, 0, 0}, {11280, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultPaletteUVSizeCdf[kPaletteBlockSizeContexts]
+ [kPaletteSizeSymbolCount + 1] = {
+ {24055, 12789, 5640, 3159, 1437, 496, 0, 0},
+ {26929, 17195, 9187, 5821, 2920, 1068, 0, 0},
+ {28342, 21508, 14769, 11285, 6905, 3338, 0, 0},
+ {29540, 23304, 17775, 14679, 10245, 5348, 0, 0},
+ {29000, 23882, 19677, 14916, 10273, 5561, 0, 0},
+ {30304, 24317, 19907, 11136, 7243, 4213, 0, 0},
+ {31499, 27333, 22335, 13805, 11068, 6903, 0,
+ 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultPaletteColorIndexCdf
+ [kNumPlaneTypes][kPaletteSizeSymbolCount][kPaletteColorIndexContexts]
+ [kPaletteColorIndexSymbolCount + 1] = {
+ {{{4058, 0, 0},
+ {16384, 0, 0},
+ {22215, 0, 0},
+ {5732, 0, 0},
+ {1165, 0, 0}},
+ {{4891, 2278, 0, 0},
+ {21236, 7071, 0, 0},
+ {26224, 2534, 0, 0},
+ {9750, 4696, 0, 0},
+ {853, 383, 0, 0}},
+ {{7196, 4722, 2723, 0, 0},
+ {23290, 11178, 5512, 0, 0},
+ {25520, 5931, 2944, 0, 0},
+ {13601, 8282, 4419, 0, 0},
+ {1368, 943, 518, 0, 0}},
+ {{7989, 5813, 4192, 2486, 0, 0},
+ {24099, 12404, 8695, 4675, 0, 0},
+ {28513, 5203, 3391, 1701, 0, 0},
+ {12904, 9094, 6052, 3238, 0, 0},
+ {1122, 875, 621, 342, 0, 0}},
+ {{9636, 7361, 5798, 4333, 2695, 0, 0},
+ {25325, 15526, 12051, 8006, 4786, 0, 0},
+ {26468, 7906, 5824, 3984, 2097, 0, 0},
+ {13852, 9873, 7501, 5333, 3116, 0, 0},
+ {1498, 1218, 960, 709, 415, 0, 0}},
+ {{9663, 7569, 6304, 5084, 3837, 2450, 0, 0},
+ {25818, 17321, 13816, 10087, 7201, 4205, 0, 0},
+ {25208, 9294, 7278, 5565, 3847, 2060, 0, 0},
+ {14224, 10395, 8311, 6573, 4649, 2723, 0, 0},
+ {1570, 1317, 1098, 886, 645, 377, 0, 0}},
+ {{11079, 8885, 7605, 6416, 5262, 3941, 2573, 0, 0},
+ {25876, 17383, 14928, 11162, 8481, 6015, 3564, 0, 0},
+ {27117, 9586, 7726, 6250, 4786, 3376, 1868, 0, 0},
+ {13419, 10190, 8350, 6774, 5244, 3737, 2320, 0, 0},
+ {1740, 1498, 1264, 1063, 841, 615, 376, 0, 0}}},
+ {{{3679, 0, 0},
+ {16384, 0, 0},
+ {24055, 0, 0},
+ {3511, 0, 0},
+ {1158, 0, 0}},
+ {{7511, 3623, 0, 0},
+ {20481, 5475, 0, 0},
+ {25735, 4808, 0, 0},
+ {12623, 7363, 0, 0},
+ {2160, 1129, 0, 0}},
+ {{8558, 5593, 2865, 0, 0},
+ {22880, 10382, 5554, 0, 0},
+ {26867, 6715, 3475, 0, 0},
+ {14450, 10616, 4435, 0, 0},
+ {2309, 1632, 842, 0, 0}},
+ {{9788, 7289, 4987, 2782, 0, 0},
+ {24355, 11360, 7909, 3894, 0, 0},
+ {30511, 3319, 2174, 1170, 0, 0},
+ {13579, 11566, 6853, 4148, 0, 0},
+ {924, 724, 487, 250, 0, 0}},
+ {{10551, 8201, 6131, 4085, 2220, 0, 0},
+ {25461, 16362, 13132, 8136, 4344, 0, 0},
+ {28327, 7704, 5889, 3826, 1849, 0, 0},
+ {15558, 12240, 9449, 6018, 3186, 0, 0},
+ {2094, 1815, 1372, 1033, 561, 0, 0}},
+ {{11529, 9600, 7724, 5806, 4063, 2262, 0, 0},
+ {26223, 17756, 14764, 10951, 7265, 4067, 0, 0},
+ {29320, 6473, 5331, 4064, 2642, 1326, 0, 0},
+ {16879, 14445, 11064, 8070, 5792, 3078, 0, 0},
+ {1780, 1564, 1289, 1034, 785, 443, 0, 0}},
+ {{11326, 9480, 8010, 6522, 5119, 3788, 2205, 0, 0},
+ {26905, 17835, 15216, 12100, 9085, 6357, 3495, 0, 0},
+ {29353, 6958, 5891, 4778, 3545, 2374, 1150, 0, 0},
+ {14803, 12684, 10536, 8794, 6494, 4366, 2378, 0, 0},
+ {1578, 1439, 1252, 1089, 943, 742, 446, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsInterCdf[kIsInterContexts][kBooleanFieldCdfSize] = {
+ {31962, 0, 0}, {16106, 0, 0}, {12582, 0, 0}, {6230, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseCompoundReferenceCdf[kUseCompoundReferenceContexts]
+ [kBooleanFieldCdfSize] = {{5940, 0, 0},
+ {8733, 0, 0},
+ {20737, 0, 0},
+ {22128, 0, 0},
+ {29867, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCompoundReferenceTypeCdf[kCompoundReferenceTypeContexts]
+ [kBooleanFieldCdfSize] = {{31570, 0, 0},
+ {30698, 0, 0},
+ {23602, 0, 0},
+ {25269, 0, 0},
+ {10293, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundReferenceCdf
+ [kNumCompoundReferenceTypes][kReferenceContexts][3][kBooleanFieldCdfSize] =
+ {{{{27484, 0, 0}, {28903, 0, 0}, {29640, 0, 0}},
+ {{9616, 0, 0}, {18595, 0, 0}, {17498, 0, 0}},
+ {{994, 0, 0}, {7648, 0, 0}, {6058, 0, 0}}},
+ {{{27822, 0, 0}, {23300, 0, 0}, {31265, 0, 0}},
+ {{12877, 0, 0}, {10327, 0, 0}, {17608, 0, 0}},
+ {{2037, 0, 0}, {1709, 0, 0}, {5224, 0, 0}}}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCompoundBackwardReferenceCdf[kReferenceContexts][2]
+ [kBooleanFieldCdfSize] = {
+ {{30533, 0, 0}, {31345, 0, 0}},
+ {{15586, 0, 0}, {17593, 0, 0}},
+ {{2162, 0, 0}, {2279, 0, 0}}};
+
+/* clang-format off */
+alignas(kMaxAlignment) constexpr uint16_t kDefaultSingleReferenceCdf[kReferenceContexts][6]
+ [kBooleanFieldCdfSize] = {
+ {{27871, 0, 0}, {31213, 0, 0}, {28532, 0, 0}, {24118, 0, 0}, {31864, 0, 0},
+ {31324, 0, 0}},
+ {{15795, 0, 0}, {16017, 0, 0}, {13121, 0, 0}, {7995, 0, 0}, {21754, 0, 0},
+ {17681, 0, 0}},
+ {{3024, 0, 0}, {2489, 0, 0}, {1574, 0, 0}, {873, 0, 0}, {5893, 0, 0},
+ {2464, 0, 0}}};
+/* clang-format on */
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundPredictionModeCdf
+ [kCompoundPredictionModeContexts][kNumCompoundInterPredictionModes + 1] = {
+ {25008, 18945, 16960, 15127, 13612, 12102, 5877, 0, 0},
+ {22038, 13316, 11623, 10019, 8729, 7637, 4044, 0, 0},
+ {22104, 12547, 11180, 9862, 8473, 7381, 4332, 0, 0},
+ {19470, 15784, 12297, 8586, 7701, 7032, 6346, 0, 0},
+ {13864, 9443, 7526, 5336, 4870, 4510, 2010, 0, 0},
+ {22043, 15314, 12644, 9948, 8573, 7600, 6722, 0, 0},
+ {15643, 8495, 6954, 5276, 4554, 4064, 2176, 0, 0},
+ {19722, 9554, 8263, 6826, 5333, 4326, 3438, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultNewMvCdf[kNewMvContexts][kBooleanFieldCdfSize] = {
+ {8733, 0, 0}, {16138, 0, 0}, {17429, 0, 0},
+ {24382, 0, 0}, {20546, 0, 0}, {28092, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultZeroMvCdf[kZeroMvContexts][kBooleanFieldCdfSize] = {{30593, 0, 0},
+ {31714, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultReferenceMvCdf[kReferenceMvContexts][kBooleanFieldCdfSize] = {
+ {8794, 0, 0}, {8580, 0, 0}, {14920, 0, 0},
+ {4146, 0, 0}, {8456, 0, 0}, {12845, 0, 0}};
+
+// This is called drl_mode in the spec where DRL stands for Dynamic Reference
+// List.
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultRefMvIndexCdf[kRefMvIndexContexts][kBooleanFieldCdfSize] = {
+ {19664, 0, 0}, {8208, 0, 0}, {13823, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsInterIntraCdf[kInterIntraContexts][kBooleanFieldCdfSize] = {
+ {5881, 0, 0}, {5171, 0, 0}, {2531, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultInterIntraModeCdf[kInterIntraContexts][kNumInterIntraModes + 1] = {
+ {30893, 21686, 5436, 0, 0},
+ {30295, 22772, 6380, 0, 0},
+ {28530, 21231, 6842, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsWedgeInterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {12732, 0, 0}, {7811, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {6064, 0, 0}, {5238, 0, 0}, {3204, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {3324, 0, 0}, {5896, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultWedgeIndexCdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1] = {
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30330, 28328, 26169, 24105, 21763, 19894, 17017, 14674, 12409, 10406,
+ 8641, 7066, 5016, 3318, 1597, 0, 0},
+ {31962, 29502, 26763, 26030, 25550, 25401, 24997, 18180, 16445, 15401,
+ 14316, 13346, 9929, 6641, 3139, 0, 0},
+ {32614, 31781, 30843, 30717, 30680, 30657, 30617, 9735, 9065, 8484,
+ 7783, 7084, 5509, 3885, 1857, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {29989, 29030, 28085, 25555, 24993, 24751, 24113, 18411, 14829, 11436,
+ 8248, 5298, 3312, 2239, 1112, 0, 0},
+ {31084, 29143, 27093, 25660, 23466, 21494, 18339, 15624, 13605, 11807,
+ 9884, 8297, 6049, 4054, 1891, 0, 0},
+ {31626, 29277, 26491, 25454, 24679, 24413, 23745, 19144, 17399, 16038,
+ 14654, 13455, 10247, 6756, 3218, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {31633, 31446, 31275, 30133, 30072, 30031, 29998, 11752, 9833, 7711,
+ 5517, 3595, 2679, 1808, 835, 0, 0},
+ {30026, 28573, 27041, 24733, 23788, 23432, 22622, 18644, 15498, 12235,
+ 9334, 6796, 4824, 3198, 1352, 0, 0},
+ {31041, 28820, 26667, 24972, 22927, 20424, 17002, 13824, 12130, 10730,
+ 8805, 7457, 5780, 4002, 1756, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0},
+ {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
+ 10240, 8192, 6144, 4096, 2048, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseObmcCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {22331, 0, 0}, {23397, 0, 0}, {9104, 0, 0}, {16384, 0, 0},
+ {23467, 0, 0}, {15336, 0, 0}, {18345, 0, 0}, {8760, 0, 0},
+ {11867, 0, 0}, {17626, 0, 0}, {6951, 0, 0}, {9945, 0, 0},
+ {5889, 0, 0}, {10685, 0, 0}, {2640, 0, 0}, {1754, 0, 0},
+ {1208, 0, 0}, {130, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMotionModeCdf[kMaxBlockSizes][kNumMotionModes + 1] = {
+ {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}, {25117, 8008, 0, 0}, {28030, 8003, 0, 0},
+ {3969, 1378, 0, 0}, {21845, 10923, 0, 0}, {27377, 7240, 0, 0},
+ {13349, 5958, 0, 0}, {27645, 9162, 0, 0}, {3795, 1174, 0, 0},
+ {6337, 1994, 0, 0}, {21162, 8460, 0, 0}, {6508, 3652, 0, 0},
+ {12408, 4706, 0, 0}, {3026, 1565, 0, 0}, {11089, 5938, 0, 0},
+ {3252, 2067, 0, 0}, {3870, 2371, 0, 0}, {1890, 1433, 0, 0},
+ {261, 210, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsExplicitCompoundTypeCdf[kIsExplicitCompoundTypeContexts]
+ [kBooleanFieldCdfSize] = {
+ {6161, 0, 0}, {9877, 0, 0},
+ {13928, 0, 0}, {8174, 0, 0},
+ {12834, 0, 0}, {10094, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsCompoundTypeAverageCdf[kIsCompoundTypeAverageContexts]
+ [kBooleanFieldCdfSize] = {
+ {14524, 0, 0}, {19903, 0, 0},
+ {25715, 0, 0}, {19509, 0, 0},
+ {23434, 0, 0}, {28124, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCompoundTypeCdf[kMaxBlockSizes]
+ [kNumExplicitCompoundPredictionTypes + 1] = {
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {9337, 0, 0}, {19597, 0, 0},
+ {20948, 0, 0}, {16384, 0, 0}, {21298, 0, 0},
+ {22998, 0, 0}, {23668, 0, 0}, {16384, 0, 0},
+ {25067, 0, 0}, {24535, 0, 0}, {26596, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t kDefaultInterpolationFilterCdf
+ [kInterpolationFilterContexts][kNumExplicitInterpolationFilters + 1] = {
+ {833, 48, 0, 0}, {27200, 49, 0, 0}, {32346, 29830, 0, 0},
+ {4524, 160, 0, 0}, {1562, 815, 0, 0}, {27906, 647, 0, 0},
+ {31998, 31616, 0, 0}, {11879, 7131, 0, 0}, {858, 44, 0, 0},
+ {28648, 56, 0, 0}, {32463, 30521, 0, 0}, {5365, 132, 0, 0},
+ {1746, 759, 0, 0}, {29805, 675, 0, 0}, {32167, 31825, 0, 0},
+ {17799, 11370, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvJointCdf[kNumMvJointTypes + 1] = {28672, 21504, 13440, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvSignCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClassCdf[kMvClassSymbolCount + 1] = {
+ 4096, 1792, 910, 448, 217, 112, 28, 11, 6, 1, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClass0BitCdf[kBooleanFieldCdfSize] = {5120, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClass0FractionCdf[kBooleanSymbolCount][kMvFractionSymbolCount +
+ 1] = {
+ {16384, 8192, 6144, 0, 0}, {20480, 11520, 8640, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClass0HighPrecisionCdf[kBooleanFieldCdfSize] = {12288, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvBitCdf[kMvBitSymbolCount][kBooleanFieldCdfSize] = {
+ {15360, 0, 0}, {14848, 0, 0}, {13824, 0, 0}, {12288, 0, 0},
+ {10240, 0, 0}, {8192, 0, 0}, {4096, 0, 0}, {2816, 0, 0},
+ {2816, 0, 0}, {2048, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvFractionCdf[kMvFractionSymbolCount + 1] = {24576, 15360, 11520, 0,
+ 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvHighPrecisionCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/symbol_decoder_context.h"
+
+#include <cstdint>
+#include <cstring>
+
+#include "gtest/gtest.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(SymbolDecoderContextTest, ResetIntraFrameYModeCdf) {
+ // Note: these are zero-initialized separately so that the padding added to
+ // the tables for alignment purposes does not cause spurious differences when
+ // the contexts are compared with memcmp().
+ libgav1::SymbolDecoderContext gold_context = {};
+ libgav1::SymbolDecoderContext context = {};
+ gold_context.Initialize(0);
+ context.Initialize(0);
+ EXPECT_EQ(memcmp(&gold_context, &context, sizeof(gold_context)), 0);
+ EXPECT_EQ(context.intra_frame_y_mode_cdf[0][0][0], 32768 - 15588);
+ EXPECT_EQ(context.intra_frame_y_mode_cdf[0][0][1], 32768 - 17027);
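+ // (The expected values are written as 32768 - N because the context tables
+ // store CDFs inverted relative to the spec, i.e. as 32768 minus the spec's
+ // cumulative values.)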
+ ++context.intra_frame_y_mode_cdf[0][0][0];
+ --context.intra_frame_y_mode_cdf[0][0][1];
+ EXPECT_NE(memcmp(&gold_context, &context, sizeof(gold_context)), 0);
+ context.ResetIntraFrameYModeCdf();
+ EXPECT_EQ(memcmp(&gold_context, &context, sizeof(gold_context)), 0);
+}
+
+void ResetAndVerifyCounters(libgav1::SymbolDecoderContext* const context) {
+ libgav1::SymbolDecoderContext gold_context = {};
+ gold_context.Initialize(0);
+ EXPECT_NE(memcmp(&gold_context, context, sizeof(gold_context)), 0);
+ context->ResetCounters();
+ EXPECT_EQ(memcmp(&gold_context, context, sizeof(gold_context)), 0);
+}
+
+TEST(SymbolDecoderContextTest, ResetCounters1d) {
+ libgav1::SymbolDecoderContext context = {};
+ context.Initialize(0);
+ int value = 0;
+ context.delta_q_cdf[libgav1::kDeltaSymbolCount] = ++value;
+ context.delta_lf_cdf[libgav1::kDeltaSymbolCount] = ++value;
+ context.intra_block_copy_cdf[libgav1::kBooleanSymbolCount] = ++value;
+ context.cfl_alpha_signs_cdf[libgav1::kCflAlphaSignsSymbolCount] = ++value;
+ context.filter_intra_mode_cdf[libgav1::kNumFilterIntraPredictors] = ++value;
+ context.restoration_type_cdf[libgav1::kRestorationTypeSymbolCount] = ++value;
+ context.use_wiener_cdf[libgav1::kBooleanSymbolCount] = ++value;
+ context.use_sgrproj_cdf[libgav1::kBooleanSymbolCount] = ++value;
+ ResetAndVerifyCounters(&context);
+}
+
+void IncreasePartitionCounters(SymbolDecoderContext* symbol_context,
+ int value) {
+ const int min_bsize_log2 = k4x4WidthLog2[kBlock8x8];
+ const int max_bsize_log2 = k4x4WidthLog2[kBlock128x128];
+ for (int block_size_log2 = min_bsize_log2; block_size_log2 <= max_bsize_log2;
+ ++block_size_log2) {
+ for (int context = 0; context < kPartitionContexts; ++context) {
+ const int cdf_size =
+ SymbolDecoderContext::PartitionCdfSize(block_size_log2);
+ symbol_context->partition_cdf[block_size_log2 - min_bsize_log2][context]
+ [cdf_size] += value;
+ }
+ }
+}
+
+void IncreasePaletteColorIndexCounters(SymbolDecoderContext* symbol_context,
+ int value) {
+ for (auto& palette_color_index_cdf_plane :
+ symbol_context->palette_color_index_cdf) {
+ for (int symbol_count = 0; symbol_count < kPaletteSizeSymbolCount;
+ ++symbol_count) {
+ const int cdf_size = symbol_count + kMinPaletteSize;
+ for (int context = 0; context < kPaletteColorIndexContexts; ++context) {
+ palette_color_index_cdf_plane[symbol_count][context][cdf_size] += value;
+ }
+ }
+ }
+}
+
+void IncreaseTxTypeCounters(SymbolDecoderContext* context, int value) {
+ for (int set_idx = kTransformSetIntra1; set_idx <= kTransformSetIntra2;
+ ++set_idx) {
+ auto tx_set = static_cast<TransformSet>(set_idx);
+ for (int tx_size = 0; tx_size < kNumExtendedTransformSizes; ++tx_size) {
+ for (int mode = 0; mode < kIntraPredictionModesY; ++mode) {
+ context->intra_tx_type_cdf[SymbolDecoderContext::TxTypeIndex(
+ tx_set)][tx_size][mode][kNumTransformTypesInSet[tx_set]] += value;
+ }
+ }
+ }
+
+ for (int set_idx = kTransformSetInter1; set_idx <= kTransformSetInter3;
+ ++set_idx) {
+ auto tx_set = static_cast<TransformSet>(set_idx);
+ for (int tx_size = 0; tx_size < kNumExtendedTransformSizes; ++tx_size) {
+ context->inter_tx_type_cdf[SymbolDecoderContext::TxTypeIndex(tx_set)]
+ [tx_size][kNumTransformTypesInSet[tx_set]] +=
+ value;
+ }
+ }
+}
+
+void IncreaseTxDepthCounters(SymbolDecoderContext* symbol_context, int value) {
+ for (int context = 0; context < kTxDepthContexts; ++context) {
+ symbol_context->tx_depth_cdf[0][context][kMaxTxDepthSymbolCount - 1] +=
+ value;
+ }
+
+ for (int plane_category = 1; plane_category < 4; ++plane_category) {
+ for (int context = 0; context < kTxDepthContexts; ++context) {
+ symbol_context
+ ->tx_depth_cdf[plane_category][context][kMaxTxDepthSymbolCount] +=
+ value;
+ }
+ }
+}
+
+void IncreaseUVModeCounters(SymbolDecoderContext* symbol_context, int value) {
+ for (int cfl_allowed = 0; cfl_allowed < kBooleanSymbolCount; ++cfl_allowed) {
+ for (int mode = 0; mode < kIntraPredictionModesY; ++mode) {
+ symbol_context->uv_mode_cdf[cfl_allowed][mode][kIntraPredictionModesUV -
+ (1 - cfl_allowed)] +=
+ value;
+ }
+ }
+}
+
+#define ASSIGN_COUNTER_2D(array, offset) \
+ do { \
+ for (auto& d1 : context.array) { \
+ d1[libgav1::offset] = ++value; \
+ } \
+ } while (false)
+
+TEST(SymbolDecoderContextTest, ResetCounters2d) {
+ libgav1::SymbolDecoderContext context = {};
+ context.Initialize(0);
+ int value = 0;
+ ASSIGN_COUNTER_2D(segment_id_cdf, kMaxSegments);
+ ASSIGN_COUNTER_2D(use_predicted_segment_id_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(skip_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(skip_mode_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(delta_lf_multi_cdf, kDeltaSymbolCount);
+ ASSIGN_COUNTER_2D(y_mode_cdf, kIntraPredictionModesY);
+ ASSIGN_COUNTER_2D(angle_delta_cdf, kAngleDeltaSymbolCount);
+ ASSIGN_COUNTER_2D(cfl_alpha_cdf, kCflAlphaSymbolCount);
+ ASSIGN_COUNTER_2D(use_filter_intra_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(tx_split_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(eob_pt_512_cdf, kEobPt512SymbolCount);
+ ASSIGN_COUNTER_2D(eob_pt_1024_cdf, kEobPt1024SymbolCount);
+ ASSIGN_COUNTER_2D(palette_y_size_cdf, kPaletteSizeSymbolCount);
+ ASSIGN_COUNTER_2D(has_palette_uv_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(palette_uv_size_cdf, kPaletteSizeSymbolCount);
+ ASSIGN_COUNTER_2D(is_inter_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(use_compound_reference_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(compound_reference_type_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(compound_prediction_mode_cdf,
+ kNumCompoundInterPredictionModes);
+ ASSIGN_COUNTER_2D(new_mv_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(zero_mv_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(reference_mv_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(ref_mv_index_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(is_inter_intra_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(inter_intra_mode_cdf, kNumInterIntraModes);
+ ASSIGN_COUNTER_2D(is_wedge_inter_intra_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(wedge_index_cdf, kWedgeIndexSymbolCount);
+ ASSIGN_COUNTER_2D(use_obmc_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(motion_mode_cdf, kNumMotionModes);
+ ASSIGN_COUNTER_2D(is_explicit_compound_type_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(is_compound_type_average_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_2D(compound_type_cdf, kNumExplicitCompoundPredictionTypes);
+ ASSIGN_COUNTER_2D(interpolation_filter_cdf, kNumExplicitInterpolationFilters);
+ ASSIGN_COUNTER_2D(mv_joint_cdf, kNumMvJointTypes);
+ ResetAndVerifyCounters(&context);
+}
+
+#undef ASSIGN_COUNTER_2D
+
+#define ASSIGN_COUNTER_3D(array, offset) \
+ do { \
+ for (auto& d1 : context.array) { \
+ for (auto& d2 : d1) { \
+ d2[libgav1::offset] = ++value; \
+ } \
+ } \
+ } while (false)
+
+TEST(SymbolDecoderContextTest, ResetCounters3d) {
+ libgav1::SymbolDecoderContext context = {};
+ context.Initialize(0);
+ int value = 0;
+ ASSIGN_COUNTER_3D(intra_frame_y_mode_cdf, kIntraPredictionModesY);
+ ASSIGN_COUNTER_3D(all_zero_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_3D(eob_pt_16_cdf, kEobPt16SymbolCount);
+ ASSIGN_COUNTER_3D(eob_pt_32_cdf, kEobPt32SymbolCount);
+ ASSIGN_COUNTER_3D(eob_pt_64_cdf, kEobPt64SymbolCount);
+ ASSIGN_COUNTER_3D(eob_pt_128_cdf, kEobPt128SymbolCount);
+ ASSIGN_COUNTER_3D(eob_pt_256_cdf, kEobPt256SymbolCount);
+ ASSIGN_COUNTER_3D(dc_sign_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_3D(has_palette_y_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_3D(compound_backward_reference_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_3D(single_reference_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_3D(mv_sign_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_3D(mv_class_cdf, kMvClassSymbolCount);
+ ASSIGN_COUNTER_3D(mv_class0_bit_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_3D(mv_class0_high_precision_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_3D(mv_fraction_cdf, kMvFractionSymbolCount);
+ ASSIGN_COUNTER_3D(mv_high_precision_cdf, kBooleanSymbolCount);
+ IncreasePartitionCounters(&context, value);
+ IncreaseTxTypeCounters(&context, value);
+ IncreaseTxDepthCounters(&context, value);
+ IncreaseUVModeCounters(&context, value);
+ ResetAndVerifyCounters(&context);
+}
+
+#undef ASSIGN_COUNTER_3D
+
+#define ASSIGN_COUNTER_4D(array, offset) \
+ do { \
+ for (auto& d1 : context.array) { \
+ for (auto& d2 : d1) { \
+ for (auto& d3 : d2) { \
+ d3[libgav1::offset] = ++value; \
+ } \
+ } \
+ } \
+ } while (false)
+
+TEST(SymbolDecoderContextTest, ResetCounters4d) {
+ libgav1::SymbolDecoderContext context = {};
+ context.Initialize(0);
+ int value = 0;
+ ASSIGN_COUNTER_4D(eob_extra_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_4D(coeff_base_eob_cdf, kCoeffBaseEobSymbolCount);
+ ASSIGN_COUNTER_4D(coeff_base_cdf, kCoeffBaseSymbolCount);
+ ASSIGN_COUNTER_4D(coeff_base_range_cdf, kCoeffBaseRangeSymbolCount);
+ ASSIGN_COUNTER_4D(compound_reference_cdf, kBooleanSymbolCount);
+ ASSIGN_COUNTER_4D(mv_class0_fraction_cdf, kMvFractionSymbolCount);
+ ASSIGN_COUNTER_4D(mv_bit_cdf, kBooleanSymbolCount);
+ IncreasePaletteColorIndexCounters(&context, value);
+ IncreaseTxTypeCounters(&context, value);
+ ResetAndVerifyCounters(&context);
+}
+
+#undef ASSIGN_COUNTER_4D
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/threading_strategy.h"
+
+#include <algorithm>
+#include <cassert>
+#include <memory>
+
+#include "src/frame_scratch_buffer.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+namespace {
+
+#if !defined(LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER)
+constexpr int kFrameParallelThresholdMultiplier = 3;
+#else
+constexpr int kFrameParallelThresholdMultiplier =
+ LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER;
+#endif
+
+// Computes the number of frame threads to be used based on the following
+// heuristic:
+// * If |thread_count| == 1, return 0.
+// * If |thread_count| <= |tile_count| * kFrameParallelThresholdMultiplier,
+// return 0.
+// * Otherwise, return the largest value of i which satisfies the following
+// condition: i + i * tile_columns <= thread_count. This ensures that there
+// are at least |tile_columns| worker threads for each frame thread.
+// * This function will never return 1 or a value > |thread_count|.
+//
+// This heuristic is based on empirical performance data. The in-frame
+// threading model (combination of tile multithreading, superblock row
+// multithreading and post filter multithreading) performs better than the
+// frame parallel model until we reach the threshold of |thread_count| >
+// |tile_count| * kFrameParallelThresholdMultiplier.
+//
+// It is a function of |tile_count| since tile threading and superblock row
+// multithreading will scale only as a factor of |tile_count|. The threshold
+// kFrameParallelThresholdMultiplier is arrived at based on empirical data.
+// The general idea is that superblock row multithreading plateaus at 4 *
+// |tile_count| because in most practical cases there aren't more than that
+// many superblock rows and columns available to work on in parallel.
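+//
+// For example (an illustrative check of this heuristic, with assumed values):
+// for |thread_count| = 16, |tile_count| = 2 and |tile_columns| = 2, we have
+// 16 > 2 * kFrameParallelThresholdMultiplier (= 6), so the function returns
+// std::max(2, 16 / (1 + 2)) = 5 frame threads; 5 is indeed the largest i
+// satisfying i + i * 2 <= 16.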
+int ComputeFrameThreadCount(int thread_count, int tile_count,
+ int tile_columns) {
+ assert(thread_count > 0);
+ if (thread_count == 1) return 0;
+ return (thread_count <= tile_count * kFrameParallelThresholdMultiplier)
+ ? 0
+ : std::max(2, thread_count / (1 + tile_columns));
+}
+
+} // namespace
+
+bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header,
+ int thread_count) {
+ assert(thread_count > 0);
+ frame_parallel_ = false;
+
+ if (thread_count == 1) {
+ thread_pool_.reset(nullptr);
+ tile_thread_count_ = 0;
+ max_tile_index_for_row_threads_ = 0;
+ return true;
+ }
+
+ // We do work in the current thread, so it is sufficient to create
+ // |thread_count|-1 threads in the threadpool.
+ thread_count = std::min(thread_count, static_cast<int>(kMaxThreads)) - 1;
+
+ if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
+ thread_pool_ = ThreadPool::Create("libgav1", thread_count);
+ if (thread_pool_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
+ thread_count);
+ tile_thread_count_ = 0;
+ max_tile_index_for_row_threads_ = 0;
+ return false;
+ }
+ }
+
+ // Prefer tile threads first (but only if there is more than one tile).
+ const int tile_count = frame_header.tile_info.tile_count;
+ if (tile_count > 1) {
+ // We want 1 + tile_thread_count_ <= tile_count because the current thread
+ // is also used to decode tiles. This is equivalent to
+ // tile_thread_count_ <= tile_count - 1.
+ tile_thread_count_ = std::min(thread_count, tile_count - 1);
+ thread_count -= tile_thread_count_;
+ if (thread_count == 0) {
+ max_tile_index_for_row_threads_ = 0;
+ return true;
+ }
+ } else {
+ tile_thread_count_ = 0;
+ }
+
+#if defined(__ANDROID__)
+ // Assign the remaining threads for each Tile. The heuristic used here is that
+ // we will assign two threads for each Tile. So for example, if |thread_count|
+ // is 2, for a stream with 2 tiles the first tile would get both the threads
+ // and the second tile would have row multi-threading turned off. This
+ // heuristic is based on the fact that row multi-threading is fast enough only
+ // when there are at least two threads to do the decoding (since one thread
+ // always does the parsing).
+ //
+ // This heuristic might stop working when SIMD optimizations make the decoding
+ // much faster and the parsing thread is only as fast as the decoding threads.
+ // So we will have to revisit this later to make sure that this is still
+ // optimal.
+ //
+ // Note that while this heuristic significantly improves performance on high
+ // end devices (like the Pixel 3), there are some performance regressions on
+ // some lower end devices in some cases, and those need to be revisited as we
+ // bring in more optimizations. Overall, the gains from this heuristic seem
+ // to be much larger than the regressions.
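+ //
+ // As an illustration (hypothetical remaining-thread counts): if two threads
+ // remain for two tiles, the loop below enables row multi-threading for tile
+ // 0 only; with four remaining threads, both tiles get it.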
+ for (int i = 0; i < tile_count; ++i) {
+ max_tile_index_for_row_threads_ = i + 1;
+ thread_count -= 2;
+ if (thread_count <= 0) break;
+ }
+#else // !defined(__ANDROID__)
+ // Assign the remaining threads to each Tile.
+ for (int i = 0; i < tile_count; ++i) {
+ const int count = thread_count / tile_count +
+ static_cast<int>(i < thread_count % tile_count);
+ if (count == 0) {
+ // Once we see a 0 value, all subsequent values will also be 0 since the
+ // threads are assigned in a round-robin fashion.
+ break;
+ }
+ max_tile_index_for_row_threads_ = i + 1;
+ }
+#endif // defined(__ANDROID__)
+ return true;
+}
+
+bool ThreadingStrategy::Reset(int thread_count) {
+ assert(thread_count > 0);
+ frame_parallel_ = true;
+
+ // In frame parallel mode, we simply access the underlying |thread_pool_|
+ // directly. So ensure all the other threadpool getter functions return
+ // nullptr. Also, superblock row multithreading is always disabled in frame
+ // parallel mode.
+ tile_thread_count_ = 0;
+ max_tile_index_for_row_threads_ = 0;
+
+ if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
+ thread_pool_ = ThreadPool::Create("libgav1-fp", thread_count);
+ if (thread_pool_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
+ thread_count);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool InitializeThreadPoolsForFrameParallel(
+ int thread_count, int tile_count, int tile_columns,
+ std::unique_ptr<ThreadPool>* const frame_thread_pool,
+ FrameScratchBufferPool* const frame_scratch_buffer_pool) {
+ assert(*frame_thread_pool == nullptr);
+ thread_count = std::min(thread_count, static_cast<int>(kMaxThreads));
+ const int frame_threads =
+ ComputeFrameThreadCount(thread_count, tile_count, tile_columns);
+ if (frame_threads == 0) return true;
+ *frame_thread_pool = ThreadPool::Create(frame_threads);
+ if (*frame_thread_pool == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to create frame thread pool with %d threads.",
+ frame_threads);
+ return false;
+ }
+ int remaining_threads = thread_count - frame_threads;
+ if (remaining_threads == 0) return true;
+ int threads_per_frame = remaining_threads / frame_threads;
+ const int extra_threads = remaining_threads % frame_threads;
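+ // Illustrative arithmetic (assumed counts, matching the example in the
+ // ComputeFrameThreadCount() comment): 16 total threads with 5 frame threads
+ // leave 11 remaining, so threads_per_frame = 2 and extra_threads = 1; the
+ // first frame thread gets 3 worker threads and the other four get 2 each
+ // (3 + 4 * 2 = 11).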
+ Vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
+ if (!frame_scratch_buffers.reserve(frame_threads)) return false;
+ // Create the tile thread pools.
+ for (int i = 0; i < frame_threads && remaining_threads > 0; ++i) {
+ std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+ frame_scratch_buffer_pool->Get();
+ if (frame_scratch_buffer == nullptr) {
+ return false;
+ }
+ // If the number of tile threads cannot be divided equally amongst all the
+ // frame threads, assign one extra thread to the first |extra_threads| frame
+ // threads.
+ const int current_frame_thread_count =
+ threads_per_frame + static_cast<int>(i < extra_threads);
+ if (!frame_scratch_buffer->threading_strategy.Reset(
+ current_frame_thread_count)) {
+ return false;
+ }
+ remaining_threads -= current_frame_thread_count;
+ frame_scratch_buffers.push_back_unchecked(std::move(frame_scratch_buffer));
+ }
+ // We release the frame scratch buffers in reverse order so that the extra
+ // threads are allocated to the buffers at the top of the stack.
+ for (int i = static_cast<int>(frame_scratch_buffers.size()) - 1; i >= 0;
+ --i) {
+ frame_scratch_buffer_pool->Release(std::move(frame_scratch_buffers[i]));
+ }
+ return true;
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_THREADING_STRATEGY_H_
+#define LIBGAV1_SRC_THREADING_STRATEGY_H_
+
+#include <memory>
+
+#include "src/obu_parser.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/threadpool.h"
+
+namespace libgav1 {
+
+class FrameScratchBufferPool;
+
+// This class allocates and manages the worker threads among thread pools used
+// for multi-threaded decoding.
+class ThreadingStrategy {
+ public:
+ ThreadingStrategy() = default;
+
+ // Not copyable or movable.
+ ThreadingStrategy(const ThreadingStrategy&) = delete;
+ ThreadingStrategy& operator=(const ThreadingStrategy&) = delete;
+
+ // Creates or re-allocates the thread pools based on the |frame_header| and
+ // |thread_count|. This function is used only in non frame-parallel mode. This
+ // function is idempotent if the |frame_header| and |thread_count| don't
+ // change between calls (it will only create new threads on the first call and
+ // do nothing on the subsequent calls). This function also starts the worker
+ // threads whenever it creates new thread pools.
+ // The following strategy is used to allocate threads:
+ // * One thread is allocated for decoding each Tile.
+ // * Any remaining threads are allocated for superblock row multi-threading
+ //    within each of the tiles in a round-robin fashion.
+ // Note: During the lifetime of a ThreadingStrategy object, only one of the
+ // Reset() variants will be used.
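+ // For example (illustrative, consistent with ThreadingStrategyTest): with
+ // |thread_count| = 8 and 2 tiles, the pool gets 7 worker threads; 1 is used
+ // for tile multi-threading and the remaining 6 are split 3 per tile for
+ // superblock row multi-threading.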
+ LIBGAV1_MUST_USE_RESULT bool Reset(const ObuFrameHeader& frame_header,
+ int thread_count);
+
+ // Creates or re-allocates a thread pool with |thread_count| threads. This
+ // function is used only in frame parallel mode. This function is idempotent
+ // if the |thread_count| doesn't change between calls (it will only create new
+ // threads on the first call and do nothing on the subsequent calls).
+ // Note: During the lifetime of a ThreadingStrategy object, only one of the
+ // Reset() variants will be used.
+ LIBGAV1_MUST_USE_RESULT bool Reset(int thread_count);
+
+ // Returns a pointer to the ThreadPool that is to be used for Tile
+ // multi-threading.
+ ThreadPool* tile_thread_pool() const {
+ return (tile_thread_count_ != 0) ? thread_pool_.get() : nullptr;
+ }
+
+ int tile_thread_count() const { return tile_thread_count_; }
+
+ // Returns a pointer to the underlying ThreadPool.
+ // Note: Valid only when |frame_parallel_| is true. This is used for
+ // facilitating in-frame multi-threading in that case.
+ ThreadPool* thread_pool() const { return thread_pool_.get(); }
+
+ // Returns a pointer to the ThreadPool that is to be used within the Tile at
+ // index |tile_index| for superblock row multi-threading.
+ // Note: Valid only when |frame_parallel_| is false.
+ ThreadPool* row_thread_pool(int tile_index) const {
+ return tile_index < max_tile_index_for_row_threads_ ? thread_pool_.get()
+ : nullptr;
+ }
+
+ // Returns a pointer to the ThreadPool that is to be used for post filter
+ // multi-threading.
+ // Note: Valid only when |frame_parallel_| is false.
+ ThreadPool* post_filter_thread_pool() const {
+ return frame_parallel_ ? nullptr : thread_pool_.get();
+ }
+
+ // Returns a pointer to the ThreadPool that is to be used for film grain
+ // synthesis and blending.
+ // Note: Valid only when |frame_parallel_| is false.
+ ThreadPool* film_grain_thread_pool() const { return thread_pool_.get(); }
+
+ private:
+ std::unique_ptr<ThreadPool> thread_pool_;
+ int tile_thread_count_ = 0;
+ int max_tile_index_for_row_threads_ = 0;
+ bool frame_parallel_ = false;
+};
+
+// Initializes the |frame_thread_pool| and the necessary worker threadpools
+// (the threading_strategy objects in each of the frame scratch buffers in
+// |frame_scratch_buffer_pool|) as follows:
+// * frame_threads = ComputeFrameThreadCount();
+// * For more details on how frame_threads is computed, see the function
+// comment in ComputeFrameThreadCount().
+// * |frame_thread_pool| is created with |frame_threads| threads.
+// * divide the remaining number of threads into each frame thread and
+// initialize a frame_scratch_buffer.threading_strategy for each frame
+// thread.
+// When this function is called, |frame_scratch_buffer_pool| must be empty. If
+// this function returns true, it means the initialization was successful and
+// one of the following is true:
+// * |frame_thread_pool| has been successfully initialized and
+// |frame_scratch_buffer_pool| has been successfully populated with
+// |frame_threads| buffers to be used by each frame thread. The total
+// number of threads that this function creates will always be equal to
+// |thread_count|.
+// * |frame_thread_pool| is nullptr. |frame_scratch_buffer_pool| is not
+// modified. This means that frame threading will not be used and the
+// decoder will continue to operate normally in non frame parallel mode.
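+//
+// For example (illustrative values): |thread_count| = 16 with |tile_count| =
+// 2 and |tile_columns| = 2 yields 5 frame threads, and the remaining 11
+// threads are divided among the worker threadpools of those 5 frame threads.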
+LIBGAV1_MUST_USE_RESULT bool InitializeThreadPoolsForFrameParallel(
+ int thread_count, int tile_count, int tile_columns,
+ std::unique_ptr<ThreadPool>* frame_thread_pool,
+ FrameScratchBufferPool* frame_scratch_buffer_pool);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_THREADING_STRATEGY_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/threading_strategy.h"
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "gtest/gtest.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/obu_parser.h"
+#include "src/utils/constants.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+class ThreadingStrategyTest : public testing::Test {
+ protected:
+ ThreadingStrategy strategy_;
+ ObuFrameHeader frame_header_ = {};
+};
+
+TEST_F(ThreadingStrategyTest, MaxThreadEnforced) {
+ frame_header_.tile_info.tile_count = 32;
+ ASSERT_TRUE(strategy_.Reset(frame_header_, 32));
+ EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+ for (int i = 0; i < 32; ++i) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+ }
+ EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+}
+
+TEST_F(ThreadingStrategyTest, UseAllThreadsForTiles) {
+ frame_header_.tile_info.tile_count = 8;
+ ASSERT_TRUE(strategy_.Reset(frame_header_, 8));
+ EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+ }
+ EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+}
+
+TEST_F(ThreadingStrategyTest, RowThreads) {
+ frame_header_.tile_info.tile_count = 2;
+ ASSERT_TRUE(strategy_.Reset(frame_header_, 8));
+ EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+ // Each of the two tiles should get 3 row threads.
+ for (int i = 0; i < 2; ++i) {
+ EXPECT_NE(strategy_.row_thread_pool(i), nullptr);
+ }
+ EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+}
+
+TEST_F(ThreadingStrategyTest, RowThreadsUnequal) {
+ frame_header_.tile_info.tile_count = 2;
+
+ ASSERT_TRUE(strategy_.Reset(frame_header_, 9));
+ EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+ EXPECT_NE(strategy_.row_thread_pool(0), nullptr);
+ EXPECT_NE(strategy_.row_thread_pool(1), nullptr);
+ EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+}
+
+// Tests a sequence of tile_count and thread_count combinations across
+// successive Reset() calls.
+TEST_F(ThreadingStrategyTest, MultipleCalls) {
+ frame_header_.tile_info.tile_count = 2;
+ ASSERT_TRUE(strategy_.Reset(frame_header_, 8));
+ EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+ for (int i = 0; i < 2; ++i) {
+ EXPECT_NE(strategy_.row_thread_pool(i), nullptr);
+ }
+ EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+
+ frame_header_.tile_info.tile_count = 8;
+ ASSERT_TRUE(strategy_.Reset(frame_header_, 8));
+ EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+ // Row threads must have been reset.
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+ }
+ EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+
+ frame_header_.tile_info.tile_count = 8;
+ ASSERT_TRUE(strategy_.Reset(frame_header_, 16));
+ EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+ for (int i = 0; i < 8; ++i) {
+ // See ThreadingStrategy::Reset().
+#if defined(__ANDROID__)
+ if (i >= 4) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+ continue;
+ }
+#endif
+ EXPECT_NE(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+ }
+ EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+
+ frame_header_.tile_info.tile_count = 4;
+ ASSERT_TRUE(strategy_.Reset(frame_header_, 16));
+ EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+ for (int i = 0; i < 4; ++i) {
+ EXPECT_NE(strategy_.row_thread_pool(i), nullptr);
+ }
+ // All the other row threads must be reset.
+ for (int i = 4; i < 8; ++i) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+ }
+ EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+
+ frame_header_.tile_info.tile_count = 4;
+ ASSERT_TRUE(strategy_.Reset(frame_header_, 6));
+ EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+ // First two tiles will get 1 thread each.
+ for (int i = 0; i < 2; ++i) {
+ // See ThreadingStrategy::Reset().
+#if defined(__ANDROID__)
+ if (i == 1) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+ continue;
+ }
+#endif
+ EXPECT_NE(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+ }
+ // All the other row threads must be reset.
+ for (int i = 2; i < 8; ++i) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+ }
+ EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+
+ ASSERT_TRUE(strategy_.Reset(frame_header_, 1));
+ EXPECT_EQ(strategy_.tile_thread_pool(), nullptr);
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+ }
+ EXPECT_EQ(strategy_.post_filter_thread_pool(), nullptr);
+}
+
+// Tests the following order of calls (with thread count fixed at 4):
+// * 1 Tile - 2 Tiles - 1 Tile.
+TEST_F(ThreadingStrategyTest, MultipleCalls2) {
+ frame_header_.tile_info.tile_count = 1;
+ ASSERT_TRUE(strategy_.Reset(frame_header_, 4));
+ // When there is only one tile, tile thread pool must be nullptr.
+ EXPECT_EQ(strategy_.tile_thread_pool(), nullptr);
+ EXPECT_NE(strategy_.row_thread_pool(0), nullptr);
+ for (int i = 1; i < 8; ++i) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+ }
+ EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+
+ frame_header_.tile_info.tile_count = 2;
+ ASSERT_TRUE(strategy_.Reset(frame_header_, 4));
+ EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
+ for (int i = 0; i < 2; ++i) {
+ // See ThreadingStrategy::Reset().
+#if defined(__ANDROID__)
+ if (i == 1) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+ continue;
+ }
+#endif
+ EXPECT_NE(strategy_.row_thread_pool(i), nullptr);
+ }
+ for (int i = 2; i < 8; ++i) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+ }
+ EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+
+ frame_header_.tile_info.tile_count = 1;
+ ASSERT_TRUE(strategy_.Reset(frame_header_, 4));
+ EXPECT_EQ(strategy_.tile_thread_pool(), nullptr);
+ EXPECT_NE(strategy_.row_thread_pool(0), nullptr);
+ for (int i = 1; i < 8; ++i) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+ }
+ EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
+}
+
+void VerifyFrameParallel(int thread_count, int tile_count, int tile_columns,
+ int expected_frame_threads,
+ const std::vector<int>& expected_tile_threads) {
+ ASSERT_EQ(expected_frame_threads, expected_tile_threads.size());
+ ASSERT_GT(thread_count, 1);
+ std::unique_ptr<ThreadPool> frame_thread_pool;
+ FrameScratchBufferPool frame_scratch_buffer_pool;
+ ASSERT_TRUE(InitializeThreadPoolsForFrameParallel(
+ thread_count, tile_count, tile_columns, &frame_thread_pool,
+ &frame_scratch_buffer_pool));
+ if (expected_frame_threads == 0) {
+ EXPECT_EQ(frame_thread_pool, nullptr);
+ return;
+ }
+ EXPECT_NE(frame_thread_pool.get(), nullptr);
+ EXPECT_EQ(frame_thread_pool->num_threads(), expected_frame_threads);
+ std::vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
+ int actual_thread_count = frame_thread_pool->num_threads();
+ for (int i = 0; i < expected_frame_threads; ++i) {
+ SCOPED_TRACE(absl::StrCat("i: ", i));
+ frame_scratch_buffers.push_back(frame_scratch_buffer_pool.Get());
+ ThreadPool* const thread_pool =
+ frame_scratch_buffers.back()->threading_strategy.thread_pool();
+ if (expected_tile_threads[i] > 0) {
+ EXPECT_NE(thread_pool, nullptr);
+ EXPECT_EQ(thread_pool->num_threads(), expected_tile_threads[i]);
+ actual_thread_count += thread_pool->num_threads();
+ } else {
+ EXPECT_EQ(thread_pool, nullptr);
+ }
+ }
+ EXPECT_EQ(thread_count, actual_thread_count);
+ for (auto& frame_scratch_buffer : frame_scratch_buffers) {
+ frame_scratch_buffer_pool.Release(std::move(frame_scratch_buffer));
+ }
+}
+
+TEST(FrameParallelStrategyTest, FrameParallel) {
+ // This loop has thread_count <= 3 * tile count. So there should be no frame
+ // threads irrespective of the number of tile columns.
+ for (int thread_count = 2; thread_count <= 6; ++thread_count) {
+ VerifyFrameParallel(thread_count, /*tile_count=*/2, /*tile_columns=*/1,
+ /*expected_frame_threads=*/0,
+ /*expected_tile_threads=*/{});
+ VerifyFrameParallel(thread_count, /*tile_count=*/2, /*tile_columns=*/2,
+ /*expected_frame_threads=*/0,
+ /*expected_tile_threads=*/{});
+ }
+
+ // Equal number of tile threads for each frame thread.
+ VerifyFrameParallel(
+ /*thread_count=*/8, /*tile_count=*/1, /*tile_columns=*/1,
+ /*expected_frame_threads=*/4, /*expected_tile_threads=*/{1, 1, 1, 1});
+ VerifyFrameParallel(
+ /*thread_count=*/12, /*tile_count=*/2, /*tile_columns=*/2,
+ /*expected_frame_threads=*/4, /*expected_tile_threads=*/{2, 2, 2, 2});
+ VerifyFrameParallel(
+ /*thread_count=*/18, /*tile_count=*/2, /*tile_columns=*/2,
+ /*expected_frame_threads=*/6,
+ /*expected_tile_threads=*/{2, 2, 2, 2, 2, 2});
+ VerifyFrameParallel(
+ /*thread_count=*/16, /*tile_count=*/3, /*tile_columns=*/3,
+ /*expected_frame_threads=*/4, /*expected_tile_threads=*/{3, 3, 3, 3});
+
+ // Unequal number of tile threads for each frame thread.
+ VerifyFrameParallel(
+ /*thread_count=*/7, /*tile_count=*/1, /*tile_columns=*/1,
+ /*expected_frame_threads=*/3, /*expected_tile_threads=*/{2, 1, 1});
+ VerifyFrameParallel(
+ /*thread_count=*/14, /*tile_count=*/2, /*tile_columns=*/2,
+ /*expected_frame_threads=*/4, /*expected_tile_threads=*/{3, 3, 2, 2});
+ VerifyFrameParallel(
+ /*thread_count=*/20, /*tile_count=*/2, /*tile_columns=*/2,
+ /*expected_frame_threads=*/6,
+ /*expected_tile_threads=*/{3, 3, 2, 2, 2, 2});
+ VerifyFrameParallel(
+ /*thread_count=*/17, /*tile_count=*/3, /*tile_columns=*/3,
+ /*expected_frame_threads=*/4, /*expected_tile_threads=*/{4, 3, 3, 3});
+}
+
+TEST(FrameParallelStrategyTest, ThreadCountDoesNotExceedkMaxThreads) {
+ std::unique_ptr<ThreadPool> frame_thread_pool;
+ FrameScratchBufferPool frame_scratch_buffer_pool;
+ ASSERT_TRUE(InitializeThreadPoolsForFrameParallel(
+ /*thread_count=*/kMaxThreads + 10, /*tile_count=*/2, /*tile_columns=*/2,
+ &frame_thread_pool, &frame_scratch_buffer_pool));
+ EXPECT_NE(frame_thread_pool.get(), nullptr);
+ std::vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
+ int actual_thread_count = frame_thread_pool->num_threads();
+ for (int i = 0; i < frame_thread_pool->num_threads(); ++i) {
+ SCOPED_TRACE(absl::StrCat("i: ", i));
+ frame_scratch_buffers.push_back(frame_scratch_buffer_pool.Get());
+ ThreadPool* const thread_pool =
+ frame_scratch_buffers.back()->threading_strategy.thread_pool();
+ if (thread_pool != nullptr) {
+ actual_thread_count += thread_pool->num_threads();
+ }
+ }
+ // In this case, the exact number of frame threads and tile threads depends on
+ // the value of kMaxThreads. So simply ensure that the total number of threads
+ // does not exceed kMaxThreads.
+ EXPECT_LE(actual_thread_count, kMaxThreads);
+ for (auto& frame_scratch_buffer : frame_scratch_buffers) {
+ frame_scratch_buffer_pool.Release(std::move(frame_scratch_buffer));
+ }
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_TILE_H_
+#define LIBGAV1_SRC_TILE_H_
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
+#include <vector>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/post_filter.h"
+#include "src/quantizer.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile_scratch_buffer.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/memory.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// Indicates what the ProcessSuperBlock() and TransformBlock() functions should
+// do. "Parse" refers to consuming the bitstream, reading the transform
+// coefficients and performing the dequantization. "Decode" refers to computing
+// the prediction, applying the inverse transforms and adding the residual.
+enum ProcessingMode {
+ kProcessingModeParseOnly,
+ kProcessingModeDecodeOnly,
+ kProcessingModeParseAndDecode,
+};
+
+// The alignment requirement is due to the SymbolDecoderContext member
+// symbol_decoder_context_.
+class Tile : public MaxAlignedAllocable {
+ public:
+ static std::unique_ptr<Tile> Create(
+ int tile_number, const uint8_t* const data, size_t size,
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header, RefCountedBuffer* const current_frame,
+ const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer,
+ const WedgeMaskArray& wedge_masks,
+ const QuantizerMatrix& quantizer_matrix,
+ SymbolDecoderContext* const saved_symbol_decoder_context,
+ const SegmentationMap* prev_segment_ids, PostFilter* const post_filter,
+ const dsp::Dsp* const dsp, ThreadPool* const thread_pool,
+ BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
+ bool use_intra_prediction_buffer) {
+ std::unique_ptr<Tile> tile(new (std::nothrow) Tile(
+ tile_number, data, size, sequence_header, frame_header, current_frame,
+ state, frame_scratch_buffer, wedge_masks, quantizer_matrix,
+ saved_symbol_decoder_context, prev_segment_ids, post_filter, dsp,
+ thread_pool, pending_tiles, frame_parallel,
+ use_intra_prediction_buffer));
+ return (tile != nullptr && tile->Init()) ? std::move(tile) : nullptr;
+ }
+
+ // Move only.
+ Tile(Tile&& tile) noexcept;
+ Tile& operator=(Tile&& tile) noexcept;
+ Tile(const Tile&) = delete;
+ Tile& operator=(const Tile&) = delete;
+
+ struct Block; // Defined after this class.
+
+ // Parses the entire tile.
+ bool Parse();
+ // Decodes the entire tile. |superblock_row_progress| and
+ // |superblock_row_progress_condvar| are arrays of size equal to the number of
+ // superblock rows in the frame. Increments |superblock_row_progress[i]| after
+ // each superblock row at index |i| is decoded. If the count reaches the
+ // number of tile columns, then it notifies
+ // |superblock_row_progress_condvar[i]|.
+ bool Decode(std::mutex* mutex, int* superblock_row_progress,
+ std::condition_variable* superblock_row_progress_condvar);
+ // Parses and decodes the entire tile. Depending on the configuration of this
+ // Tile, this function may do multithreaded decoding.
+ bool ParseAndDecode(); // 5.11.2.
+ // Processes all the columns of the superblock row at |row4x4| that are within
+ // this Tile. If |save_symbol_decoder_context| is true, then
+ // SaveSymbolDecoderContext() is invoked for the last superblock row.
+ template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
+ bool ProcessSuperBlockRow(int row4x4, TileScratchBuffer* scratch_buffer);
+
+ const ObuSequenceHeader& sequence_header() const { return sequence_header_; }
+ const ObuFrameHeader& frame_header() const { return frame_header_; }
+ const RefCountedBuffer& current_frame() const { return current_frame_; }
+ const TemporalMotionField& motion_field() const { return motion_field_; }
+ const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias()
+ const {
+ return reference_frame_sign_bias_;
+ }
+
+ bool IsRow4x4Inside(int row4x4) const {
+ return row4x4 >= row4x4_start_ && row4x4 < row4x4_end_;
+ }
+
+ // 5.11.51.
+ bool IsInside(int row4x4, int column4x4) const {
+ return IsRow4x4Inside(row4x4) && column4x4 >= column4x4_start_ &&
+ column4x4 < column4x4_end_;
+ }
+
+ bool IsLeftInside(int column4x4) const {
+ // We use "larger than" as the condition. Don't pass in the left column
+ // offset column4x4 - 1.
+ assert(column4x4 <= column4x4_end_);
+ return column4x4 > column4x4_start_;
+ }
+
+ bool IsTopInside(int row4x4) const {
+ // We use "larger than" as the condition. Don't pass in the top row offset
+ // row4x4 - 1.
+ assert(row4x4 <= row4x4_end_);
+ return row4x4 > row4x4_start_;
+ }
+
+ bool IsTopLeftInside(int row4x4, int column4x4) const {
+ // We use "larger than" as the condition. Don't pass in the top row offset
+ // row4x4 - 1 or the left column offset column4x4 - 1.
+ assert(row4x4 <= row4x4_end_);
+ assert(column4x4 <= column4x4_end_);
+ return row4x4 > row4x4_start_ && column4x4 > column4x4_start_;
+ }
+
+ bool IsBottomRightInside(int row4x4, int column4x4) const {
+ assert(row4x4 >= row4x4_start_);
+ assert(column4x4 >= column4x4_start_);
+ return row4x4 < row4x4_end_ && column4x4 < column4x4_end_;
+ }
+
+ BlockParameters** BlockParametersAddress(int row4x4, int column4x4) const {
+ return block_parameters_holder_.Address(row4x4, column4x4);
+ }
+
+ int BlockParametersStride() const {
+ return block_parameters_holder_.columns4x4();
+ }
+
+ // Returns true if Parameters() can be called with |row| and |column| as
+ // inputs, false otherwise.
+ bool HasParameters(int row, int column) const {
+ return block_parameters_holder_.Find(row, column) != nullptr;
+ }
+ const BlockParameters& Parameters(int row, int column) const {
+ return *block_parameters_holder_.Find(row, column);
+ }
+
+ int number() const { return number_; }
+ int superblock_rows() const { return superblock_rows_; }
+ int superblock_columns() const { return superblock_columns_; }
+ int row4x4_start() const { return row4x4_start_; }
+ int column4x4_start() const { return column4x4_start_; }
+ int column4x4_end() const { return column4x4_end_; }
+
+ private:
+ // Stores the transform tree state when reading variable size transform trees
+ // and when applying the transform tree. When applying the transform tree,
+ // |depth| is not used.
+ struct TransformTreeNode {
+ // The default constructor is invoked by the Stack<TransformTreeNode, n>
+ // constructor. Stack<> does not use the default-constructed elements, so it
+ // is safe for the default constructor to not initialize the members.
+ TransformTreeNode() = default;
+ TransformTreeNode(int x, int y, TransformSize tx_size, int depth = -1)
+ : x(x), y(y), tx_size(tx_size), depth(depth) {}
+
+ int x;
+ int y;
+ TransformSize tx_size;
+ int depth;
+ };
+
+ // Enum to track the processing state of a superblock.
+ enum SuperBlockState : uint8_t {
+ kSuperBlockStateNone, // Not yet parsed or decoded.
+ kSuperBlockStateParsed, // Parsed but not yet decoded.
+ kSuperBlockStateScheduled, // Scheduled for decoding.
+ kSuperBlockStateDecoded // Parsed and decoded.
+ };
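+
+ // In the threaded parse/decode flow a superblock advances through these
+ // states in order: kSuperBlockStateNone -> kSuperBlockStateParsed ->
+ // kSuperBlockStateScheduled -> kSuperBlockStateDecoded.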
+
+ // Parameters used to facilitate multi-threading within the Tile.
+ struct ThreadingParameters {
+ std::mutex mutex;
+ // 2d array of size |superblock_rows_| by |superblock_columns_| containing
+ // the processing state of each superblock.
+ Array2D<SuperBlockState> sb_state LIBGAV1_GUARDED_BY(mutex);
+ // Variable used to indicate either parse or decode failure.
+ bool abort LIBGAV1_GUARDED_BY(mutex) = false;
+ int pending_jobs LIBGAV1_GUARDED_BY(mutex) = 0;
+ std::condition_variable pending_jobs_zero_condvar;
+ };
+
+ // The residual pointer is used to traverse the |residual_buffer_|. It is
+ // used in two different ways.
+ // If |split_parse_and_decode_| is true:
+ // The pointer points to the beginning of the |residual_buffer_| when the
+ // "parse" and "decode" steps begin. It is then moved forward tx_size in
+ // each iteration of the "parse" and the "decode" steps. In this case, the
+ // ResidualPtr variable passed into various functions starting from
+ // ProcessSuperBlock is used as an in/out parameter to keep track of the
+ // residual pointer.
+ // If |split_parse_and_decode_| is false:
+ // The pointer is reset to the beginning of the |residual_buffer_| for
+ // every transform block.
+ using ResidualPtr = uint8_t*;
+
+ Tile(int tile_number, const uint8_t* data, size_t size,
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header, RefCountedBuffer* current_frame,
+ const DecoderState& state, FrameScratchBuffer* frame_scratch_buffer,
+ const WedgeMaskArray& wedge_masks,
+ const QuantizerMatrix& quantizer_matrix,
+ SymbolDecoderContext* saved_symbol_decoder_context,
+ const SegmentationMap* prev_segment_ids, PostFilter* post_filter,
+ const dsp::Dsp* dsp, ThreadPool* thread_pool,
+ BlockingCounterWithStatus* pending_tiles, bool frame_parallel,
+ bool use_intra_prediction_buffer);
+
+ // Performs member initializations that may fail. Helper function used by
+ // Create().
+ LIBGAV1_MUST_USE_RESULT bool Init();
+
+ // Saves the symbol decoder context of this tile into
+ // |saved_symbol_decoder_context_| if necessary.
+ void SaveSymbolDecoderContext();
+
+ // Entry point for multi-threaded decoding. This function performs the same
+ // functionality as ParseAndDecode(). The current thread does the "parse" step
+ // while the worker threads do the "decode" step.
+ bool ThreadedParseAndDecode();
+
+ // Returns whether or not the prerequisites for decoding the superblock at
+ // |row_index| and |column_index| are satisfied. |threading_.mutex| must be
+ // held when calling this function.
+ bool CanDecode(int row_index, int column_index) const;
+
+ // This function is run by the worker threads when multi-threaded decoding is
+ // enabled. Once a superblock is decoded, this function will set the
+ // corresponding |threading_.sb_state| entry to kSuperBlockStateDecoded. On
+ // failure, |threading_.abort| will be set to true. If at any point
+ // |threading_.abort| becomes true, this function will return as early as it
+ // can. If the decoding succeeds, this function will also schedule the
+ // decoding jobs for the superblock to the bottom-left and the superblock to
+ // the right of this superblock (if it is allowed).
+ void DecodeSuperBlock(int row_index, int column_index, int block_width4x4);
+
+ // If |use_intra_prediction_buffer_| is true, then this function copies the
+ // last row of the superblock row starting at |row4x4| into the
+ // |intra_prediction_buffer_| (which may be used by the intra prediction
+ // process for the next superblock row).
+ void PopulateIntraPredictionBuffer(int row4x4);
+
+ uint16_t* GetPartitionCdf(int row4x4, int column4x4, BlockSize block_size);
+ bool ReadPartition(int row4x4, int column4x4, BlockSize block_size,
+ bool has_rows, bool has_columns, Partition* partition);
+ // Processes the Partition starting at |row4x4_start|, |column4x4_start|
+ // iteratively. It performs a DFS traversal over the partition tree to process
+ // the blocks in the right order.
+ bool ProcessPartition(
+ int row4x4_start, int column4x4_start, TileScratchBuffer* scratch_buffer,
+ ResidualPtr* residual); // Iterative implementation of 5.11.4.
+ bool ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
+ TileScratchBuffer* scratch_buffer,
+ ResidualPtr* residual); // 5.11.5.
+ void ResetCdef(int row4x4, int column4x4); // 5.11.55.
+
+ // This function is used to decode a superblock when the parsing has already
+ // been done for that superblock.
+ bool DecodeSuperBlock(int sb_row_index, int sb_column_index,
+ TileScratchBuffer* scratch_buffer);
+ // Helper function used by DecodeSuperBlock(). Note that the decode_block()
+ // function in the spec is equivalent to ProcessBlock() in the code.
+ bool DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
+ TileScratchBuffer* scratch_buffer, ResidualPtr* residual);
+
+ void ClearBlockDecoded(TileScratchBuffer* scratch_buffer, int row4x4,
+ int column4x4); // 5.11.3.
+ bool ProcessSuperBlock(int row4x4, int column4x4,
+ TileScratchBuffer* scratch_buffer,
+ ProcessingMode mode);
+ void ResetLoopRestorationParams();
+ void ReadLoopRestorationCoefficients(int row4x4, int column4x4,
+ BlockSize block_size); // 5.11.57.
+
+ // Helper functions for DecodeBlock.
+ bool ReadSegmentId(const Block& block); // 5.11.9.
+ bool ReadIntraSegmentId(const Block& block); // 5.11.8.
+ void ReadSkip(const Block& block); // 5.11.11.
+ bool ReadSkipMode(const Block& block); // 5.11.10.
+ void ReadCdef(const Block& block); // 5.11.56.
+ // Returns the new value. |cdf| is an array of size kDeltaSymbolCount + 1.
+ int ReadAndClipDelta(uint16_t* cdf, int delta_small, int scale, int min_value,
+ int max_value, int value);
+ void ReadQuantizerIndexDelta(const Block& block); // 5.11.12.
+ void ReadLoopFilterDelta(const Block& block); // 5.11.13.
+ // Populates |BlockParameters::deblock_filter_level| for the given |block|
+ // using |deblock_filter_levels_|.
+ void PopulateDeblockFilterLevel(const Block& block);
+ void PopulateCdefSkip(const Block& block);
+ void ReadPredictionModeY(const Block& block, bool intra_y_mode);
+ void ReadIntraAngleInfo(const Block& block,
+ PlaneType plane_type); // 5.11.42 and 5.11.43.
+ void ReadPredictionModeUV(const Block& block);
+ void ReadCflAlpha(const Block& block); // 5.11.45.
+ int GetPaletteCache(const Block& block, PlaneType plane_type,
+ uint16_t* cache);
+ void ReadPaletteColors(const Block& block, Plane plane);
+ void ReadPaletteModeInfo(const Block& block); // 5.11.46.
+ void ReadFilterIntraModeInfo(const Block& block); // 5.11.24.
+ int ReadMotionVectorComponent(const Block& block,
+ int component); // 5.11.32.
+ void ReadMotionVector(const Block& block, int index); // 5.11.31.
+ bool DecodeIntraModeInfo(const Block& block); // 5.11.7.
+ int8_t ComputePredictedSegmentId(const Block& block) const; // 5.11.21.
+ bool ReadInterSegmentId(const Block& block, bool pre_skip); // 5.11.19.
+ void ReadIsInter(const Block& block, bool skip_mode); // 5.11.20.
+ bool ReadIntraBlockModeInfo(const Block& block,
+ bool intra_y_mode); // 5.11.22.
+ CompoundReferenceType ReadCompoundReferenceType(const Block& block);
+ template <bool is_single, bool is_backward, int index>
+ uint16_t* GetReferenceCdf(const Block& block, CompoundReferenceType type =
+ kNumCompoundReferenceTypes);
+ void ReadReferenceFrames(const Block& block, bool skip_mode); // 5.11.25.
+ void ReadInterPredictionModeY(const Block& block,
+ const MvContexts& mode_contexts,
+ bool skip_mode);
+ void ReadRefMvIndex(const Block& block);
+ void ReadInterIntraMode(const Block& block, bool is_compound,
+ bool skip_mode); // 5.11.28.
+ bool IsScaled(ReferenceFrameType type) const { // Part of 5.11.27.
+ const int index =
+ frame_header_.reference_frame_index[type - kReferenceFrameLast];
+ return reference_frames_[index]->upscaled_width() != frame_header_.width ||
+ reference_frames_[index]->frame_height() != frame_header_.height;
+ }
+ void ReadMotionMode(const Block& block, bool is_compound,
+ bool skip_mode); // 5.11.27.
+ uint16_t* GetIsExplicitCompoundTypeCdf(const Block& block);
+ uint16_t* GetIsCompoundTypeAverageCdf(const Block& block);
+ void ReadCompoundType(const Block& block, bool is_compound, bool skip_mode,
+ bool* is_explicit_compound_type,
+ bool* is_compound_type_average); // 5.11.29.
+ uint16_t* GetInterpolationFilterCdf(const Block& block, int direction);
+ void ReadInterpolationFilter(const Block& block, bool skip_mode);
+ bool ReadInterBlockModeInfo(const Block& block, bool skip_mode); // 5.11.23.
+ bool DecodeInterModeInfo(const Block& block); // 5.11.18.
+ bool DecodeModeInfo(const Block& block); // 5.11.6.
+ bool IsMvValid(const Block& block, bool is_compound) const; // 6.10.25.
+ bool AssignInterMv(const Block& block, bool is_compound); // 5.11.26.
+ bool AssignIntraMv(const Block& block); // 5.11.26.
+ int GetTopTransformWidth(const Block& block, int row4x4, int column4x4,
+ bool ignore_skip);
+ int GetLeftTransformHeight(const Block& block, int row4x4, int column4x4,
+ bool ignore_skip);
+ TransformSize ReadFixedTransformSize(const Block& block); // 5.11.15.
+ // Iterative implementation of 5.11.17.
+ void ReadVariableTransformTree(const Block& block, int row4x4, int column4x4,
+ TransformSize tx_size);
+ void DecodeTransformSize(const Block& block); // 5.11.16.
+ bool ComputePrediction(const Block& block); // 5.11.33.
+ // |x4| and |y4| are the column and row positions of the 4x4 block. |w4| and
+ // |h4| are the width and height in 4x4 units of |tx_size|.
+ int GetTransformAllZeroContext(const Block& block, Plane plane,
+ TransformSize tx_size, int x4, int y4, int w4,
+ int h4);
+ TransformSet GetTransformSet(TransformSize tx_size,
+ bool is_inter) const; // 5.11.48.
+ TransformType ComputeTransformType(const Block& block, Plane plane,
+ TransformSize tx_size, int block_x,
+ int block_y); // 5.11.40.
+ void ReadTransformType(const Block& block, int x4, int y4,
+ TransformSize tx_size); // 5.11.47.
+ template <typename ResidualType>
+ void ReadCoeffBase2D(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer, uint8_t* level_buffer);
+ template <typename ResidualType>
+ void ReadCoeffBaseHorizontal(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer, uint8_t* level_buffer);
+ template <typename ResidualType>
+ void ReadCoeffBaseVertical(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer, uint8_t* level_buffer);
+ int GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane);
+ void SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
+ uint8_t coefficient_level, int8_t dc_category);
+ void InterIntraPrediction(
+ uint16_t* prediction_0, const uint8_t* prediction_mask,
+ ptrdiff_t prediction_mask_stride,
+ const PredictionParameters& prediction_parameters, int prediction_width,
+ int prediction_height, int subsampling_x, int subsampling_y,
+ uint8_t* dest,
+ ptrdiff_t dest_stride); // Part of section 7.11.3.1 in the spec.
+ void CompoundInterPrediction(
+ const Block& block, const uint8_t* prediction_mask,
+ ptrdiff_t prediction_mask_stride, int prediction_width,
+ int prediction_height, int subsampling_x, int subsampling_y,
+ int candidate_row, int candidate_column, uint8_t* dest,
+ ptrdiff_t dest_stride); // Part of section 7.11.3.1 in the spec.
+ GlobalMotion* GetWarpParams(const Block& block, Plane plane,
+ int prediction_width, int prediction_height,
+ const PredictionParameters& prediction_parameters,
+ ReferenceFrameType reference_type,
+ bool* is_local_valid,
+ GlobalMotion* global_motion_params,
+ GlobalMotion* local_warp_params)
+ const; // Part of section 7.11.3.1 in the spec.
+ bool InterPrediction(const Block& block, Plane plane, int x, int y,
+ int prediction_width, int prediction_height,
+ int candidate_row, int candidate_column,
+ bool* is_local_valid,
+ GlobalMotion* local_warp_params); // 7.11.3.1.
+ void ScaleMotionVector(const MotionVector& mv, Plane plane,
+ int reference_frame_index, int x, int y, int* start_x,
+ int* start_y, int* step_x, int* step_y); // 7.11.3.3.
+ // If the method returns false, the caller only uses the output parameters
+ // *ref_block_start_x and *ref_block_start_y. If the method returns true, the
+ // caller uses all four output parameters.
+ static bool GetReferenceBlockPosition(
+ int reference_frame_index, bool is_scaled, int width, int height,
+ int ref_start_x, int ref_last_x, int ref_start_y, int ref_last_y,
+ int start_x, int start_y, int step_x, int step_y, int left_border,
+ int right_border, int top_border, int bottom_border,
+ int* ref_block_start_x, int* ref_block_start_y, int* ref_block_end_x,
+ int* ref_block_end_y);
+
+ template <typename Pixel>
+ void BuildConvolveBlock(Plane plane, int reference_frame_index,
+ bool is_scaled, int height, int ref_start_x,
+ int ref_last_x, int ref_start_y, int ref_last_y,
+ int step_y, int ref_block_start_x,
+ int ref_block_end_x, int ref_block_start_y,
+ uint8_t* block_buffer,
+ ptrdiff_t convolve_buffer_stride,
+ ptrdiff_t block_extended_width);
+ bool BlockInterPrediction(const Block& block, Plane plane,
+ int reference_frame_index, const MotionVector& mv,
+ int x, int y, int width, int height,
+ int candidate_row, int candidate_column,
+ uint16_t* prediction, bool is_compound,
+ bool is_inter_intra, uint8_t* dest,
+ ptrdiff_t dest_stride); // 7.11.3.4.
+ bool BlockWarpProcess(const Block& block, Plane plane, int index,
+ int block_start_x, int block_start_y, int width,
+ int height, GlobalMotion* warp_params, bool is_compound,
+ bool is_inter_intra, uint8_t* dest,
+ ptrdiff_t dest_stride); // 7.11.3.5.
+ bool ObmcBlockPrediction(const Block& block, const MotionVector& mv,
+ Plane plane, int reference_frame_index, int width,
+ int height, int x, int y, int candidate_row,
+ int candidate_column,
+ ObmcDirection blending_direction);
+ bool ObmcPrediction(const Block& block, Plane plane, int width,
+ int height); // 7.11.3.9.
+ void DistanceWeightedPrediction(void* prediction_0, void* prediction_1,
+ int width, int height, int candidate_row,
+ int candidate_column, uint8_t* dest,
+ ptrdiff_t dest_stride); // 7.11.3.15.
+ // This function specializes the parsing of the DC coefficient by removing some of
+ // the branches when i == 0 (since scan[0] is always 0 and scan[i] is always
+ // non-zero for all other possible values of i). |dc_category| is an output
+ // parameter that is populated when |is_dc_coefficient| is true.
+ // |coefficient_level| is an output parameter which accumulates the
+ // coefficient level.
+ template <typename ResidualType, bool is_dc_coefficient>
+ LIBGAV1_ALWAYS_INLINE bool ReadSignAndApplyDequantization(
+ const uint16_t* scan, int i, int q_value, const uint8_t* quantizer_matrix,
+ int shift, int max_value, uint16_t* dc_sign_cdf, int8_t* dc_category,
+ int* coefficient_level,
+ ResidualType* residual_buffer); // Part of 5.11.39.
+ int ReadCoeffBaseRange(uint16_t* cdf); // Part of 5.11.39.
+ // Returns the number of non-zero coefficients that were read. |tx_type| is an
+ // output parameter that stores the computed transform type for the plane
+ // whose coefficients were read. Returns -1 on failure.
+ template <typename ResidualType>
+ int ReadTransformCoefficients(const Block& block, Plane plane, int start_x,
+ int start_y, TransformSize tx_size,
+ TransformType* tx_type); // 5.11.39.
+ bool TransformBlock(const Block& block, Plane plane, int base_x, int base_y,
+ TransformSize tx_size, int x, int y,
+ ProcessingMode mode); // 5.11.35.
+ // Iterative implementation of 5.11.36.
+ bool TransformTree(const Block& block, int start_x, int start_y,
+ BlockSize plane_size, ProcessingMode mode);
+ void ReconstructBlock(const Block& block, Plane plane, int start_x,
+ int start_y, TransformSize tx_size,
+ TransformType tx_type,
+ int non_zero_coeff_count); // Part of 7.12.3.
+ bool Residual(const Block& block, ProcessingMode mode); // 5.11.34.
+ // part of 5.11.5 (reset_block_context() in the spec).
+ void ResetEntropyContext(const Block& block);
+ // Populates the |color_context| and |color_order| for the |i|th iteration
+ // with entries counting down from |start| to |end| (|start| > |end|).
+ void PopulatePaletteColorContexts(
+ const Block& block, PlaneType plane_type, int i, int start, int end,
+ uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize],
+ uint8_t color_context[kMaxPaletteSquare]); // 5.11.50.
+ bool ReadPaletteTokens(const Block& block); // 5.11.49.
+ template <typename Pixel>
+ void IntraPrediction(const Block& block, Plane plane, int x, int y,
+ bool has_left, bool has_top, bool has_top_right,
+ bool has_bottom_left, PredictionMode mode,
+ TransformSize tx_size);
+ int GetIntraEdgeFilterType(const Block& block,
+ Plane plane) const; // 7.11.2.8.
+ template <typename Pixel>
+ void DirectionalPrediction(const Block& block, Plane plane, int x, int y,
+ bool has_left, bool has_top, bool needs_left,
+ bool needs_top, int prediction_angle, int width,
+ int height, int max_x, int max_y,
+ TransformSize tx_size, Pixel* top_row,
+ Pixel* left_column); // 7.11.2.4.
+ template <typename Pixel>
+ void PalettePrediction(const Block& block, Plane plane, int start_x,
+ int start_y, int x, int y,
+ TransformSize tx_size); // 7.11.4.
+ template <typename Pixel>
+ void ChromaFromLumaPrediction(const Block& block, Plane plane, int start_x,
+ int start_y,
+ TransformSize tx_size); // 7.11.5.
+ // Section 7.19. Applies some filtering and reordering to the motion vectors
+ // for the given |block| and stores them into |current_frame_|.
+ void StoreMotionFieldMvsIntoCurrentFrame(const Block& block);
+
+ // SetCdfContext*() functions will populate the |left_context_| and
+ // |top_context_| for the |block|.
+ void SetCdfContextUsePredictedSegmentId(const Block& block,
+ bool use_predicted_segment_id);
+ void SetCdfContextCompoundType(const Block& block,
+ bool is_explicit_compound_type,
+ bool is_compound_type_average);
+ void SetCdfContextSkipMode(const Block& block, bool skip_mode);
+ void SetCdfContextPaletteSize(const Block& block);
+ void SetCdfContextUVMode(const Block& block);
+
+ // Returns the zero-based index of the super block that contains |row4x4|
+ // relative to the start of this tile.
+ int SuperBlockRowIndex(int row4x4) const {
+ return (row4x4 - row4x4_start_) >>
+ (sequence_header_.use_128x128_superblock ? 5 : 4);
+ }
+
+ // Returns the zero-based index of the super block that contains |column4x4|
+ // relative to the start of this tile.
+ int SuperBlockColumnIndex(int column4x4) const {
+ return (column4x4 - column4x4_start_) >>
+ (sequence_header_.use_128x128_superblock ? 5 : 4);
+ }
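+
+ // Worked example for the two index helpers above: with 64x64 superblocks the
+ // shift is 4, so a block at |row4x4| = |row4x4_start_| + 19 lies in
+ // superblock row 19 >> 4 = 1 of this tile.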
+
+ // Returns the zero-based index of the block that starts at row4x4 or
+ // column4x4 relative to the start of the superblock that contains the block.
+ // This is used to index into the members of |left_context_| and
+ // |top_context_|.
+ int CdfContextIndex(int row_or_column4x4) const {
+ return row_or_column4x4 -
+ (row_or_column4x4 &
+ (sequence_header_.use_128x128_superblock ? ~31 : ~15));
+ }
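+
+ // Worked example: with 64x64 superblocks, ~15 masks off the low 4 bits, so
+ // the result reduces to row_or_column4x4 % 16 (e.g. 37 - (37 & ~15) == 37 -
+ // 32 == 5). With 128x128 superblocks it is row_or_column4x4 % 32.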
+
+ BlockSize SuperBlockSize() const {
+ return sequence_header_.use_128x128_superblock ? kBlock128x128
+ : kBlock64x64;
+ }
+ int PlaneCount() const {
+ return sequence_header_.color_config.is_monochrome ? kMaxPlanesMonochrome
+ : kMaxPlanes;
+ }
+
+ const int number_;
+ const int row_;
+ const int column_;
+ const uint8_t* const data_;
+ size_t size_;
+ int row4x4_start_;
+ int row4x4_end_;
+ int column4x4_start_;
+ int column4x4_end_;
+ int superblock_rows_;
+ int superblock_columns_;
+ bool read_deltas_;
+ const int8_t subsampling_x_[kMaxPlanes];
+ const int8_t subsampling_y_[kMaxPlanes];
+
+ // The dimensions (in order) are: segment_id, level_index (based on plane and
+ // direction), reference_frame and mode_id.
+ uint8_t deblock_filter_levels_[kMaxSegments][kFrameLfCount]
+ [kNumReferenceFrameTypes][2];
+
+ // current_quantizer_index_ is in the range [0, 255].
+ uint8_t current_quantizer_index_;
+ // These two arrays (|coefficient_levels_| and |dc_categories_|) are used to
+ // store the entropy context. Their dimensions are as follows: First -
+ // left/top; Second - plane; Third - row4x4 (if first dimension is
+ // left)/column4x4 (if first dimension is top).
+ //
+ // This is equivalent to the LeftLevelContext and AboveLevelContext arrays in
+ // the spec. In the spec, it stores values from 0 through 63 (inclusive). The
+ // stored values are used to compute the left and top contexts in
+ // GetTransformAllZeroContext. In that function, we only care about the
+ // following values: 0, 1, 2, 3 and >= 4. So instead of clamping to 63, we
+ // clamp to 4 (i.e.) all the values greater than 4 are stored as 4.
+ std::array<Array2D<uint8_t>, 2> coefficient_levels_;
+ // This is equivalent to the LeftDcContext and AboveDcContext arrays in the
+ // spec. In the spec, it can store 3 possible values: 0, 1 and 2 (where 1
+ // means the value is < 0, 2 means the value is > 0 and 0 means the value is
+ // equal to 0).
+ //
+ // The stored values are used in two places:
+ // * GetTransformAllZeroContext: Here, we only care about whether the
+ // value is 0 or not (whether it is 1 or 2 is irrelevant).
+ // * GetDcSignContext: Here, we do the following computation: if the
+ // stored value is 1, we decrement a counter. If the stored value is 2
+ // we increment a counter.
+ //
+ // Based on this usage, we can simply replace 1 with -1 and 2 with 1 and
+ // use that value to compute the counter.
+ //
+ // The usage on GetTransformAllZeroContext is unaffected since there we
+ // only care about whether it is 0 or not.
+ std::array<Array2D<int8_t>, 2> dc_categories_;
+ const ObuSequenceHeader& sequence_header_;
+ const ObuFrameHeader& frame_header_;
+ const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias_;
+ const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+ reference_frames_;
+ TemporalMotionField& motion_field_;
+ const std::array<uint8_t, kNumReferenceFrameTypes>& reference_order_hint_;
+ const WedgeMaskArray& wedge_masks_;
+ const QuantizerMatrix& quantizer_matrix_;
+ EntropyDecoder reader_;
+ SymbolDecoderContext symbol_decoder_context_;
+ SymbolDecoderContext* const saved_symbol_decoder_context_;
+ const SegmentationMap* prev_segment_ids_;
+ const dsp::Dsp& dsp_;
+ PostFilter& post_filter_;
+ BlockParametersHolder& block_parameters_holder_;
+ Quantizer quantizer_;
+ // When there is no multi-threading within the Tile, |residual_buffer_| is
+ // used. When there is multi-threading within the Tile,
+ // |residual_buffer_threaded_| is used. In the following comment,
+ // |residual_buffer| refers to either |residual_buffer_| or
+ // |residual_buffer_threaded_| depending on whether multi-threading is enabled
+ // within the Tile or not.
+ // The |residual_buffer| is used to help with the dequantization and the
+ // inverse transform processes. It is declared as a uint8_t, but is always
+ // accessed either as an int16_t or int32_t depending on |bitdepth|. Here is
+ // what it stores at various stages of the decoding process (in the order
+ // which they happen):
+ // 1) In ReadTransformCoefficients(), this buffer is used to store the
+ // dequantized values.
+ // 2) In Reconstruct(), this buffer is used as the input to the row
+ // transform process.
+ // The size of this buffer would be:
+ // For |residual_buffer_|: (4096 + 32 * |kResidualPaddingVertical|) *
+ // |residual_size_|. Where 4096 = 64x64 which is the maximum transform
+ // size, and 32 * |kResidualPaddingVertical| is the padding to avoid
+ // bottom boundary checks when parsing quantized coefficients. This
+ // memory is allocated and owned by the Tile class.
+ // For |residual_buffer_threaded_|: See the comment below. This memory is
+ // not allocated or owned by the Tile class.
+ AlignedUniquePtr<uint8_t> residual_buffer_;
+ // This is a 2d array of pointers of size |superblock_rows_| by
+ // |superblock_columns_| where each pointer points to a ResidualBuffer for a
+ // single super block. The array is populated when the parsing process begins
+ // by calling |residual_buffer_pool_->Get()| and the memory is released back
+ // to the pool by calling |residual_buffer_pool_->Release()| when the decoding
+ // process is complete.
+ Array2D<std::unique_ptr<ResidualBuffer>> residual_buffer_threaded_;
+ // sizeof(int16_t or int32_t) depending on |bitdepth|.
+ const size_t residual_size_;
+ // Number of superblocks on the top-right that will have to be decoded before
+ // the current superblock can be decoded. This will be 1 if allow_intrabc is
+ // false. If allow_intrabc is true, then this value will be
+ // use_128x128_superblock ? 3 : 5. This is the allowed range of reference for
+ // the top rows for intrabc.
+ const int intra_block_copy_lag_;
+
+ // In the Tile class, we use the "current_frame" in two ways:
+ // 1) To write the decoded output into (using the |buffer_| view).
+ // 2) To read the pixels for intra block copy (using the |current_frame_|
+ // reference).
+ //
+ // When intra block copy is off, |buffer_| and |current_frame_| may or may not
+ // point to the same plane pointers. But it is okay since |current_frame_| is
+ // never used in this case.
+ //
+ // When intra block copy is on, |buffer_| and |current_frame_| always point to
+ // the same plane pointers (since post filtering is disabled). So the usage in
+ // both case 1 and case 2 remain valid.
+ Array2DView<uint8_t> buffer_[kMaxPlanes];
+ RefCountedBuffer& current_frame_;
+
+ Array2D<int8_t>& cdef_index_;
+ Array2D<uint8_t>& cdef_skip_;
+ Array2D<TransformSize>& inter_transform_sizes_;
+ std::array<RestorationUnitInfo, kMaxPlanes> reference_unit_info_;
+ // If |thread_pool_| is nullptr, the calling thread will do the parsing and
+ // the decoding in one pass. If |thread_pool_| is not nullptr, then the main
+ // thread will do the parsing while the thread pool workers will do the
+ // decoding.
+ ThreadPool* const thread_pool_;
+ ThreadingParameters threading_;
+ ResidualBufferPool* const residual_buffer_pool_;
+ TileScratchBufferPool* const tile_scratch_buffer_pool_;
+ BlockingCounterWithStatus* const pending_tiles_;
+ bool split_parse_and_decode_;
+ // This is used only when |split_parse_and_decode_| is false.
+ std::unique_ptr<PredictionParameters> prediction_parameters_ = nullptr;
+ // Stores the |transform_type| for the super block being decoded at a 4x4
+ // granularity. The spec uses absolute indices for this array but it is
+ // sufficient to use indices relative to the super block being decoded.
+ TransformType transform_types_[32][32];
+ // delta_lf_[i] is in the range [-63, 63].
+ int8_t delta_lf_[kFrameLfCount];
+ // True if all the values in |delta_lf_| are zero. False otherwise.
+ bool delta_lf_all_zero_;
+ const bool frame_parallel_;
+ const bool use_intra_prediction_buffer_;
+ // Buffer used to store the unfiltered pixels that are necessary for decoding
+ // the next superblock row (for the intra prediction process). Used only if
+ // |use_intra_prediction_buffer_| is true. The |frame_scratch_buffer| contains
+ // one row buffer for each tile row. This tile will have to use the buffer
+ // corresponding to this tile's row.
+ IntraPredictionBuffer* const intra_prediction_buffer_;
+ // Stores the progress of the reference frames. This will be used to avoid
+ // unnecessary calls into RefCountedBuffer::WaitUntil().
+ std::array<int, kNumReferenceFrameTypes> reference_frame_progress_cache_;
+ // Stores the CDF contexts necessary for the "left" block.
+ BlockCdfContext left_context_;
+ // Stores the CDF contexts necessary for the "top" block. The size of this
+ // buffer is the number of superblock columns in this tile. For each block,
+ // the access index will be the corresponding SuperBlockColumnIndex()'th
+ // entry.
+ DynamicBuffer<BlockCdfContext> top_context_;
+};
+
+struct Tile::Block {
+ Block(Tile* tile_ptr, BlockSize size, int row4x4, int column4x4,
+ TileScratchBuffer* const scratch_buffer, ResidualPtr* residual)
+ : tile(*tile_ptr),
+ size(size),
+ row4x4(row4x4),
+ column4x4(column4x4),
+ width(kBlockWidthPixels[size]),
+ height(kBlockHeightPixels[size]),
+ width4x4(width >> 2),
+ height4x4(height >> 2),
+ scratch_buffer(scratch_buffer),
+ residual(residual),
+ top_context(tile.top_context_.get() +
+ tile.SuperBlockColumnIndex(column4x4)),
+ top_context_index(tile.CdfContextIndex(column4x4)),
+ left_context_index(tile.CdfContextIndex(row4x4)) {
+ assert(size != kBlockInvalid);
+ residual_size[kPlaneY] = kPlaneResidualSize[size][0][0];
+ residual_size[kPlaneU] = residual_size[kPlaneV] =
+ kPlaneResidualSize[size][tile.subsampling_x_[kPlaneU]]
+ [tile.subsampling_y_[kPlaneU]];
+ assert(residual_size[kPlaneY] != kBlockInvalid);
+ if (tile.PlaneCount() > 1) {
+ assert(residual_size[kPlaneU] != kBlockInvalid);
+ }
+ if ((row4x4 & 1) == 0 &&
+ (tile.sequence_header_.color_config.subsampling_y & height4x4) == 1) {
+ has_chroma = false;
+ } else if ((column4x4 & 1) == 0 &&
+ (tile.sequence_header_.color_config.subsampling_x & width4x4) ==
+ 1) {
+ has_chroma = false;
+ } else {
+ has_chroma = !tile.sequence_header_.color_config.is_monochrome;
+ }
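+ // For example, with 4:2:0 subsampling (subsampling_x == subsampling_y == 1)
+ // a 4x4 block has width4x4 == height4x4 == 1, so has_chroma is true only
+ // when both row4x4 and column4x4 are odd (and the stream is not
+ // monochrome); the even-positioned sibling blocks leave chroma to the odd
+ // one.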
+ top_available[kPlaneY] = tile.IsTopInside(row4x4);
+ left_available[kPlaneY] = tile.IsLeftInside(column4x4);
+ if (has_chroma) {
+ // top_available[kPlaneU] and top_available[kPlaneV] are valid only if
+ // has_chroma is true.
+ // The next 3 lines are equivalent to:
+ // top_available[kPlaneU] = top_available[kPlaneV] =
+ // top_available[kPlaneY] &&
+ // ((tile.sequence_header_.color_config.subsampling_y & height4x4) ==
+ // 0 || tile.IsTopInside(row4x4 - 1));
+ top_available[kPlaneU] = top_available[kPlaneV] = tile.IsTopInside(
+ row4x4 -
+ (tile.sequence_header_.color_config.subsampling_y & height4x4));
+ // left_available[kPlaneU] and left_available[kPlaneV] are valid only if
+ // has_chroma is true.
+ // The next 3 lines are equivalent to:
+ // left_available[kPlaneU] = left_available[kPlaneV] =
+ // left_available[kPlaneY] &&
+ // ((tile.sequence_header_.color_config.subsampling_x & width4x4) == 0
+ // || tile.IsLeftInside(column4x4 - 1));
+ left_available[kPlaneU] = left_available[kPlaneV] = tile.IsLeftInside(
+ column4x4 -
+ (tile.sequence_header_.color_config.subsampling_x & width4x4));
+ }
+ const ptrdiff_t stride = tile.BlockParametersStride();
+ BlockParameters** const bps =
+ tile.BlockParametersAddress(row4x4, column4x4);
+ bp = *bps;
+ // bp_top is valid only if top_available[kPlaneY] is true.
+ if (top_available[kPlaneY]) {
+ bp_top = *(bps - stride);
+ }
+ // bp_left is valid only if left_available[kPlaneY] is true.
+ if (left_available[kPlaneY]) {
+ bp_left = *(bps - 1);
+ }
+ }
+
+ bool HasChroma() const { return has_chroma; }
+
+ // The return values of this group of functions are valid only if the
+ // corresponding top_available or left_available is true.
+ ReferenceFrameType TopReference(int index) const {
+ return bp_top->reference_frame[index];
+ }
+
+ ReferenceFrameType LeftReference(int index) const {
+ return bp_left->reference_frame[index];
+ }
+
+ bool IsTopIntra() const { return TopReference(0) <= kReferenceFrameIntra; }
+ bool IsLeftIntra() const { return LeftReference(0) <= kReferenceFrameIntra; }
+
+ bool IsTopSingle() const { return TopReference(1) <= kReferenceFrameIntra; }
+ bool IsLeftSingle() const { return LeftReference(1) <= kReferenceFrameIntra; }
+
+ int CountReferences(ReferenceFrameType type) const {
+ return static_cast<int>(top_available[kPlaneY] &&
+ bp_top->reference_frame[0] == type) +
+ static_cast<int>(top_available[kPlaneY] &&
+ bp_top->reference_frame[1] == type) +
+ static_cast<int>(left_available[kPlaneY] &&
+ bp_left->reference_frame[0] == type) +
+ static_cast<int>(left_available[kPlaneY] &&
+ bp_left->reference_frame[1] == type);
+ }
+
+ // 7.10.3.
+ // Checks if there are any inter blocks to the left or above. If so, it
+ // returns true indicating that the block has neighbors that are suitable for
+ // use by overlapped motion compensation.
+ bool HasOverlappableCandidates() const {
+ const ptrdiff_t stride = tile.BlockParametersStride();
+ BlockParameters** const bps = tile.BlockParametersAddress(0, 0);
+ if (top_available[kPlaneY]) {
+ BlockParameters** bps_top = bps + (row4x4 - 1) * stride + (column4x4 | 1);
+ const int columns = std::min(tile.frame_header_.columns4x4 - column4x4,
+ static_cast<int>(width4x4));
+ BlockParameters** const bps_top_end = bps_top + columns;
+ do {
+ if ((*bps_top)->reference_frame[0] > kReferenceFrameIntra) {
+ return true;
+ }
+ bps_top += 2;
+ } while (bps_top < bps_top_end);
+ }
+ if (left_available[kPlaneY]) {
+ BlockParameters** bps_left = bps + (row4x4 | 1) * stride + column4x4 - 1;
+ const int rows = std::min(tile.frame_header_.rows4x4 - row4x4,
+ static_cast<int>(height4x4));
+ BlockParameters** const bps_left_end = bps_left + rows * stride;
+ do {
+ if ((*bps_left)->reference_frame[0] > kReferenceFrameIntra) {
+ return true;
+ }
+ bps_left += 2 * stride;
+ } while (bps_left < bps_left_end);
+ }
+ return false;
+ }
+
+ Tile& tile;
+ bool has_chroma;
+ const BlockSize size;
+ bool top_available[kMaxPlanes];
+ bool left_available[kMaxPlanes];
+ BlockSize residual_size[kMaxPlanes];
+ const int row4x4;
+ const int column4x4;
+ const int width;
+ const int height;
+ const int width4x4;
+ const int height4x4;
+ const BlockParameters* bp_top;
+ const BlockParameters* bp_left;
+ BlockParameters* bp;
+ TileScratchBuffer* const scratch_buffer;
+ ResidualPtr* const residual;
+ BlockCdfContext* const top_context;
+ const int top_context_index;
+ const int left_context_index;
+};
+
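+// The two ProcessSuperBlockRow specializations used by the decoder are
+// declared extern below and defined in the tile implementation file, so
+// translation units that include this header do not instantiate them again.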
+extern template bool
+Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ int row4x4, TileScratchBuffer* scratch_buffer);
+extern template bool
+Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ int row4x4, TileScratchBuffer* scratch_buffer);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_TILE_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "src/buffer_pool.h"
+#include "src/dsp/constants.h"
+#include "src/motion_vector.h"
+#include "src/obu_parser.h"
+#include "src/prediction_mask.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/logging.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kDeltaQSmall = 3;
+constexpr int kDeltaLfSmall = 3;
+
+constexpr uint8_t kIntraYModeContext[kIntraPredictionModesY] = {
+ 0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0};
+
+constexpr uint8_t kSizeGroup[kMaxBlockSizes] = {
+ 0, 0, 0, 0, 1, 1, 1, 0, 1, 2, 2, 2, 1, 2, 3, 3, 2, 3, 3, 3, 3, 3};
+
+constexpr int kCompoundModeNewMvContexts = 5;
+constexpr uint8_t kCompoundModeContextMap[3][kCompoundModeNewMvContexts] = {
+ {0, 1, 1, 1, 1}, {1, 2, 3, 4, 4}, {4, 4, 5, 6, 7}};
+
+enum CflSign : uint8_t {
+ kCflSignZero = 0,
+ kCflSignNegative = 1,
+ kCflSignPositive = 2
+};
+
+// For each possible value of the combined signs (which is read from the
+// bitstream), this array stores the following: sign_u, sign_v, alpha_u_context,
+// alpha_v_context. Only the non-negative entries are used. Entry at index i
+// is computed as follows:
+// sign_u = (i + 1) / 3
+// sign_v = (i + 1) % 3
+// alpha_u_context = i - 2
+// alpha_v_context = (sign_v - 1) * 3 + sign_u
+constexpr int8_t kCflAlphaLookup[kCflAlphaSignsSymbolCount][4] = {
+ {0, 1, -2, 0}, {0, 2, -1, 3}, {1, 0, 0, -2}, {1, 1, 1, 1},
+ {1, 2, 2, 4}, {2, 0, 3, -1}, {2, 1, 4, 2}, {2, 2, 5, 5},
+};
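+
+// For example, at index i = 4: sign_u = 5 / 3 = 1 (negative), sign_v =
+// 5 % 3 = 2 (positive), alpha_u_context = 4 - 2 = 2 and alpha_v_context =
+// (2 - 1) * 3 + 1 = 4, matching the entry {1, 2, 2, 4}.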
+
+constexpr BitMaskSet kPredictionModeHasNearMvMask(kPredictionModeNearMv,
+ kPredictionModeNearNearMv,
+ kPredictionModeNearNewMv,
+ kPredictionModeNewNearMv);
+
+constexpr BitMaskSet kIsInterIntraModeAllowedMask(kBlock8x8, kBlock8x16,
+ kBlock16x8, kBlock16x16,
+ kBlock16x32, kBlock32x16,
+ kBlock32x32);
+
+bool IsBackwardReference(ReferenceFrameType type) {
+ return type >= kReferenceFrameBackward && type <= kReferenceFrameAlternate;
+}
+
+bool IsSameDirectionReferencePair(ReferenceFrameType type1,
+ ReferenceFrameType type2) {
+ return (type1 >= kReferenceFrameBackward) ==
+ (type2 >= kReferenceFrameBackward);
+}
+
+// This is called neg_deinterleave() in the spec.
+int DecodeSegmentId(int diff, int reference, int max) {
+ if (reference == 0) return diff;
+ if (reference >= max - 1) return max - diff - 1;
+ const int value = ((diff & 1) != 0) ? reference + ((diff + 1) >> 1)
+ : reference - (diff >> 1);
+ const int reference2 = (reference << 1);
+ if (reference2 < max) {
+ return (diff <= reference2) ? value : diff;
+ }
+ return (diff <= ((max - reference - 1) << 1)) ? value : max - (diff + 1);
+}
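+
+// For example, with reference = 2 and max = 8, diff values 0, 1, 2, 3, 4
+// decode to segment ids 2, 3, 1, 4, 0: ids near the prediction get the
+// smallest |diff|, and any diff greater than 4 decodes to itself.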
+
+// This is called DrlCtxStack in section 7.10.2.14 of the spec.
+// In the spec, the weights of all the nearest mvs are incremented by a bonus
+// weight which is larger than any natural weight, and the weights of the mvs
+// are compared with this bonus weight to determine their contexts. We replace
+// this procedure by introducing |nearest_mv_count| in PredictionParameters,
+// which records the count of the nearest mvs. Since all the nearest mvs are at
+// the beginning of the mv stack, the |index| of an mv in the mv stack can be
+// compared with |nearest_mv_count| to get that mv's context.
+int GetRefMvIndexContext(int nearest_mv_count, int index) {
+ if (index + 1 < nearest_mv_count) {
+ return 0;
+ }
+ if (index + 1 == nearest_mv_count) {
+ return 1;
+ }
+ return 2;
+}
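+
+// For example, with nearest_mv_count = 2, indices 0, 1 and 2 map to contexts
+// 0, 1 and 2: index 0 lies strictly inside the nearest mvs, index 1 is the
+// last nearest mv, and index 2 is a non-nearest mv.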
+
+// Returns true if both the width and the height of the block are less than
+// 64. kBlock16x64 is the only size ordered before kBlock32x32 in the
+// BlockSize enum whose height is 64, so it has to be excluded explicitly.
+bool IsBlockDimensionLessThan64(BlockSize size) {
+ return size <= kBlock32x32 && size != kBlock16x64;
+}
+
+int GetUseCompoundReferenceContext(const Tile::Block& block) {
+ if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+ if (block.IsTopSingle() && block.IsLeftSingle()) {
+ return static_cast<int>(IsBackwardReference(block.TopReference(0))) ^
+ static_cast<int>(IsBackwardReference(block.LeftReference(0)));
+ }
+ if (block.IsTopSingle()) {
+ return 2 + static_cast<int>(IsBackwardReference(block.TopReference(0)) ||
+ block.IsTopIntra());
+ }
+ if (block.IsLeftSingle()) {
+ return 2 + static_cast<int>(IsBackwardReference(block.LeftReference(0)) ||
+ block.IsLeftIntra());
+ }
+ return 4;
+ }
+ if (block.top_available[kPlaneY]) {
+ return block.IsTopSingle()
+ ? static_cast<int>(IsBackwardReference(block.TopReference(0)))
+ : 3;
+ }
+ if (block.left_available[kPlaneY]) {
+ return block.IsLeftSingle()
+ ? static_cast<int>(IsBackwardReference(block.LeftReference(0)))
+ : 3;
+ }
+ return 1;
+}
+
+// Calculates count0 by calling block.CountReferences() on the frame types from
+// type0_start to type0_end, inclusive, and summing the results.
+// Calculates count1 by calling block.CountReferences() on the frame types from
+// type1_start to type1_end, inclusive, and summing the results.
+// Compares count0 with count1 and returns 0, 1 or 2.
+//
+// See count_refs and ref_count_ctx in 8.3.2.
+int GetReferenceContext(const Tile::Block& block,
+ ReferenceFrameType type0_start,
+ ReferenceFrameType type0_end,
+ ReferenceFrameType type1_start,
+ ReferenceFrameType type1_end) {
+ int count0 = 0;
+ int count1 = 0;
+ for (int type = type0_start; type <= type0_end; ++type) {
+ count0 += block.CountReferences(static_cast<ReferenceFrameType>(type));
+ }
+ for (int type = type1_start; type <= type1_end; ++type) {
+ count1 += block.CountReferences(static_cast<ReferenceFrameType>(type));
+ }
+ return (count0 < count1) ? 0 : (count0 == count1 ? 1 : 2);
+}
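+
+// For example, GetReferenceContext(block, kReferenceFrameLast,
+// kReferenceFrameGolden, kReferenceFrameBackward, kReferenceFrameAlternate)
+// counts the neighbors' forward references against their backward references
+// and returns 0 if forward references are rarer, 1 on a tie and 2 if they
+// are more common.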
+
+} // namespace
+
+bool Tile::ReadSegmentId(const Block& block) {
+ // These two asserts ensure that current_frame_.segmentation_map() is not
+ // nullptr.
+ assert(frame_header_.segmentation.enabled);
+ assert(frame_header_.segmentation.update_map);
+ const SegmentationMap& map = *current_frame_.segmentation_map();
+ int top_left = -1;
+ if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+ top_left = map.segment_id(block.row4x4 - 1, block.column4x4 - 1);
+ }
+ int top = -1;
+ if (block.top_available[kPlaneY]) {
+ top = map.segment_id(block.row4x4 - 1, block.column4x4);
+ }
+ int left = -1;
+ if (block.left_available[kPlaneY]) {
+ left = map.segment_id(block.row4x4, block.column4x4 - 1);
+ }
+ int pred;
+ if (top == -1) {
+ pred = (left == -1) ? 0 : left;
+ } else if (left == -1) {
+ pred = top;
+ } else {
+ pred = (top_left == top) ? top : left;
+ }
+ BlockParameters& bp = *block.bp;
+ if (bp.skip) {
+ bp.prediction_parameters->segment_id = pred;
+ return true;
+ }
+ int context = 0;
+ if (top_left < 0) {
+ context = 0;
+ } else if (top_left == top && top_left == left) {
+ context = 2;
+ } else if (top_left == top || top_left == left || top == left) {
+ context = 1;
+ }
+ uint16_t* const segment_id_cdf =
+ symbol_decoder_context_.segment_id_cdf[context];
+ const int encoded_segment_id =
+ reader_.ReadSymbol<kMaxSegments>(segment_id_cdf);
+ bp.prediction_parameters->segment_id =
+ DecodeSegmentId(encoded_segment_id, pred,
+ frame_header_.segmentation.last_active_segment_id + 1);
+ // Check the bitstream conformance requirement in Section 6.10.8 of the spec.
+ if (bp.prediction_parameters->segment_id < 0 ||
+ bp.prediction_parameters->segment_id >
+ frame_header_.segmentation.last_active_segment_id) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Corrupted segment_ids: encoded %d, last active %d, postprocessed %d",
+ encoded_segment_id, frame_header_.segmentation.last_active_segment_id,
+ bp.prediction_parameters->segment_id);
+ return false;
+ }
+ return true;
+}
+
+bool Tile::ReadIntraSegmentId(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (!frame_header_.segmentation.enabled) {
+ bp.prediction_parameters->segment_id = 0;
+ return true;
+ }
+ return ReadSegmentId(block);
+}
+
+void Tile::ReadSkip(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (frame_header_.segmentation.segment_id_pre_skip &&
+ frame_header_.segmentation.FeatureActive(
+ bp.prediction_parameters->segment_id, kSegmentFeatureSkip)) {
+ bp.skip = true;
+ return;
+ }
+ int context = 0;
+ if (block.top_available[kPlaneY] && block.bp_top->skip) {
+ ++context;
+ }
+ if (block.left_available[kPlaneY] && block.bp_left->skip) {
+ ++context;
+ }
+ uint16_t* const skip_cdf = symbol_decoder_context_.skip_cdf[context];
+ bp.skip = reader_.ReadSymbol(skip_cdf);
+}
+
+bool Tile::ReadSkipMode(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (!frame_header_.skip_mode_present ||
+ frame_header_.segmentation.FeatureActive(
+ bp.prediction_parameters->segment_id, kSegmentFeatureSkip) ||
+ frame_header_.segmentation.FeatureActive(
+ bp.prediction_parameters->segment_id,
+ kSegmentFeatureReferenceFrame) ||
+ frame_header_.segmentation.FeatureActive(
+ bp.prediction_parameters->segment_id, kSegmentFeatureGlobalMv) ||
+ IsBlockDimension4(block.size)) {
+ return false;
+ }
+ const int context =
+ (block.left_available[kPlaneY]
+ ? static_cast<int>(left_context_.skip_mode[block.left_context_index])
+ : 0) +
+ (block.top_available[kPlaneY]
+ ? static_cast<int>(
+ block.top_context->skip_mode[block.top_context_index])
+ : 0);
+ return reader_.ReadSymbol(symbol_decoder_context_.skip_mode_cdf[context]);
+}
+
+void Tile::ReadCdef(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (bp.skip || frame_header_.coded_lossless ||
+ !sequence_header_.enable_cdef || frame_header_.allow_intrabc ||
+ frame_header_.cdef.bits == 0) {
+ return;
+ }
+ int8_t* const cdef_index =
+ &cdef_index_[DivideBy16(block.row4x4)][DivideBy16(block.column4x4)];
+ int stride = cdef_index_.columns();
+ if (cdef_index[0] == -1) {
+ cdef_index[0] =
+ static_cast<int8_t>(reader_.ReadLiteral(frame_header_.cdef.bits));
+ if (block.size == kBlock128x128) {
+      // This condition is shorthand for
+      // block.width4x4 > 16 && block.height4x4 > 16.
+ cdef_index[1] = cdef_index[0];
+ cdef_index[stride] = cdef_index[0];
+ cdef_index[stride + 1] = cdef_index[0];
+ } else if (block.width4x4 > 16) {
+ cdef_index[1] = cdef_index[0];
+ } else if (block.height4x4 > 16) {
+ cdef_index[stride] = cdef_index[0];
+ }
+ }
+}
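+
+// Note on ReadCdef(): a kBlock128x128 block spans a 2x2 grid of 64x64 cdef
+// units in cdef_index_, so the single decoded value is replicated to all four
+// entries. A block wider than 64 pixels (width4x4 > 16) also covers the unit
+// to the right, and a block taller than 64 pixels covers the unit below.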
+
+int Tile::ReadAndClipDelta(uint16_t* const cdf, int delta_small, int scale,
+ int min_value, int max_value, int value) {
+ int abs = reader_.ReadSymbol<kDeltaSymbolCount>(cdf);
+ if (abs == delta_small) {
+ const int remaining_bit_count =
+ static_cast<int>(reader_.ReadLiteral(3)) + 1;
+ const int abs_remaining_bits =
+ static_cast<int>(reader_.ReadLiteral(remaining_bit_count));
+ abs = abs_remaining_bits + (1 << remaining_bit_count) + 1;
+ }
+ if (abs != 0) {
+ const bool sign = reader_.ReadBit() != 0;
+ const int scaled_abs = abs << scale;
+ const int reduced_delta = sign ? -scaled_abs : scaled_abs;
+ value += reduced_delta;
+ value = Clip3(value, min_value, max_value);
+ }
+ return value;
+}
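+
+// Example of the escape coding in ReadAndClipDelta() with delta_small = 3: if
+// the decoded symbol equals 3, a 3-bit length is read, say 1, giving
+// remaining_bit_count = 2; if the two remaining bits decode to 1, then
+// abs = 1 + (1 << 2) + 1 = 6. Symbols below delta_small are used directly.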
+
+void Tile::ReadQuantizerIndexDelta(const Block& block) {
+ assert(read_deltas_);
+ BlockParameters& bp = *block.bp;
+  if (block.size == SuperBlockSize() && bp.skip) {
+ return;
+ }
+ current_quantizer_index_ =
+ ReadAndClipDelta(symbol_decoder_context_.delta_q_cdf, kDeltaQSmall,
+ frame_header_.delta_q.scale, kMinLossyQuantizer,
+ kMaxQuantizer, current_quantizer_index_);
+}
+
+void Tile::ReadLoopFilterDelta(const Block& block) {
+ assert(read_deltas_);
+ BlockParameters& bp = *block.bp;
+ if (!frame_header_.delta_lf.present ||
+ (block.size == SuperBlockSize() && bp.skip)) {
+ return;
+ }
+ int frame_lf_count = 1;
+ if (frame_header_.delta_lf.multi) {
+ frame_lf_count = kFrameLfCount - (PlaneCount() > 1 ? 0 : 2);
+ }
+ bool recompute_deblock_filter_levels = false;
+ for (int i = 0; i < frame_lf_count; ++i) {
+ uint16_t* const delta_lf_abs_cdf =
+ frame_header_.delta_lf.multi
+ ? symbol_decoder_context_.delta_lf_multi_cdf[i]
+ : symbol_decoder_context_.delta_lf_cdf;
+ const int8_t old_delta_lf = delta_lf_[i];
+ delta_lf_[i] = ReadAndClipDelta(
+ delta_lf_abs_cdf, kDeltaLfSmall, frame_header_.delta_lf.scale,
+ -kMaxLoopFilterValue, kMaxLoopFilterValue, delta_lf_[i]);
+ recompute_deblock_filter_levels =
+ recompute_deblock_filter_levels || (old_delta_lf != delta_lf_[i]);
+ }
+ delta_lf_all_zero_ =
+ (delta_lf_[0] | delta_lf_[1] | delta_lf_[2] | delta_lf_[3]) == 0;
+ if (!delta_lf_all_zero_ && recompute_deblock_filter_levels) {
+ post_filter_.ComputeDeblockFilterLevels(delta_lf_, deblock_filter_levels_);
+ }
+}
+
+void Tile::ReadPredictionModeY(const Block& block, bool intra_y_mode) {
+ uint16_t* cdf;
+ if (intra_y_mode) {
+ const PredictionMode top_mode =
+ block.top_available[kPlaneY] ? block.bp_top->y_mode : kPredictionModeDc;
+ const PredictionMode left_mode = block.left_available[kPlaneY]
+ ? block.bp_left->y_mode
+ : kPredictionModeDc;
+ const int top_context = kIntraYModeContext[top_mode];
+ const int left_context = kIntraYModeContext[left_mode];
+ cdf = symbol_decoder_context_
+ .intra_frame_y_mode_cdf[top_context][left_context];
+ } else {
+ cdf = symbol_decoder_context_.y_mode_cdf[kSizeGroup[block.size]];
+ }
+ block.bp->y_mode = static_cast<PredictionMode>(
+ reader_.ReadSymbol<kIntraPredictionModesY>(cdf));
+}
+
+void Tile::ReadIntraAngleInfo(const Block& block, PlaneType plane_type) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.angle_delta[plane_type] = 0;
+ const PredictionMode mode = (plane_type == kPlaneTypeY)
+ ? bp.y_mode
+ : bp.prediction_parameters->uv_mode;
+ if (IsBlockSmallerThan8x8(block.size) || !IsDirectionalMode(mode)) return;
+ uint16_t* const cdf =
+ symbol_decoder_context_.angle_delta_cdf[mode - kPredictionModeVertical];
+ prediction_parameters.angle_delta[plane_type] =
+ reader_.ReadSymbol<kAngleDeltaSymbolCount>(cdf);
+ prediction_parameters.angle_delta[plane_type] -= kMaxAngleDelta;
+}
+
+void Tile::ReadCflAlpha(const Block& block) {
+ const int signs = reader_.ReadSymbol<kCflAlphaSignsSymbolCount>(
+ symbol_decoder_context_.cfl_alpha_signs_cdf);
+ const int8_t* const cfl_lookup = kCflAlphaLookup[signs];
+ const auto sign_u = static_cast<CflSign>(cfl_lookup[0]);
+ const auto sign_v = static_cast<CflSign>(cfl_lookup[1]);
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.cfl_alpha_u = 0;
+ if (sign_u != kCflSignZero) {
+ assert(cfl_lookup[2] >= 0);
+ prediction_parameters.cfl_alpha_u =
+ reader_.ReadSymbol<kCflAlphaSymbolCount>(
+ symbol_decoder_context_.cfl_alpha_cdf[cfl_lookup[2]]) +
+ 1;
+ if (sign_u == kCflSignNegative) prediction_parameters.cfl_alpha_u *= -1;
+ }
+ prediction_parameters.cfl_alpha_v = 0;
+ if (sign_v != kCflSignZero) {
+ assert(cfl_lookup[3] >= 0);
+ prediction_parameters.cfl_alpha_v =
+ reader_.ReadSymbol<kCflAlphaSymbolCount>(
+ symbol_decoder_context_.cfl_alpha_cdf[cfl_lookup[3]]) +
+ 1;
+ if (sign_v == kCflSignNegative) prediction_parameters.cfl_alpha_v *= -1;
+ }
+}
+
+void Tile::ReadPredictionModeUV(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ bool chroma_from_luma_allowed;
+ if (frame_header_.segmentation
+ .lossless[bp.prediction_parameters->segment_id]) {
+ chroma_from_luma_allowed = block.residual_size[kPlaneU] == kBlock4x4;
+ } else {
+ chroma_from_luma_allowed = IsBlockDimensionLessThan64(block.size);
+ }
+ uint16_t* const cdf =
+ symbol_decoder_context_
+ .uv_mode_cdf[static_cast<int>(chroma_from_luma_allowed)][bp.y_mode];
+ if (chroma_from_luma_allowed) {
+ bp.prediction_parameters->uv_mode = static_cast<PredictionMode>(
+ reader_.ReadSymbol<kIntraPredictionModesUV>(cdf));
+ } else {
+ bp.prediction_parameters->uv_mode = static_cast<PredictionMode>(
+ reader_.ReadSymbol<kIntraPredictionModesUV - 1>(cdf));
+ }
+}
+
+int Tile::ReadMotionVectorComponent(const Block& block, const int component) {
+ const int context =
+ static_cast<int>(block.bp->prediction_parameters->use_intra_block_copy);
+ const bool sign = reader_.ReadSymbol(
+ symbol_decoder_context_.mv_sign_cdf[component][context]);
+ const int mv_class = reader_.ReadSymbol<kMvClassSymbolCount>(
+ symbol_decoder_context_.mv_class_cdf[component][context]);
+ int magnitude = 1;
+ int value;
+ uint16_t* fraction_cdf;
+ uint16_t* precision_cdf;
+ if (mv_class == 0) {
+ value = static_cast<int>(reader_.ReadSymbol(
+ symbol_decoder_context_.mv_class0_bit_cdf[component][context]));
+ fraction_cdf = symbol_decoder_context_
+ .mv_class0_fraction_cdf[component][context][value];
+ precision_cdf = symbol_decoder_context_
+ .mv_class0_high_precision_cdf[component][context];
+ } else {
+ assert(mv_class <= kMvBitSymbolCount);
+ value = 0;
+ for (int i = 0; i < mv_class; ++i) {
+ const int bit = static_cast<int>(reader_.ReadSymbol(
+ symbol_decoder_context_.mv_bit_cdf[component][context][i]));
+ value |= bit << i;
+ }
+ magnitude += 2 << (mv_class + 2);
+ fraction_cdf = symbol_decoder_context_.mv_fraction_cdf[component][context];
+ precision_cdf =
+ symbol_decoder_context_.mv_high_precision_cdf[component][context];
+ }
+ const int fraction =
+ (frame_header_.force_integer_mv == 0)
+ ? reader_.ReadSymbol<kMvFractionSymbolCount>(fraction_cdf)
+ : 3;
+ const int precision =
+ frame_header_.allow_high_precision_mv
+ ? static_cast<int>(reader_.ReadSymbol(precision_cdf))
+ : 1;
+ magnitude += (value << 3) | (fraction << 1) | precision;
+ return sign ? -magnitude : magnitude;
+}
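+
+// Example magnitude computation in ReadMotionVectorComponent() for
+// mv_class = 1, value = 1, fraction = 2 and precision = 1:
+// magnitude = 1 + (2 << 3) + ((1 << 3) | (2 << 1) | 1) = 1 + 16 + 13 = 30
+// in 1/8-pel units, i.e. 3.75 pixels.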
+
+void Tile::ReadMotionVector(const Block& block, int index) {
+ BlockParameters& bp = *block.bp;
+ const int context =
+ static_cast<int>(block.bp->prediction_parameters->use_intra_block_copy);
+ const auto mv_joint =
+ static_cast<MvJointType>(reader_.ReadSymbol<kNumMvJointTypes>(
+ symbol_decoder_context_.mv_joint_cdf[context]));
+ if (mv_joint == kMvJointTypeHorizontalZeroVerticalNonZero ||
+ mv_joint == kMvJointTypeNonZero) {
+ bp.mv.mv[index].mv[0] = ReadMotionVectorComponent(block, 0);
+ }
+ if (mv_joint == kMvJointTypeHorizontalNonZeroVerticalZero ||
+ mv_joint == kMvJointTypeNonZero) {
+ bp.mv.mv[index].mv[1] = ReadMotionVectorComponent(block, 1);
+ }
+}
+
+void Tile::ReadFilterIntraModeInfo(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.use_filter_intra = false;
+ if (!sequence_header_.enable_filter_intra || bp.y_mode != kPredictionModeDc ||
+ bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] != 0 ||
+ !IsBlockDimensionLessThan64(block.size)) {
+ return;
+ }
+ prediction_parameters.use_filter_intra = reader_.ReadSymbol(
+ symbol_decoder_context_.use_filter_intra_cdf[block.size]);
+ if (prediction_parameters.use_filter_intra) {
+ prediction_parameters.filter_intra_mode = static_cast<FilterIntraPredictor>(
+ reader_.ReadSymbol<kNumFilterIntraPredictors>(
+ symbol_decoder_context_.filter_intra_mode_cdf));
+ }
+}
+
+bool Tile::DecodeIntraModeInfo(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ bp.skip = false;
+ if (frame_header_.segmentation.segment_id_pre_skip &&
+ !ReadIntraSegmentId(block)) {
+ return false;
+ }
+ SetCdfContextSkipMode(block, false);
+ ReadSkip(block);
+ if (!frame_header_.segmentation.segment_id_pre_skip &&
+ !ReadIntraSegmentId(block)) {
+ return false;
+ }
+ ReadCdef(block);
+ if (read_deltas_) {
+ ReadQuantizerIndexDelta(block);
+ ReadLoopFilterDelta(block);
+ read_deltas_ = false;
+ }
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.use_intra_block_copy = false;
+ if (frame_header_.allow_intrabc) {
+ prediction_parameters.use_intra_block_copy =
+ reader_.ReadSymbol(symbol_decoder_context_.intra_block_copy_cdf);
+ }
+ if (prediction_parameters.use_intra_block_copy) {
+ bp.is_inter = true;
+ bp.reference_frame[0] = kReferenceFrameIntra;
+ bp.reference_frame[1] = kReferenceFrameNone;
+ bp.y_mode = kPredictionModeDc;
+ bp.prediction_parameters->uv_mode = kPredictionModeDc;
+ SetCdfContextUVMode(block);
+ prediction_parameters.motion_mode = kMotionModeSimple;
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeAverage;
+ bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] = 0;
+ bp.prediction_parameters->palette_mode_info.size[kPlaneTypeUV] = 0;
+ SetCdfContextPaletteSize(block);
+ bp.interpolation_filter[0] = kInterpolationFilterBilinear;
+ bp.interpolation_filter[1] = kInterpolationFilterBilinear;
+ MvContexts dummy_mode_contexts;
+ FindMvStack(block, /*is_compound=*/false, &dummy_mode_contexts);
+ return AssignIntraMv(block);
+ }
+ bp.is_inter = false;
+ return ReadIntraBlockModeInfo(block, /*intra_y_mode=*/true);
+}
+
+int8_t Tile::ComputePredictedSegmentId(const Block& block) const {
+ // If prev_segment_ids_ is null, treat it as if it pointed to a segmentation
+ // map containing all 0s.
+ if (prev_segment_ids_ == nullptr) return 0;
+
+ const int x_limit = std::min(frame_header_.columns4x4 - block.column4x4,
+ static_cast<int>(block.width4x4));
+ const int y_limit = std::min(frame_header_.rows4x4 - block.row4x4,
+ static_cast<int>(block.height4x4));
+ int8_t id = 7;
+ for (int y = 0; y < y_limit; ++y) {
+ for (int x = 0; x < x_limit; ++x) {
+ const int8_t prev_segment_id =
+ prev_segment_ids_->segment_id(block.row4x4 + y, block.column4x4 + x);
+ id = std::min(id, prev_segment_id);
+ }
+ }
+ return id;
+}
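+
+// The initial id of 7 in ComputePredictedSegmentId() is kMaxSegments - 1, the
+// largest valid segment id; the double loop then takes the minimum previous
+// segment id covered by the block.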
+
+void Tile::SetCdfContextUsePredictedSegmentId(const Block& block,
+ bool use_predicted_segment_id) {
+ memset(left_context_.use_predicted_segment_id + block.left_context_index,
+ static_cast<int>(use_predicted_segment_id), block.height4x4);
+ memset(block.top_context->use_predicted_segment_id + block.top_context_index,
+ static_cast<int>(use_predicted_segment_id), block.width4x4);
+}
+
+bool Tile::ReadInterSegmentId(const Block& block, bool pre_skip) {
+ BlockParameters& bp = *block.bp;
+ if (!frame_header_.segmentation.enabled) {
+ bp.prediction_parameters->segment_id = 0;
+ return true;
+ }
+ if (!frame_header_.segmentation.update_map) {
+ bp.prediction_parameters->segment_id = ComputePredictedSegmentId(block);
+ return true;
+ }
+ if (pre_skip) {
+ if (!frame_header_.segmentation.segment_id_pre_skip) {
+ bp.prediction_parameters->segment_id = 0;
+ return true;
+ }
+ } else if (bp.skip) {
+ SetCdfContextUsePredictedSegmentId(block, false);
+ return ReadSegmentId(block);
+ }
+ if (frame_header_.segmentation.temporal_update) {
+ const int context =
+ (block.left_available[kPlaneY]
+ ? static_cast<int>(
+ left_context_
+ .use_predicted_segment_id[block.left_context_index])
+ : 0) +
+ (block.top_available[kPlaneY]
+ ? static_cast<int>(
+ block.top_context
+ ->use_predicted_segment_id[block.top_context_index])
+ : 0);
+ const bool use_predicted_segment_id = reader_.ReadSymbol(
+ symbol_decoder_context_.use_predicted_segment_id_cdf[context]);
+ SetCdfContextUsePredictedSegmentId(block, use_predicted_segment_id);
+ if (use_predicted_segment_id) {
+ bp.prediction_parameters->segment_id = ComputePredictedSegmentId(block);
+ return true;
+ }
+ }
+ return ReadSegmentId(block);
+}
+
+void Tile::ReadIsInter(const Block& block, bool skip_mode) {
+ BlockParameters& bp = *block.bp;
+ if (skip_mode) {
+ bp.is_inter = true;
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(
+ bp.prediction_parameters->segment_id,
+ kSegmentFeatureReferenceFrame)) {
+ bp.is_inter = frame_header_.segmentation
+ .feature_data[bp.prediction_parameters->segment_id]
+ [kSegmentFeatureReferenceFrame] !=
+ kReferenceFrameIntra;
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(
+ bp.prediction_parameters->segment_id, kSegmentFeatureGlobalMv)) {
+ bp.is_inter = true;
+ return;
+ }
+ int context = 0;
+ if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+ context = (block.IsTopIntra() && block.IsLeftIntra())
+ ? 3
+ : static_cast<int>(block.IsTopIntra() || block.IsLeftIntra());
+ } else if (block.top_available[kPlaneY] || block.left_available[kPlaneY]) {
+ context = 2 * static_cast<int>(block.top_available[kPlaneY]
+ ? block.IsTopIntra()
+ : block.IsLeftIntra());
+ }
+ bp.is_inter =
+ reader_.ReadSymbol(symbol_decoder_context_.is_inter_cdf[context]);
+}
+
+void Tile::SetCdfContextPaletteSize(const Block& block) {
+ const PaletteModeInfo& palette_mode_info =
+ block.bp->prediction_parameters->palette_mode_info;
+ for (int plane_type = kPlaneTypeY; plane_type <= kPlaneTypeUV; ++plane_type) {
+ memset(left_context_.palette_size[plane_type] + block.left_context_index,
+ palette_mode_info.size[plane_type], block.height4x4);
+ memset(
+ block.top_context->palette_size[plane_type] + block.top_context_index,
+ palette_mode_info.size[plane_type], block.width4x4);
+ if (palette_mode_info.size[plane_type] == 0) continue;
+ for (int i = block.left_context_index;
+ i < block.left_context_index + block.height4x4; ++i) {
+ memcpy(left_context_.palette_color[i][plane_type],
+ palette_mode_info.color[plane_type],
+ kMaxPaletteSize * sizeof(palette_mode_info.color[0][0]));
+ }
+ for (int i = block.top_context_index;
+ i < block.top_context_index + block.width4x4; ++i) {
+ memcpy(block.top_context->palette_color[i][plane_type],
+ palette_mode_info.color[plane_type],
+ kMaxPaletteSize * sizeof(palette_mode_info.color[0][0]));
+ }
+ }
+}
+
+void Tile::SetCdfContextUVMode(const Block& block) {
+  // BlockCdfContext.uv_mode is only used to compute is_smooth_prediction for
+  // the intra edge upsamplers in subsequent blocks. Those upsamplers follow
+  // special rules for subsampled UV planes: update the left context only if
+  // the current block contains the last odd column, and update the top
+  // context only if the current block contains the last odd row.
+ if (subsampling_x_[kPlaneU] == 0 || (block.column4x4 & 1) == 1 ||
+ block.width4x4 > 1) {
+ memset(left_context_.uv_mode + block.left_context_index,
+ block.bp->prediction_parameters->uv_mode, block.height4x4);
+ }
+ if (subsampling_y_[kPlaneU] == 0 || (block.row4x4 & 1) == 1 ||
+ block.height4x4 > 1) {
+ memset(block.top_context->uv_mode + block.top_context_index,
+ block.bp->prediction_parameters->uv_mode, block.width4x4);
+ }
+}
+
+bool Tile::ReadIntraBlockModeInfo(const Block& block, bool intra_y_mode) {
+ BlockParameters& bp = *block.bp;
+ bp.reference_frame[0] = kReferenceFrameIntra;
+ bp.reference_frame[1] = kReferenceFrameNone;
+ ReadPredictionModeY(block, intra_y_mode);
+ ReadIntraAngleInfo(block, kPlaneTypeY);
+ if (block.HasChroma()) {
+ ReadPredictionModeUV(block);
+ if (bp.prediction_parameters->uv_mode == kPredictionModeChromaFromLuma) {
+ ReadCflAlpha(block);
+ }
+ if (block.left_available[kPlaneU]) {
+ const int smooth_row =
+ block.row4x4 + (~block.row4x4 & subsampling_y_[kPlaneU]);
+ const int smooth_column =
+ block.column4x4 - 1 - (block.column4x4 & subsampling_x_[kPlaneU]);
+ const BlockParameters& bp_left =
+ *block_parameters_holder_.Find(smooth_row, smooth_column);
+ bp.prediction_parameters->chroma_left_uses_smooth_prediction =
+ (bp_left.reference_frame[0] <= kReferenceFrameIntra) &&
+ kPredictionModeSmoothMask.Contains(
+ left_context_.uv_mode[CdfContextIndex(smooth_row)]);
+ }
+ if (block.top_available[kPlaneU]) {
+ const int smooth_row =
+ block.row4x4 - 1 - (block.row4x4 & subsampling_y_[kPlaneU]);
+ const int smooth_column =
+ block.column4x4 + (~block.column4x4 & subsampling_x_[kPlaneU]);
+ const BlockParameters& bp_top =
+ *block_parameters_holder_.Find(smooth_row, smooth_column);
+ bp.prediction_parameters->chroma_top_uses_smooth_prediction =
+ (bp_top.reference_frame[0] <= kReferenceFrameIntra) &&
+ kPredictionModeSmoothMask.Contains(
+ top_context_.get()[SuperBlockColumnIndex(smooth_column)]
+ .uv_mode[CdfContextIndex(smooth_column)]);
+ }
+ SetCdfContextUVMode(block);
+ ReadIntraAngleInfo(block, kPlaneTypeUV);
+ }
+ ReadPaletteModeInfo(block);
+ SetCdfContextPaletteSize(block);
+ ReadFilterIntraModeInfo(block);
+ return true;
+}
+
+CompoundReferenceType Tile::ReadCompoundReferenceType(const Block& block) {
+ // compound and inter.
+ const bool top_comp_inter = block.top_available[kPlaneY] &&
+ !block.IsTopIntra() && !block.IsTopSingle();
+ const bool left_comp_inter = block.left_available[kPlaneY] &&
+ !block.IsLeftIntra() && !block.IsLeftSingle();
+ // unidirectional compound.
+ const bool top_uni_comp =
+ top_comp_inter && IsSameDirectionReferencePair(block.TopReference(0),
+ block.TopReference(1));
+ const bool left_uni_comp =
+ left_comp_inter && IsSameDirectionReferencePair(block.LeftReference(0),
+ block.LeftReference(1));
+ int context;
+ if (block.top_available[kPlaneY] && !block.IsTopIntra() &&
+ block.left_available[kPlaneY] && !block.IsLeftIntra()) {
+ const int same_direction = static_cast<int>(IsSameDirectionReferencePair(
+ block.TopReference(0), block.LeftReference(0)));
+ if (!top_comp_inter && !left_comp_inter) {
+ context = 1 + MultiplyBy2(same_direction);
+ } else if (!top_comp_inter) {
+ context = left_uni_comp ? 3 + same_direction : 1;
+ } else if (!left_comp_inter) {
+ context = top_uni_comp ? 3 + same_direction : 1;
+ } else {
+ if (!top_uni_comp && !left_uni_comp) {
+ context = 0;
+ } else if (!top_uni_comp || !left_uni_comp) {
+ context = 2;
+ } else {
+ context = 3 + static_cast<int>(
+ (block.TopReference(0) == kReferenceFrameBackward) ==
+ (block.LeftReference(0) == kReferenceFrameBackward));
+ }
+ }
+ } else if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+ if (top_comp_inter) {
+ context = 1 + MultiplyBy2(static_cast<int>(top_uni_comp));
+ } else if (left_comp_inter) {
+ context = 1 + MultiplyBy2(static_cast<int>(left_uni_comp));
+ } else {
+ context = 2;
+ }
+ } else if (top_comp_inter) {
+ context = MultiplyBy4(static_cast<int>(top_uni_comp));
+ } else if (left_comp_inter) {
+ context = MultiplyBy4(static_cast<int>(left_uni_comp));
+ } else {
+ context = 2;
+ }
+ return static_cast<CompoundReferenceType>(reader_.ReadSymbol(
+ symbol_decoder_context_.compound_reference_type_cdf[context]));
+}
+
+template <bool is_single, bool is_backward, int index>
+uint16_t* Tile::GetReferenceCdf(
+ const Block& block,
+ CompoundReferenceType type /*= kNumCompoundReferenceTypes*/) {
+ int context = 0;
+ if ((type == kCompoundReferenceUnidirectional && index == 0) ||
+ (is_single && index == 1)) {
+ // uni_comp_ref and single_ref_p1.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameGolden,
+ kReferenceFrameBackward, kReferenceFrameAlternate);
+ } else if (type == kCompoundReferenceUnidirectional && index == 1) {
+ // uni_comp_ref_p1.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast2, kReferenceFrameLast2,
+ kReferenceFrameLast3, kReferenceFrameGolden);
+ } else if ((type == kCompoundReferenceUnidirectional && index == 2) ||
+ (type == kCompoundReferenceBidirectional && index == 2) ||
+ (is_single && index == 5)) {
+ // uni_comp_ref_p2, comp_ref_p2 and single_ref_p5.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast3, kReferenceFrameLast3,
+ kReferenceFrameGolden, kReferenceFrameGolden);
+ } else if ((type == kCompoundReferenceBidirectional && index == 0) ||
+ (is_single && index == 3)) {
+ // comp_ref and single_ref_p3.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameLast2,
+ kReferenceFrameLast3, kReferenceFrameGolden);
+ } else if ((type == kCompoundReferenceBidirectional && index == 1) ||
+ (is_single && index == 4)) {
+ // comp_ref_p1 and single_ref_p4.
+ context =
+ GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameLast,
+ kReferenceFrameLast2, kReferenceFrameLast2);
+ } else if ((is_single && index == 2) || (is_backward && index == 0)) {
+ // single_ref_p2 and comp_bwdref.
+ context = GetReferenceContext(
+ block, kReferenceFrameBackward, kReferenceFrameAlternate2,
+ kReferenceFrameAlternate, kReferenceFrameAlternate);
+ } else if ((is_single && index == 6) || (is_backward && index == 1)) {
+ // single_ref_p6 and comp_bwdref_p1.
+ context = GetReferenceContext(
+ block, kReferenceFrameBackward, kReferenceFrameBackward,
+ kReferenceFrameAlternate2, kReferenceFrameAlternate2);
+ }
+  // When using GCC 12.x for some targets, the compiler reports a false
+  // positive for the |context| subscript when is_single=false,
+  // is_backward=false and index=0. GetReferenceContext() can only return
+  // values between 0 and 2.
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+ assert(context >= 0 && context <= 2);
+ if (is_single) {
+    // The index parameter for single references is offset by one since the
+    // spec uses a 1-based index for these elements.
+ return symbol_decoder_context_.single_reference_cdf[context][index - 1];
+ }
+ if (is_backward) {
+ return symbol_decoder_context_
+ .compound_backward_reference_cdf[context][index];
+ }
+ return symbol_decoder_context_.compound_reference_cdf[type][context][index];
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+}
+
+void Tile::ReadReferenceFrames(const Block& block, bool skip_mode) {
+ BlockParameters& bp = *block.bp;
+ if (skip_mode) {
+ bp.reference_frame[0] = frame_header_.skip_mode_frame[0];
+ bp.reference_frame[1] = frame_header_.skip_mode_frame[1];
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(
+ bp.prediction_parameters->segment_id,
+ kSegmentFeatureReferenceFrame)) {
+ bp.reference_frame[0] = static_cast<ReferenceFrameType>(
+ frame_header_.segmentation
+ .feature_data[bp.prediction_parameters->segment_id]
+ [kSegmentFeatureReferenceFrame]);
+ bp.reference_frame[1] = kReferenceFrameNone;
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(
+ bp.prediction_parameters->segment_id, kSegmentFeatureSkip) ||
+ frame_header_.segmentation.FeatureActive(
+ bp.prediction_parameters->segment_id, kSegmentFeatureGlobalMv)) {
+ bp.reference_frame[0] = kReferenceFrameLast;
+ bp.reference_frame[1] = kReferenceFrameNone;
+ return;
+ }
+ const bool use_compound_reference =
+ frame_header_.reference_mode_select &&
+ std::min(block.width4x4, block.height4x4) >= 2 &&
+ reader_.ReadSymbol(symbol_decoder_context_.use_compound_reference_cdf
+ [GetUseCompoundReferenceContext(block)]);
+ if (use_compound_reference) {
+ CompoundReferenceType reference_type = ReadCompoundReferenceType(block);
+ if (reference_type == kCompoundReferenceUnidirectional) {
+ // uni_comp_ref.
+ if (reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 0>(block, reference_type))) {
+ bp.reference_frame[0] = kReferenceFrameBackward;
+ bp.reference_frame[1] = kReferenceFrameAlternate;
+ return;
+ }
+ // uni_comp_ref_p1.
+ if (!reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 1>(block, reference_type))) {
+ bp.reference_frame[0] = kReferenceFrameLast;
+ bp.reference_frame[1] = kReferenceFrameLast2;
+ return;
+ }
+ // uni_comp_ref_p2.
+ if (reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 2>(block, reference_type))) {
+ bp.reference_frame[0] = kReferenceFrameLast;
+ bp.reference_frame[1] = kReferenceFrameGolden;
+ return;
+ }
+ bp.reference_frame[0] = kReferenceFrameLast;
+ bp.reference_frame[1] = kReferenceFrameLast3;
+ return;
+ }
+ assert(reference_type == kCompoundReferenceBidirectional);
+ // comp_ref.
+ if (reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 0>(block, reference_type))) {
+ // comp_ref_p2.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 2>(block, reference_type))
+ ? kReferenceFrameGolden
+ : kReferenceFrameLast3;
+ } else {
+ // comp_ref_p1.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(
+ GetReferenceCdf<false, false, 1>(block, reference_type))
+ ? kReferenceFrameLast2
+ : kReferenceFrameLast;
+ }
+ // comp_bwdref.
+ if (reader_.ReadSymbol(GetReferenceCdf<false, true, 0>(block))) {
+ bp.reference_frame[1] = kReferenceFrameAlternate;
+ } else {
+ // comp_bwdref_p1.
+ bp.reference_frame[1] =
+ reader_.ReadSymbol(GetReferenceCdf<false, true, 1>(block))
+ ? kReferenceFrameAlternate2
+ : kReferenceFrameBackward;
+ }
+ return;
+ }
+ assert(!use_compound_reference);
+ bp.reference_frame[1] = kReferenceFrameNone;
+ // single_ref_p1.
+ if (reader_.ReadSymbol(GetReferenceCdf<true, false, 1>(block))) {
+ // single_ref_p2.
+ if (reader_.ReadSymbol(GetReferenceCdf<true, false, 2>(block))) {
+ bp.reference_frame[0] = kReferenceFrameAlternate;
+ return;
+ }
+ // single_ref_p6.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(GetReferenceCdf<true, false, 6>(block))
+ ? kReferenceFrameAlternate2
+ : kReferenceFrameBackward;
+ return;
+ }
+ // single_ref_p3.
+ if (reader_.ReadSymbol(GetReferenceCdf<true, false, 3>(block))) {
+ // single_ref_p5.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(GetReferenceCdf<true, false, 5>(block))
+ ? kReferenceFrameGolden
+ : kReferenceFrameLast3;
+ return;
+ }
+ // single_ref_p4.
+ bp.reference_frame[0] =
+ reader_.ReadSymbol(GetReferenceCdf<true, false, 4>(block))
+ ? kReferenceFrameLast2
+ : kReferenceFrameLast;
+}
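+
+// The single reference tree above reads up to three binary symbols. For
+// example, reading 1 (single_ref_p1), 0 (single_ref_p2) and then 1
+// (single_ref_p6) selects kReferenceFrameAlternate2.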
+
+void Tile::ReadInterPredictionModeY(const Block& block,
+ const MvContexts& mode_contexts,
+ bool skip_mode) {
+ BlockParameters& bp = *block.bp;
+ if (skip_mode) {
+ bp.y_mode = kPredictionModeNearestNearestMv;
+ return;
+ }
+ if (frame_header_.segmentation.FeatureActive(
+ bp.prediction_parameters->segment_id, kSegmentFeatureSkip) ||
+ frame_header_.segmentation.FeatureActive(
+ bp.prediction_parameters->segment_id, kSegmentFeatureGlobalMv)) {
+ bp.y_mode = kPredictionModeGlobalMv;
+ return;
+ }
+ if (bp.reference_frame[1] > kReferenceFrameIntra) {
+ const int idx0 = mode_contexts.reference_mv >> 1;
+ const int idx1 =
+ std::min(mode_contexts.new_mv, kCompoundModeNewMvContexts - 1);
+ const int context = kCompoundModeContextMap[idx0][idx1];
+ const int offset = reader_.ReadSymbol<kNumCompoundInterPredictionModes>(
+ symbol_decoder_context_.compound_prediction_mode_cdf[context]);
+ bp.y_mode =
+ static_cast<PredictionMode>(kPredictionModeNearestNearestMv + offset);
+ return;
+ }
+ // new_mv.
+ if (!reader_.ReadSymbol(
+ symbol_decoder_context_.new_mv_cdf[mode_contexts.new_mv])) {
+ bp.y_mode = kPredictionModeNewMv;
+ return;
+ }
+ // zero_mv.
+ if (!reader_.ReadSymbol(
+ symbol_decoder_context_.zero_mv_cdf[mode_contexts.zero_mv])) {
+ bp.y_mode = kPredictionModeGlobalMv;
+ return;
+ }
+ // ref_mv.
+ bp.y_mode =
+ reader_.ReadSymbol(
+ symbol_decoder_context_.reference_mv_cdf[mode_contexts.reference_mv])
+ ? kPredictionModeNearMv
+ : kPredictionModeNearestMv;
+}
+
+void Tile::ReadRefMvIndex(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.ref_mv_index = 0;
+ if (bp.y_mode != kPredictionModeNewMv &&
+ bp.y_mode != kPredictionModeNewNewMv &&
+ !kPredictionModeHasNearMvMask.Contains(bp.y_mode)) {
+ return;
+ }
+ const int start =
+ static_cast<int>(kPredictionModeHasNearMvMask.Contains(bp.y_mode));
+ prediction_parameters.ref_mv_index = start;
+ for (int i = start; i < start + 2; ++i) {
+ if (prediction_parameters.ref_mv_count <= i + 1) break;
+ // drl_mode in the spec.
+ const bool ref_mv_index_bit = reader_.ReadSymbol(
+ symbol_decoder_context_.ref_mv_index_cdf[GetRefMvIndexContext(
+ prediction_parameters.nearest_mv_count, i)]);
+ prediction_parameters.ref_mv_index = i + static_cast<int>(ref_mv_index_bit);
+ if (!ref_mv_index_bit) return;
+ }
+}
+
+void Tile::ReadInterIntraMode(const Block& block, bool is_compound,
+ bool skip_mode) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ prediction_parameters.inter_intra_mode = kNumInterIntraModes;
+ prediction_parameters.is_wedge_inter_intra = false;
+ if (skip_mode || !sequence_header_.enable_interintra_compound ||
+ is_compound || !kIsInterIntraModeAllowedMask.Contains(block.size)) {
+ return;
+ }
+ // kSizeGroup[block.size] is guaranteed to be non-zero because of the block
+ // size constraint enforced in the above condition.
+ assert(kSizeGroup[block.size] - 1 >= 0);
+ if (!reader_.ReadSymbol(
+ symbol_decoder_context_
+ .is_inter_intra_cdf[kSizeGroup[block.size] - 1])) {
+ prediction_parameters.inter_intra_mode = kNumInterIntraModes;
+ return;
+ }
+ prediction_parameters.inter_intra_mode =
+ static_cast<InterIntraMode>(reader_.ReadSymbol<kNumInterIntraModes>(
+ symbol_decoder_context_
+ .inter_intra_mode_cdf[kSizeGroup[block.size] - 1]));
+ bp.reference_frame[1] = kReferenceFrameIntra;
+ prediction_parameters.angle_delta[kPlaneTypeY] = 0;
+ prediction_parameters.angle_delta[kPlaneTypeUV] = 0;
+ prediction_parameters.use_filter_intra = false;
+ prediction_parameters.is_wedge_inter_intra = reader_.ReadSymbol(
+ symbol_decoder_context_.is_wedge_inter_intra_cdf[block.size]);
+ if (!prediction_parameters.is_wedge_inter_intra) return;
+ prediction_parameters.wedge_index =
+ reader_.ReadSymbol<kWedgeIndexSymbolCount>(
+ symbol_decoder_context_.wedge_index_cdf[block.size]);
+ prediction_parameters.wedge_sign = 0;
+}
+
+void Tile::ReadMotionMode(const Block& block, bool is_compound,
+ bool skip_mode) {
+ BlockParameters& bp = *block.bp;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ const auto global_motion_type =
+ frame_header_.global_motion[bp.reference_frame[0]].type;
+ if (skip_mode || !frame_header_.is_motion_mode_switchable ||
+ IsBlockDimension4(block.size) ||
+ (frame_header_.force_integer_mv == 0 &&
+ (bp.y_mode == kPredictionModeGlobalMv ||
+ bp.y_mode == kPredictionModeGlobalGlobalMv) &&
+ global_motion_type > kGlobalMotionTransformationTypeTranslation) ||
+ is_compound || bp.reference_frame[1] == kReferenceFrameIntra ||
+ !block.HasOverlappableCandidates()) {
+ prediction_parameters.motion_mode = kMotionModeSimple;
+ return;
+ }
+ prediction_parameters.num_warp_samples = 0;
+ int num_samples_scanned = 0;
+ memset(prediction_parameters.warp_estimate_candidates, 0,
+ sizeof(prediction_parameters.warp_estimate_candidates));
+ FindWarpSamples(block, &prediction_parameters.num_warp_samples,
+ &num_samples_scanned,
+ prediction_parameters.warp_estimate_candidates);
+ if (frame_header_.force_integer_mv != 0 ||
+ prediction_parameters.num_warp_samples == 0 ||
+ !frame_header_.allow_warped_motion || IsScaled(bp.reference_frame[0])) {
+ prediction_parameters.motion_mode =
+ reader_.ReadSymbol(symbol_decoder_context_.use_obmc_cdf[block.size])
+ ? kMotionModeObmc
+ : kMotionModeSimple;
+ return;
+ }
+ prediction_parameters.motion_mode =
+ static_cast<MotionMode>(reader_.ReadSymbol<kNumMotionModes>(
+ symbol_decoder_context_.motion_mode_cdf[block.size]));
+}
+
+uint16_t* Tile::GetIsExplicitCompoundTypeCdf(const Block& block) {
+ int context = 0;
+ if (block.top_available[kPlaneY]) {
+ if (!block.IsTopSingle()) {
+ context += static_cast<int>(
+ block.top_context
+ ->is_explicit_compound_type[block.top_context_index]);
+ } else if (block.TopReference(0) == kReferenceFrameAlternate) {
+ context += 3;
+ }
+ }
+ if (block.left_available[kPlaneY]) {
+ if (!block.IsLeftSingle()) {
+ context += static_cast<int>(
+ left_context_.is_explicit_compound_type[block.left_context_index]);
+ } else if (block.LeftReference(0) == kReferenceFrameAlternate) {
+ context += 3;
+ }
+ }
+ return symbol_decoder_context_.is_explicit_compound_type_cdf[std::min(
+ context, kIsExplicitCompoundTypeContexts - 1)];
+}
+
+uint16_t* Tile::GetIsCompoundTypeAverageCdf(const Block& block) {
+ const BlockParameters& bp = *block.bp;
+ const ReferenceInfo& reference_info = *current_frame_.reference_info();
+ const int forward =
+ std::abs(reference_info.relative_distance_from[bp.reference_frame[0]]);
+ const int backward =
+ std::abs(reference_info.relative_distance_from[bp.reference_frame[1]]);
+ int context = (forward == backward) ? 3 : 0;
+ if (block.top_available[kPlaneY]) {
+ if (!block.IsTopSingle()) {
+ context += static_cast<int>(
+ block.top_context->is_compound_type_average[block.top_context_index]);
+ } else if (block.TopReference(0) == kReferenceFrameAlternate) {
+ ++context;
+ }
+ }
+ if (block.left_available[kPlaneY]) {
+ if (!block.IsLeftSingle()) {
+ context += static_cast<int>(
+ left_context_.is_compound_type_average[block.left_context_index]);
+ } else if (block.LeftReference(0) == kReferenceFrameAlternate) {
+ ++context;
+ }
+ }
+ return symbol_decoder_context_.is_compound_type_average_cdf[context];
+}
+
+void Tile::ReadCompoundType(const Block& block, bool is_compound,
+ bool skip_mode,
+ bool* const is_explicit_compound_type,
+ bool* const is_compound_type_average) {
+ *is_explicit_compound_type = false;
+ *is_compound_type_average = true;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ if (skip_mode) {
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeAverage;
+ return;
+ }
+ if (is_compound) {
+ if (sequence_header_.enable_masked_compound) {
+ *is_explicit_compound_type =
+ reader_.ReadSymbol(GetIsExplicitCompoundTypeCdf(block));
+ }
+ if (*is_explicit_compound_type) {
+ if (kIsWedgeCompoundModeAllowed.Contains(block.size)) {
+ // Only kCompoundPredictionTypeWedge and
+ // kCompoundPredictionTypeDiffWeighted are signaled explicitly.
+ prediction_parameters.compound_prediction_type =
+ static_cast<CompoundPredictionType>(reader_.ReadSymbol(
+ symbol_decoder_context_.compound_type_cdf[block.size]));
+ } else {
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeDiffWeighted;
+ }
+ } else {
+ if (sequence_header_.enable_jnt_comp) {
+ *is_compound_type_average =
+ reader_.ReadSymbol(GetIsCompoundTypeAverageCdf(block));
+ prediction_parameters.compound_prediction_type =
+ *is_compound_type_average ? kCompoundPredictionTypeAverage
+ : kCompoundPredictionTypeDistance;
+ } else {
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeAverage;
+ return;
+ }
+ }
+ if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeWedge) {
+ prediction_parameters.wedge_index =
+ reader_.ReadSymbol<kWedgeIndexSymbolCount>(
+ symbol_decoder_context_.wedge_index_cdf[block.size]);
+ prediction_parameters.wedge_sign = static_cast<int>(reader_.ReadBit());
+ } else if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeDiffWeighted) {
+ prediction_parameters.mask_is_inverse = reader_.ReadBit() != 0;
+ }
+ return;
+ }
+ if (prediction_parameters.inter_intra_mode != kNumInterIntraModes) {
+ prediction_parameters.compound_prediction_type =
+ prediction_parameters.is_wedge_inter_intra
+ ? kCompoundPredictionTypeWedge
+ : kCompoundPredictionTypeIntra;
+ return;
+ }
+ prediction_parameters.compound_prediction_type =
+ kCompoundPredictionTypeAverage;
+}
+
+uint16_t* Tile::GetInterpolationFilterCdf(const Block& block, int direction) {
+ const BlockParameters& bp = *block.bp;
+ int context = MultiplyBy8(direction) +
+ MultiplyBy4(static_cast<int>(bp.reference_frame[1] >
+ kReferenceFrameIntra));
+ int top_type = kNumExplicitInterpolationFilters;
+ if (block.top_available[kPlaneY]) {
+ if (block.bp_top->reference_frame[0] == bp.reference_frame[0] ||
+ block.bp_top->reference_frame[1] == bp.reference_frame[0]) {
+ top_type = block.bp_top->interpolation_filter[direction];
+ }
+ }
+ int left_type = kNumExplicitInterpolationFilters;
+ if (block.left_available[kPlaneY]) {
+ if (block.bp_left->reference_frame[0] == bp.reference_frame[0] ||
+ block.bp_left->reference_frame[1] == bp.reference_frame[0]) {
+ left_type = block.bp_left->interpolation_filter[direction];
+ }
+ }
+ if (left_type == top_type) {
+ context += left_type;
+ } else if (left_type == kNumExplicitInterpolationFilters) {
+ context += top_type;
+ } else if (top_type == kNumExplicitInterpolationFilters) {
+ context += left_type;
+ } else {
+ context += kNumExplicitInterpolationFilters;
+ }
+ return symbol_decoder_context_.interpolation_filter_cdf[context];
+}
+
+void Tile::ReadInterpolationFilter(const Block& block, bool skip_mode) {
+ BlockParameters& bp = *block.bp;
+ if (frame_header_.interpolation_filter != kInterpolationFilterSwitchable) {
+ static_assert(
+ sizeof(bp.interpolation_filter) / sizeof(bp.interpolation_filter[0]) ==
+ 2,
+ "Interpolation filter array size is not 2");
+ for (auto& interpolation_filter : bp.interpolation_filter) {
+ interpolation_filter = frame_header_.interpolation_filter;
+ }
+ return;
+ }
+ bool interpolation_filter_present = true;
+ if (skip_mode ||
+ block.bp->prediction_parameters->motion_mode == kMotionModeLocalWarp) {
+ interpolation_filter_present = false;
+ } else if (!IsBlockDimension4(block.size) &&
+ bp.y_mode == kPredictionModeGlobalMv) {
+ interpolation_filter_present =
+ frame_header_.global_motion[bp.reference_frame[0]].type ==
+ kGlobalMotionTransformationTypeTranslation;
+ } else if (!IsBlockDimension4(block.size) &&
+ bp.y_mode == kPredictionModeGlobalGlobalMv) {
+ interpolation_filter_present =
+ frame_header_.global_motion[bp.reference_frame[0]].type ==
+ kGlobalMotionTransformationTypeTranslation ||
+ frame_header_.global_motion[bp.reference_frame[1]].type ==
+ kGlobalMotionTransformationTypeTranslation;
+ }
+ for (int i = 0; i < (sequence_header_.enable_dual_filter ? 2 : 1); ++i) {
+ bp.interpolation_filter[i] =
+ interpolation_filter_present
+ ? static_cast<InterpolationFilter>(
+ reader_.ReadSymbol<kNumExplicitInterpolationFilters>(
+ GetInterpolationFilterCdf(block, i)))
+ : kInterpolationFilterEightTap;
+ }
+ if (!sequence_header_.enable_dual_filter) {
+ bp.interpolation_filter[1] = bp.interpolation_filter[0];
+ }
+}
+
+void Tile::SetCdfContextCompoundType(const Block& block,
+ bool is_explicit_compound_type,
+ bool is_compound_type_average) {
+ memset(left_context_.is_explicit_compound_type + block.left_context_index,
+ static_cast<int>(is_explicit_compound_type), block.height4x4);
+ memset(left_context_.is_compound_type_average + block.left_context_index,
+ static_cast<int>(is_compound_type_average), block.height4x4);
+ memset(block.top_context->is_explicit_compound_type + block.top_context_index,
+ static_cast<int>(is_explicit_compound_type), block.width4x4);
+ memset(block.top_context->is_compound_type_average + block.top_context_index,
+ static_cast<int>(is_compound_type_average), block.width4x4);
+}
+
+bool Tile::ReadInterBlockModeInfo(const Block& block, bool skip_mode) {
+ BlockParameters& bp = *block.bp;
+ bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] = 0;
+ bp.prediction_parameters->palette_mode_info.size[kPlaneTypeUV] = 0;
+ SetCdfContextPaletteSize(block);
+ ReadReferenceFrames(block, skip_mode);
+ const bool is_compound = bp.reference_frame[1] > kReferenceFrameIntra;
+ MvContexts mode_contexts;
+ FindMvStack(block, is_compound, &mode_contexts);
+ ReadInterPredictionModeY(block, mode_contexts, skip_mode);
+ ReadRefMvIndex(block);
+ if (!AssignInterMv(block, is_compound)) return false;
+ ReadInterIntraMode(block, is_compound, skip_mode);
+ ReadMotionMode(block, is_compound, skip_mode);
+ bool is_explicit_compound_type;
+ bool is_compound_type_average;
+ ReadCompoundType(block, is_compound, skip_mode, &is_explicit_compound_type,
+ &is_compound_type_average);
+ SetCdfContextCompoundType(block, is_explicit_compound_type,
+ is_compound_type_average);
+ ReadInterpolationFilter(block, skip_mode);
+ return true;
+}
+
+void Tile::SetCdfContextSkipMode(const Block& block, bool skip_mode) {
+ memset(left_context_.skip_mode + block.left_context_index,
+ static_cast<int>(skip_mode), block.height4x4);
+ memset(block.top_context->skip_mode + block.top_context_index,
+ static_cast<int>(skip_mode), block.width4x4);
+}
+
+bool Tile::DecodeInterModeInfo(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ block.bp->prediction_parameters->use_intra_block_copy = false;
+ bp.skip = false;
+ if (!ReadInterSegmentId(block, /*pre_skip=*/true)) return false;
+ bool skip_mode = ReadSkipMode(block);
+ SetCdfContextSkipMode(block, skip_mode);
+ if (skip_mode) {
+ bp.skip = true;
+ } else {
+ ReadSkip(block);
+ }
+ if (!frame_header_.segmentation.segment_id_pre_skip &&
+ !ReadInterSegmentId(block, /*pre_skip=*/false)) {
+ return false;
+ }
+ ReadCdef(block);
+ if (read_deltas_) {
+ ReadQuantizerIndexDelta(block);
+ ReadLoopFilterDelta(block);
+ read_deltas_ = false;
+ }
+ ReadIsInter(block, skip_mode);
+ return bp.is_inter ? ReadInterBlockModeInfo(block, skip_mode)
+ : ReadIntraBlockModeInfo(block, /*intra_y_mode=*/false);
+}
+
+bool Tile::DecodeModeInfo(const Block& block) {
+ return IsIntraFrame(frame_header_.frame_type) ? DecodeIntraModeInfo(block)
+ : DecodeInterModeInfo(block);
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <memory>
+
+#include "src/obu_parser.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+int Tile::GetPaletteCache(const Block& block, PlaneType plane_type,
+ uint16_t* const cache) {
+ const int top_size =
+ (block.top_available[kPlaneY] && Mod64(MultiplyBy4(block.row4x4)) != 0)
+ ? block.top_context->palette_size[plane_type][block.top_context_index]
+ : 0;
+ const int left_size =
+ block.left_available[kPlaneY]
+ ? left_context_.palette_size[plane_type][block.left_context_index]
+ : 0;
+ if (left_size == 0 && top_size == 0) return 0;
+ // Merge the left and top colors in sorted order and store them in |cache|.
+ uint16_t empty_palette[1];
+ const uint16_t* top =
+ (top_size > 0) ? block.top_context
+ ->palette_color[block.top_context_index][plane_type]
+ : empty_palette;
+ const uint16_t* left =
+ (left_size > 0)
+ ? left_context_.palette_color[block.left_context_index][plane_type]
+ : empty_palette;
+ std::merge(top, top + top_size, left, left + left_size, cache);
+ // Deduplicate the entries in |cache| and return the number of unique
+ // entries.
+ return static_cast<int>(
+ std::distance(cache, std::unique(cache, cache + left_size + top_size)));
+}
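+
+// For example, if the top palette is {10, 20, 30} and the left palette is
+// {20, 40}, std::merge() yields {10, 20, 20, 30, 40} and std::unique() leaves
+// {10, 20, 30, 40}, so GetPaletteCache() returns 4.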
+
+void Tile::ReadPaletteColors(const Block& block, Plane plane) {
+ const PlaneType plane_type = GetPlaneType(plane);
+ uint16_t cache[2 * kMaxPaletteSize];
+ const int n = GetPaletteCache(block, plane_type, cache);
+ BlockParameters& bp = *block.bp;
+ const uint8_t palette_size =
+ bp.prediction_parameters->palette_mode_info.size[plane_type];
+ uint16_t* const palette_color =
+ bp.prediction_parameters->palette_mode_info.color[plane];
+ const int8_t bitdepth = sequence_header_.color_config.bitdepth;
+ int index = 0;
+ for (int i = 0; i < n && index < palette_size; ++i) {
+ if (reader_.ReadBit() != 0) { // use_palette_color_cache.
+ palette_color[index++] = cache[i];
+ }
+ }
+ const int merge_pivot = index;
+ if (index < palette_size) {
+ palette_color[index++] =
+ static_cast<uint16_t>(reader_.ReadLiteral(bitdepth));
+ }
+ const int max_value = (1 << bitdepth) - 1;
+ if (index < palette_size) {
+ int bits = bitdepth - 3 + static_cast<int>(reader_.ReadLiteral(2));
+ do {
+ const int delta = static_cast<int>(reader_.ReadLiteral(bits)) +
+ (plane_type == kPlaneTypeY ? 1 : 0);
+ palette_color[index] =
+ std::min(palette_color[index - 1] + delta, max_value);
+ if (palette_color[index] + (plane_type == kPlaneTypeY ? 1 : 0) >=
+ max_value) {
+ // Once the color exceeds max_value, all others can be set to max_value
+ // (since they are computed as a delta on top of the current color and
+ // then clipped).
+ Memset(&palette_color[index + 1], max_value, palette_size - index - 1);
+ break;
+ }
+ const int range = (1 << bitdepth) - palette_color[index] -
+ (plane_type == kPlaneTypeY ? 1 : 0);
+ bits = std::min(bits, CeilLog2(range));
+ } while (++index < palette_size);
+ }
+ // Palette colors are generated using two ascending arrays. So sorting them is
+ // simply a matter of merging the two sorted portions of the array.
+ std::inplace_merge(palette_color, palette_color + merge_pivot,
+ palette_color + palette_size);
+ if (plane_type == kPlaneTypeUV) {
+ uint16_t* const palette_color_v =
+ bp.prediction_parameters->palette_mode_info.color[kPlaneV];
+ if (reader_.ReadBit() != 0) { // delta_encode_palette_colors_v.
+ const int bits = bitdepth - 4 + static_cast<int>(reader_.ReadLiteral(2));
+ palette_color_v[0] = reader_.ReadLiteral(bitdepth);
+ for (int i = 1; i < palette_size; ++i) {
+ int delta = static_cast<int>(reader_.ReadLiteral(bits));
+ if (delta != 0 && reader_.ReadBit() != 0) delta = -delta;
+ // This line is equivalent to the following lines in the spec:
+ // val = palette_colors_v[ idx - 1 ] + palette_delta_v
+ // if ( val < 0 ) val += maxVal
+ // if ( val >= maxVal ) val -= maxVal
+ // palette_colors_v[ idx ] = Clip1( val )
+ //
+ // The difference is that in the code, max_value is (1 << bitdepth) - 1.
+ // So "& max_value" has the desired effect of computing both the "if"
+ // conditions and the Clip.
+ palette_color_v[i] = (palette_color_v[i - 1] + delta) & max_value;
+ }
+ } else {
+ for (int i = 0; i < palette_size; ++i) {
+ palette_color_v[i] =
+ static_cast<uint16_t>(reader_.ReadLiteral(bitdepth));
+ }
+ }
+ }
+}
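+
+// Example of the "& max_value" wraparound in ReadPaletteColors() for
+// bitdepth = 8 (max_value = 255): if palette_color_v[i - 1] is 250 and delta
+// is 10, then (250 + 10) & 255 = 4, which matches the spec's
+// "if (val >= maxVal) val -= maxVal" with maxVal = 256.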
+
+void Tile::ReadPaletteModeInfo(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] = 0;
+ bp.prediction_parameters->palette_mode_info.size[kPlaneTypeUV] = 0;
+ if (IsBlockSmallerThan8x8(block.size) || block.size > kBlock64x64 ||
+ !frame_header_.allow_screen_content_tools) {
+ return;
+ }
+ const int block_size_context =
+ k4x4WidthLog2[block.size] + k4x4HeightLog2[block.size] - 2;
+ if (bp.y_mode == kPredictionModeDc) {
+ const int context =
+ static_cast<int>(
+ block.top_available[kPlaneY] &&
+ block.top_context
+ ->palette_size[kPlaneTypeY][block.top_context_index] > 0) +
+ static_cast<int>(
+ block.left_available[kPlaneY] &&
+ left_context_.palette_size[kPlaneTypeY][block.left_context_index] >
+ 0);
+ const bool has_palette_y = reader_.ReadSymbol(
+ symbol_decoder_context_.has_palette_y_cdf[block_size_context][context]);
+ if (has_palette_y) {
+ bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] =
+ kMinPaletteSize +
+ reader_.ReadSymbol<kPaletteSizeSymbolCount>(
+ symbol_decoder_context_.palette_y_size_cdf[block_size_context]);
+ ReadPaletteColors(block, kPlaneY);
+ }
+ }
+ if (block.HasChroma() &&
+ bp.prediction_parameters->uv_mode == kPredictionModeDc) {
+ const int context = static_cast<int>(
+ bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] > 0);
+ const bool has_palette_uv =
+ reader_.ReadSymbol(symbol_decoder_context_.has_palette_uv_cdf[context]);
+ if (has_palette_uv) {
+ bp.prediction_parameters->palette_mode_info.size[kPlaneTypeUV] =
+ kMinPaletteSize +
+ reader_.ReadSymbol<kPaletteSizeSymbolCount>(
+ symbol_decoder_context_.palette_uv_size_cdf[block_size_context]);
+ ReadPaletteColors(block, kPlaneU);
+ }
+ }
+}
+
+void Tile::PopulatePaletteColorContexts(
+ const Block& block, PlaneType plane_type, int i, int start, int end,
+ uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize],
+ uint8_t color_context[kMaxPaletteSquare]) {
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
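+  // Each call processes one anti-diagonal of the color index map: for every
+  // column in [end, start], the corresponding row is i - column, so the top
+  // and left neighbors of each entry lie on the previous diagonal and are
+  // already decoded.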
+ for (int column = start, counter = 0; column >= end; --column, ++counter) {
+ const int row = i - column;
+ assert(row > 0 || column > 0);
+ const uint8_t top =
+ (row > 0)
+ ? prediction_parameters.color_index_map[plane_type][row - 1][column]
+ : 0;
+ const uint8_t left =
+ (column > 0)
+ ? prediction_parameters.color_index_map[plane_type][row][column - 1]
+ : 0;
+ uint8_t index_mask;
+ static_assert(kMaxPaletteSize <= 8, "");
+ int index;
+ if (column <= 0) {
+ color_context[counter] = 0;
+ color_order[counter][0] = top;
+ index_mask = 1 << top;
+ index = 1;
+ } else if (row <= 0) {
+ color_context[counter] = 0;
+ color_order[counter][0] = left;
+ index_mask = 1 << left;
+ index = 1;
+ } else {
+ const uint8_t top_left =
+ prediction_parameters
+ .color_index_map[plane_type][row - 1][column - 1];
+ index_mask = (1 << top) | (1 << left) | (1 << top_left);
+ if (top == left && top == top_left) {
+ color_context[counter] = 4;
+ color_order[counter][0] = top;
+ index = 1;
+ } else if (top == left) {
+ color_context[counter] = 3;
+ color_order[counter][0] = top;
+ color_order[counter][1] = top_left;
+ index = 2;
+ } else if (top == top_left) {
+ color_context[counter] = 2;
+ color_order[counter][0] = top_left;
+ color_order[counter][1] = left;
+ index = 2;
+ } else if (left == top_left) {
+ color_context[counter] = 2;
+ color_order[counter][0] = top_left;
+ color_order[counter][1] = top;
+ index = 2;
+ } else {
+ color_context[counter] = 1;
+ color_order[counter][0] = std::min(top, left);
+ color_order[counter][1] = std::max(top, left);
+ color_order[counter][2] = top_left;
+ index = 3;
+ }
+ }
+    // Even though only the first |palette_size| entries of this array are
+    // ever used, it is faster to populate all 8 because of the vectorization
+    // of the constant-sized loop.
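+    // For instance, if top == left == top_left == 2, then color_context is 4,
+    // index_mask is 0b100, and the loop below fills color_order with
+    // {2, 0, 1, 3, 4, 5, 6, 7}.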
+ for (uint8_t j = 0; j < kMaxPaletteSize; ++j) {
+ if (BitMaskSet::MaskContainsValue(index_mask, j)) continue;
+ color_order[counter][index++] = j;
+ }
+ }
+}
+
+bool Tile::ReadPaletteTokens(const Block& block) {
+ const PaletteModeInfo& palette_mode_info =
+ block.bp->prediction_parameters->palette_mode_info;
+ PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ for (int plane_type = kPlaneTypeY;
+ plane_type < (block.HasChroma() ? kNumPlaneTypes : kPlaneTypeUV);
+ ++plane_type) {
+ const int palette_size = palette_mode_info.size[plane_type];
+ if (palette_size == 0) continue;
+ int block_height = block.height;
+ int block_width = block.width;
+ int screen_height = std::min(
+ block_height, MultiplyBy4(frame_header_.rows4x4 - block.row4x4));
+ int screen_width = std::min(
+ block_width, MultiplyBy4(frame_header_.columns4x4 - block.column4x4));
+ if (plane_type == kPlaneTypeUV) {
+ block_height >>= sequence_header_.color_config.subsampling_y;
+ block_width >>= sequence_header_.color_config.subsampling_x;
+ screen_height >>= sequence_header_.color_config.subsampling_y;
+ screen_width >>= sequence_header_.color_config.subsampling_x;
+ if (block_height < 4) {
+ block_height += 2;
+ screen_height += 2;
+ }
+ if (block_width < 4) {
+ block_width += 2;
+ screen_width += 2;
+ }
+ }
+ if (!prediction_parameters.color_index_map[plane_type].Reset(
+ block_height, block_width, /*zero_initialize=*/false)) {
+ return false;
+ }
+ int first_value = 0;
+ reader_.DecodeUniform(palette_size, &first_value);
+ prediction_parameters.color_index_map[plane_type][0][0] = first_value;
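+    // The remaining indices are decoded in anti-diagonal (wavefront) order:
+    // the coding context of each index depends on its top and left neighbors,
+    // which lie on the previous diagonal.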
+ for (int i = 1; i < screen_height + screen_width - 1; ++i) {
+ const int start = std::min(i, screen_width - 1);
+ const int end = std::max(0, i - screen_height + 1);
+ uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize];
+ uint8_t color_context[kMaxPaletteSquare];
+ PopulatePaletteColorContexts(block, static_cast<PlaneType>(plane_type), i,
+ start, end, color_order, color_context);
+ for (int j = start, counter = 0; j >= end; --j, ++counter) {
+ uint16_t* const cdf =
+ symbol_decoder_context_
+ .palette_color_index_cdf[plane_type]
+ [palette_size - kMinPaletteSize]
+ [color_context[counter]];
+ const int color_order_index = reader_.ReadSymbol(cdf, palette_size);
+ prediction_parameters.color_index_map[plane_type][i - j][j] =
+ color_order[counter][color_order_index];
+ }
+ }
+ if (screen_width < block_width) {
+ for (int i = 0; i < screen_height; ++i) {
+ memset(
+ &prediction_parameters.color_index_map[plane_type][i][screen_width],
+ prediction_parameters
+ .color_index_map[plane_type][i][screen_width - 1],
+ block_width - screen_width);
+ }
+ }
+ for (int i = screen_height; i < block_height; ++i) {
+ memcpy(
+ prediction_parameters.color_index_map[plane_type][i],
+ prediction_parameters.color_index_map[plane_type][screen_height - 1],
+ block_width);
+ }
+ }
+ return true;
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+uint16_t PartitionCdfGatherHorizontalAlike(const uint16_t* const partition_cdf,
+ BlockSize block_size) {
+ // The spec computes the cdf value using the following formula (not writing
+ // partition_cdf[] and using short forms for partition names for clarity):
+ // cdf = None - H + V - S + S - HTS + HTS - HBS + HBS - VLS;
+ // if (block_size != 128x128) {
+ // cdf += VRS - H4;
+ // }
+ // After canceling out the repeated terms with opposite signs, we have:
+ // cdf = None - H + V - VLS;
+ // if (block_size != 128x128) {
+ // cdf += VRS - H4;
+ // }
+ uint16_t cdf = partition_cdf[kPartitionNone] -
+ partition_cdf[kPartitionHorizontal] +
+ partition_cdf[kPartitionVertical] -
+ partition_cdf[kPartitionVerticalWithLeftSplit];
+ if (block_size != kBlock128x128) {
+ cdf += partition_cdf[kPartitionVerticalWithRightSplit] -
+ partition_cdf[kPartitionHorizontal4];
+ }
+ return cdf;
+}
+
+uint16_t PartitionCdfGatherVerticalAlike(const uint16_t* const partition_cdf,
+ BlockSize block_size) {
+ // The spec computes the cdf value using the following formula (not writing
+ // partition_cdf[] and using short forms for partition names for clarity):
+ // cdf = H - V + V - S + HBS - VLS + VLS - VRS + S - HTS;
+ // if (block_size != 128x128) {
+ // cdf += H4 - V4;
+ // }
+ // V4 is always zero. So, after canceling out the repeated terms with opposite
+ // signs, we have:
+ // cdf = H + HBS - VRS - HTS;
+ // if (block_size != 128x128) {
+ // cdf += H4;
+ // }
+ // VRS is zero for 128x128 blocks. So, further simplifying we have:
+ // cdf = H + HBS - HTS;
+ // if (block_size != 128x128) {
+ // cdf += H4 - VRS;
+ // }
+ uint16_t cdf = partition_cdf[kPartitionHorizontal] +
+ partition_cdf[kPartitionHorizontalWithBottomSplit] -
+ partition_cdf[kPartitionHorizontalWithTopSplit];
+ if (block_size != kBlock128x128) {
+ cdf += partition_cdf[kPartitionHorizontal4] -
+ partition_cdf[kPartitionVerticalWithRightSplit];
+ }
+ return cdf;
+}
+
+} // namespace
+
+uint16_t* Tile::GetPartitionCdf(int row4x4, int column4x4,
+ BlockSize block_size) {
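+  // The context is based on whether the above neighbor block is narrower and
+  // the left neighbor block shorter than the current block (which is square
+  // at partition time).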
+ const int block_size_log2 = k4x4WidthLog2[block_size];
+ int top = 0;
+ if (IsTopInside(row4x4)) {
+ top = static_cast<int>(
+ k4x4WidthLog2[block_parameters_holder_.Find(row4x4 - 1, column4x4)
+ ->size] < block_size_log2);
+ }
+ int left = 0;
+ if (IsLeftInside(column4x4)) {
+ left = static_cast<int>(
+ k4x4HeightLog2[block_parameters_holder_.Find(row4x4, column4x4 - 1)
+ ->size] < block_size_log2);
+ }
+ const int context = left * 2 + top;
+ return symbol_decoder_context_.partition_cdf[block_size_log2 - 1][context];
+}
+
+bool Tile::ReadPartition(int row4x4, int column4x4, BlockSize block_size,
+ bool has_rows, bool has_columns,
+ Partition* const partition) {
+ if (IsBlockSmallerThan8x8(block_size)) {
+ *partition = kPartitionNone;
+ return true;
+ }
+ if (!has_rows && !has_columns) {
+ *partition = kPartitionSplit;
+ return true;
+ }
+ uint16_t* const partition_cdf =
+ GetPartitionCdf(row4x4, column4x4, block_size);
+ if (partition_cdf == nullptr) {
+ return false;
+ }
+ if (has_rows && has_columns) {
+ const int bsize_log2 = k4x4WidthLog2[block_size];
+ // The partition block size should be 8x8 or above.
+ assert(bsize_log2 > 0);
+ if (bsize_log2 == 1) {
+ *partition = static_cast<Partition>(
+ reader_.ReadSymbol<kPartitionSplit + 1>(partition_cdf));
+ } else if (bsize_log2 == 5) {
+ *partition = static_cast<Partition>(
+ reader_.ReadSymbol<kPartitionVerticalWithRightSplit + 1>(
+ partition_cdf));
+ } else {
+ *partition = static_cast<Partition>(
+ reader_.ReadSymbol<kMaxPartitionTypes>(partition_cdf));
+ }
+ } else if (has_columns) {
+ const uint16_t cdf =
+ PartitionCdfGatherVerticalAlike(partition_cdf, block_size);
+ *partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
+ : kPartitionHorizontal;
+ } else {
+ const uint16_t cdf =
+ PartitionCdfGatherHorizontalAlike(partition_cdf, block_size);
+ *partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
+ : kPartitionVertical;
+ }
+ return true;
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/obu_parser.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/stack.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr uint8_t kMaxVariableTransformTreeDepth = 2;
+// Max_Tx_Depth array from section 5.11.5 of the spec, with one modification:
+// every nonzero element is decremented by one, since that is the only form in
+// which this array is ever used.
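+// For instance, the spec's Max_Tx_Depth is 1 for kBlock8x8 and 4 for
+// kBlock64x64; the corresponding entries below are 0 and 3.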
+constexpr int kTxDepthCdfIndex[kMaxBlockSizes] = {
+ 0, 0, 1, 0, 0, 1, 2, 1, 1, 1, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3};
+
+constexpr TransformSize kMaxTransformSizeRectangle[kMaxBlockSizes] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+ kTransformSize64x64, kTransformSize64x64, kTransformSize64x64,
+ kTransformSize64x64};
+
+TransformSize GetSquareTransformSize(uint8_t pixels) {
+ switch (pixels) {
+ case 128:
+ case 64:
+ return kTransformSize64x64;
+ case 32:
+ return kTransformSize32x32;
+ case 16:
+ return kTransformSize16x16;
+ case 8:
+ return kTransformSize8x8;
+ default:
+ return kTransformSize4x4;
+ }
+}
+
+} // namespace
+
+int Tile::GetTopTransformWidth(const Block& block, int row4x4, int column4x4,
+ bool ignore_skip) {
+ if (row4x4 == block.row4x4) {
+ if (!block.top_available[kPlaneY]) return 64;
+ const BlockParameters& bp_top =
+ *block_parameters_holder_.Find(row4x4 - 1, column4x4);
+ if ((ignore_skip || bp_top.skip) && bp_top.is_inter) {
+ return kBlockWidthPixels[bp_top.size];
+ }
+ }
+ return kTransformWidth[inter_transform_sizes_[row4x4 - 1][column4x4]];
+}
+
+int Tile::GetLeftTransformHeight(const Block& block, int row4x4, int column4x4,
+ bool ignore_skip) {
+ if (column4x4 == block.column4x4) {
+ if (!block.left_available[kPlaneY]) return 64;
+ const BlockParameters& bp_left =
+ *block_parameters_holder_.Find(row4x4, column4x4 - 1);
+ if ((ignore_skip || bp_left.skip) && bp_left.is_inter) {
+ return kBlockHeightPixels[bp_left.size];
+ }
+ }
+ return kTransformHeight[inter_transform_sizes_[row4x4][column4x4 - 1]];
+}
+
+TransformSize Tile::ReadFixedTransformSize(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (frame_header_.segmentation
+ .lossless[bp.prediction_parameters->segment_id]) {
+ return kTransformSize4x4;
+ }
+ const TransformSize max_rect_tx_size = kMaxTransformSizeRectangle[block.size];
+ const bool allow_select = !bp.skip || !bp.is_inter;
+ if (block.size == kBlock4x4 || !allow_select ||
+ frame_header_.tx_mode != kTxModeSelect) {
+ return max_rect_tx_size;
+ }
+ const int max_tx_width = kTransformWidth[max_rect_tx_size];
+ const int max_tx_height = kTransformHeight[max_rect_tx_size];
+ const int top_width =
+ block.top_available[kPlaneY]
+ ? GetTopTransformWidth(block, block.row4x4, block.column4x4, true)
+ : 0;
+ const int left_height =
+ block.left_available[kPlaneY]
+ ? GetLeftTransformHeight(block, block.row4x4, block.column4x4, true)
+ : 0;
+ const auto context = static_cast<int>(top_width >= max_tx_width) +
+ static_cast<int>(left_height >= max_tx_height);
+ const int cdf_index = kTxDepthCdfIndex[block.size];
+ uint16_t* const cdf =
+ symbol_decoder_context_.tx_depth_cdf[cdf_index][context];
+ const int tx_depth = (cdf_index == 0)
+ ? static_cast<int>(reader_.ReadSymbol(cdf))
+ : reader_.ReadSymbol<3>(cdf);
+ assert(tx_depth < 3);
+ TransformSize tx_size = max_rect_tx_size;
+ if (tx_depth == 0) return tx_size;
+ tx_size = kSplitTransformSize[tx_size];
+ if (tx_depth == 1) return tx_size;
+ return kSplitTransformSize[tx_size];
+}
+
+void Tile::ReadVariableTransformTree(const Block& block, int row4x4,
+ int column4x4, TransformSize tx_size) {
+ const uint8_t pixels = std::max(block.width, block.height);
+ const TransformSize max_tx_size = GetSquareTransformSize(pixels);
+ const int context_delta = (kNumSquareTransformSizes - 1 -
+ TransformSizeToSquareTransformIndex(max_tx_size)) *
+ 6;
+
+ // Branching factor is 4 and maximum depth is 2. So the maximum stack size
+ // necessary is (4 - 1) + 4 = 7.
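+  // (Worst case: the root's four children are pushed, one of them is popped,
+  // and its own four children are pushed while its three siblings remain.)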
+ Stack<TransformTreeNode, 7> stack;
+ stack.Push(TransformTreeNode(column4x4, row4x4, tx_size, 0));
+
+ do {
+ TransformTreeNode node = stack.Pop();
+ const int tx_width4x4 = kTransformWidth4x4[node.tx_size];
+ const int tx_height4x4 = kTransformHeight4x4[node.tx_size];
+ if (node.tx_size != kTransformSize4x4 &&
+ node.depth != kMaxVariableTransformTreeDepth) {
+ const auto top =
+ static_cast<int>(GetTopTransformWidth(block, node.y, node.x, false) <
+ kTransformWidth[node.tx_size]);
+ const auto left = static_cast<int>(
+ GetLeftTransformHeight(block, node.y, node.x, false) <
+ kTransformHeight[node.tx_size]);
+ const int context =
+ static_cast<int>(max_tx_size > kTransformSize8x8 &&
+ kTransformSizeSquareMax[node.tx_size] !=
+ max_tx_size) *
+ 3 +
+ context_delta + top + left;
+ // tx_split.
+ if (reader_.ReadSymbol(symbol_decoder_context_.tx_split_cdf[context])) {
+ const TransformSize sub_tx_size = kSplitTransformSize[node.tx_size];
+ const int step_width4x4 = kTransformWidth4x4[sub_tx_size];
+ const int step_height4x4 = kTransformHeight4x4[sub_tx_size];
+ // The loops have to run in reverse order because we use a stack for
+ // DFS.
+ for (int i = tx_height4x4 - step_height4x4; i >= 0;
+ i -= step_height4x4) {
+ for (int j = tx_width4x4 - step_width4x4; j >= 0;
+ j -= step_width4x4) {
+ if (node.y + i >= frame_header_.rows4x4 ||
+ node.x + j >= frame_header_.columns4x4) {
+ continue;
+ }
+ stack.Push(TransformTreeNode(node.x + j, node.y + i, sub_tx_size,
+ node.depth + 1));
+ }
+ }
+ continue;
+ }
+ }
+ // tx_split is false.
+ for (int i = 0; i < tx_height4x4; ++i) {
+ static_assert(sizeof(TransformSize) == 1, "");
+ memset(&inter_transform_sizes_[node.y + i][node.x], node.tx_size,
+ tx_width4x4);
+ }
+ } while (!stack.Empty());
+}
+
+void Tile::DecodeTransformSize(const Block& block) {
+ BlockParameters& bp = *block.bp;
+ if (frame_header_.tx_mode == kTxModeSelect && block.size > kBlock4x4 &&
+ bp.is_inter && !bp.skip &&
+ !frame_header_.segmentation
+ .lossless[bp.prediction_parameters->segment_id]) {
+ const TransformSize max_tx_size = kMaxTransformSizeRectangle[block.size];
+ const int tx_width4x4 = kTransformWidth4x4[max_tx_size];
+ const int tx_height4x4 = kTransformHeight4x4[max_tx_size];
+ for (int row = block.row4x4; row < block.row4x4 + block.height4x4;
+ row += tx_height4x4) {
+ for (int column = block.column4x4;
+ column < block.column4x4 + block.width4x4; column += tx_width4x4) {
+ ReadVariableTransformTree(block, row, column, max_tx_size);
+ }
+ }
+ } else {
+ const TransformSize transform_size = ReadFixedTransformSize(block);
+ for (int row = block.row4x4; row < block.row4x4 + block.height4x4; ++row) {
+ static_assert(sizeof(TransformSize) == 1, "");
+ memset(&inter_transform_sizes_[row][block.column4x4], transform_size,
+ block.width4x4);
+ }
+ }
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#include "src/buffer_pool.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/motion_vector.h"
+#include "src/obu_parser.h"
+#include "src/prediction_mask.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+#include "src/warp_prediction.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/inter_intra_masks.inc"
+
+// Precision bits when scaling reference frames.
+constexpr int kReferenceScaleShift = 14;
+constexpr int kAngleStep = 3;
+constexpr int kPredictionModeToAngle[kIntraPredictionModesUV] = {
+ 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0};
+
+// The following modes need both the left_column and top_row for intra
+// prediction. For directional modes left/top requirement is inferred based on
+// the prediction angle. For Dc modes, left/top requirement is inferred based on
+// whether or not left/top is available.
+constexpr BitMaskSet kNeedsLeftAndTop(kPredictionModeSmooth,
+ kPredictionModeSmoothHorizontal,
+ kPredictionModeSmoothVertical,
+ kPredictionModePaeth);
+
+int16_t GetDirectionalIntraPredictorDerivative(const int angle) {
+ assert(angle >= 3);
+ assert(angle <= 87);
+ return kDirectionalIntraPredictorDerivative[DivideBy2(angle) - 1];
+}
+
+// Maps the block_size to an index as follows:
+// kBlock8x8 => 0.
+// kBlock8x16 => 1.
+// kBlock8x32 => 2.
+// kBlock16x8 => 3.
+// kBlock16x16 => 4.
+// kBlock16x32 => 5.
+// kBlock32x8 => 6.
+// kBlock32x16 => 7.
+// kBlock32x32 => 8.
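+// The two subtractions in the return statement below skip over kBlock16x4 and
+// kBlock16x64, which have no wedge masks; e.g. GetWedgeBlockSizeIndex(
+// kBlock16x16) == (kBlock16x16 - kBlock8x8) - 1 - 0 == 4, matching the table
+// above.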
+int GetWedgeBlockSizeIndex(BlockSize block_size) {
+ assert(block_size >= kBlock8x8);
+ return block_size - kBlock8x8 - static_cast<int>(block_size >= kBlock16x8) -
+ static_cast<int>(block_size >= kBlock32x8);
+}
+
+// Maps a dimension of 4, 8, 16 and 32 to indices 0, 1, 2 and 3 respectively.
+int GetInterIntraMaskLookupIndex(int dimension) {
+ assert(dimension == 4 || dimension == 8 || dimension == 16 ||
+ dimension == 32);
+ return FloorLog2(dimension) - 2;
+}
+
+// 7.11.2.9.
+int GetIntraEdgeFilterStrength(int width, int height, int filter_type,
+ int delta) {
+ const int sum = width + height;
+ delta = std::abs(delta);
+ if (filter_type == 0) {
+ if (sum <= 8) {
+ if (delta >= 56) return 1;
+ } else if (sum <= 16) {
+ if (delta >= 40) return 1;
+ } else if (sum <= 24) {
+ if (delta >= 32) return 3;
+ if (delta >= 16) return 2;
+ if (delta >= 8) return 1;
+ } else if (sum <= 32) {
+ if (delta >= 32) return 3;
+ if (delta >= 4) return 2;
+ return 1;
+ } else {
+ return 3;
+ }
+ } else {
+ if (sum <= 8) {
+ if (delta >= 64) return 2;
+ if (delta >= 40) return 1;
+ } else if (sum <= 16) {
+ if (delta >= 48) return 2;
+ if (delta >= 20) return 1;
+ } else if (sum <= 24) {
+ if (delta >= 4) return 3;
+ } else {
+ return 3;
+ }
+ }
+ return 0;
+}
+
+// 7.11.2.10.
+bool DoIntraEdgeUpsampling(int width, int height, int filter_type, int delta) {
+ const int sum = width + height;
+ delta = std::abs(delta);
+ // This function should not be called when the prediction angle is 90 or 180.
+ assert(delta != 0);
+ if (delta >= 40) return false;
+ return (filter_type == 1) ? sum <= 8 : sum <= 16;
+}
+
+constexpr uint8_t kQuantizedDistanceWeight[4][2] = {
+ {2, 3}, {2, 5}, {2, 7}, {1, kMaxFrameDistance}};
+
+constexpr uint8_t kQuantizedDistanceLookup[4][2] = {
+ {9, 7}, {11, 5}, {12, 4}, {13, 3}};
+
+void GetDistanceWeights(const int distance[2], int weight[2]) {
+ // Note: distance[0] and distance[1] correspond to relative distance
+ // between current frame and reference frame [1] and [0], respectively.
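+  // For instance (illustrative distances): with distance == {3, 4}, order
+  // below is 1, the loop breaks at i == 0 (since 3 * 3 > 4 * 2), and the
+  // weights become {7, 9}: the closer reference receives the larger weight,
+  // and each weight pair sums to 16.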
+ const int order = static_cast<int>(distance[0] <= distance[1]);
+ if (distance[0] == 0 || distance[1] == 0) {
+ weight[0] = kQuantizedDistanceLookup[3][order];
+ weight[1] = kQuantizedDistanceLookup[3][1 - order];
+ } else {
+ int i;
+ for (i = 0; i < 3; ++i) {
+ const int weight_0 = kQuantizedDistanceWeight[i][order];
+ const int weight_1 = kQuantizedDistanceWeight[i][1 - order];
+ if (order == 0) {
+ if (distance[0] * weight_0 < distance[1] * weight_1) break;
+ } else {
+ if (distance[0] * weight_0 > distance[1] * weight_1) break;
+ }
+ }
+ weight[0] = kQuantizedDistanceLookup[i][order];
+ weight[1] = kQuantizedDistanceLookup[i][1 - order];
+ }
+}
+
+dsp::IntraPredictor GetIntraPredictor(PredictionMode mode, bool has_left,
+ bool has_top) {
+ if (mode == kPredictionModeDc) {
+ if (has_left && has_top) {
+ return dsp::kIntraPredictorDc;
+ }
+ if (has_left) {
+ return dsp::kIntraPredictorDcLeft;
+ }
+ if (has_top) {
+ return dsp::kIntraPredictorDcTop;
+ }
+ return dsp::kIntraPredictorDcFill;
+ }
+ switch (mode) {
+ case kPredictionModePaeth:
+ return dsp::kIntraPredictorPaeth;
+ case kPredictionModeSmooth:
+ return dsp::kIntraPredictorSmooth;
+ case kPredictionModeSmoothVertical:
+ return dsp::kIntraPredictorSmoothVertical;
+ case kPredictionModeSmoothHorizontal:
+ return dsp::kIntraPredictorSmoothHorizontal;
+ default:
+ return dsp::kNumIntraPredictors;
+ }
+}
+
+uint8_t* GetStartPoint(Array2DView<uint8_t>* const buffer, const int plane,
+ const int x, const int y, const int bitdepth) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) {
+ Array2DView<uint16_t> buffer16(
+ buffer[plane].rows(), buffer[plane].columns() / sizeof(uint16_t),
+ reinterpret_cast<uint16_t*>(&buffer[plane][0][0]));
+ return reinterpret_cast<uint8_t*>(&buffer16[y][x]);
+ }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ static_cast<void>(bitdepth);
+ return &buffer[plane][y][x];
+}
+
+int GetPixelPositionFromHighScale(int start, int step, int offset) {
+ return (start + step * offset) >> kScaleSubPixelBits;
+}
+
+dsp::MaskBlendFunc GetMaskBlendFunc(const dsp::Dsp& dsp, bool is_inter_intra,
+ bool is_wedge_inter_intra,
+ int subsampling_x, int subsampling_y) {
+ return (is_inter_intra && !is_wedge_inter_intra)
+ ? dsp.mask_blend[0][/*is_inter_intra=*/true]
+ : dsp.mask_blend[subsampling_x + subsampling_y][is_inter_intra];
+}
+
+} // namespace
+
+template <typename Pixel>
+void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y,
+ bool has_left, bool has_top, bool has_top_right,
+ bool has_bottom_left, PredictionMode mode,
+ TransformSize tx_size) {
+ const int width = kTransformWidth[tx_size];
+ const int height = kTransformHeight[tx_size];
+ const int x_shift = subsampling_x_[plane];
+ const int y_shift = subsampling_y_[plane];
+ const int max_x = (MultiplyBy4(frame_header_.columns4x4) >> x_shift) - 1;
+ const int max_y = (MultiplyBy4(frame_header_.rows4x4) >> y_shift) - 1;
+ // For performance reasons, do not initialize the following two buffers.
+ alignas(kMaxAlignment) Pixel top_row_data[160];
+ alignas(kMaxAlignment) Pixel left_column_data[160];
+#if LIBGAV1_MSAN
+ if (IsDirectionalMode(mode)) {
+ memset(top_row_data, 0, sizeof(top_row_data));
+ memset(left_column_data, 0, sizeof(left_column_data));
+ }
+#endif
+ // Some predictors use |top_row_data| and |left_column_data| with a negative
+ // offset to access pixels to the top-left of the current block. So have some
+ // space before the arrays to allow populating those without having to move
+ // the rest of the array.
+ Pixel* const top_row = top_row_data + 16;
+ Pixel* const left_column = left_column_data + 16;
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ const int top_and_left_size = width + height;
+ const bool is_directional_mode = IsDirectionalMode(mode);
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ const bool use_filter_intra =
+ (plane == kPlaneY && prediction_parameters.use_filter_intra);
+ const int prediction_angle =
+ is_directional_mode
+ ? kPredictionModeToAngle[mode] +
+ prediction_parameters.angle_delta[GetPlaneType(plane)] *
+ kAngleStep
+ : 0;
+ // Directional prediction requires buffers larger than the width or height.
+ const int top_size = is_directional_mode ? top_and_left_size : width;
+ const int left_size = is_directional_mode ? top_and_left_size : height;
+ const int top_right_size =
+ is_directional_mode ? (has_top_right ? 2 : 1) * width : width;
+ const int bottom_left_size =
+ is_directional_mode ? (has_bottom_left ? 2 : 1) * height : height;
+
+ Array2DView<Pixel> buffer(buffer_[plane].rows(),
+ buffer_[plane].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+ const bool needs_top = use_filter_intra || kNeedsLeftAndTop.Contains(mode) ||
+ (is_directional_mode && prediction_angle < 180) ||
+ (mode == kPredictionModeDc && has_top);
+ const bool needs_left = use_filter_intra || kNeedsLeftAndTop.Contains(mode) ||
+ (is_directional_mode && prediction_angle > 90) ||
+ (mode == kPredictionModeDc && has_left);
+
+ const Pixel* top_row_src = buffer[y - 1];
+
+ // Determine if we need to retrieve the top row from
+ // |intra_prediction_buffer_|.
+ if ((needs_top || needs_left) && use_intra_prediction_buffer_) {
+ // Superblock index of block.row4x4. block.row4x4 is always in luma
+ // dimension (no subsampling).
+ const int current_superblock_index =
+ block.row4x4 >> (sequence_header_.use_128x128_superblock ? 5 : 4);
+ // Superblock index of y - 1. y is in the plane dimension (chroma planes
+ // could be subsampled).
+ const int plane_shift = (sequence_header_.use_128x128_superblock ? 7 : 6) -
+ subsampling_y_[plane];
+ const int top_row_superblock_index = (y - 1) >> plane_shift;
+ // If the superblock index of y - 1 is not that of the current superblock,
+ // then we will have to retrieve the top row from the
+ // |intra_prediction_buffer_|.
+ if (current_superblock_index != top_row_superblock_index) {
+ top_row_src = reinterpret_cast<const Pixel*>(
+ (*intra_prediction_buffer_)[plane].get());
+ }
+ }
+
+ if (needs_top) {
+ // Compute top_row.
+ if (has_top || has_left) {
+ const int left_index = has_left ? x - 1 : x;
+ top_row[-1] = has_top ? top_row_src[left_index] : buffer[y][left_index];
+ } else {
+ top_row[-1] = 1 << (bitdepth - 1);
+ }
+ if (!has_top && has_left) {
+ Memset(top_row, buffer[y][x - 1], top_size);
+ } else if (!has_top && !has_left) {
+ Memset(top_row, (1 << (bitdepth - 1)) - 1, top_size);
+ } else {
+ const int top_limit = std::min(max_x - x + 1, top_right_size);
+ memcpy(top_row, &top_row_src[x], top_limit * sizeof(Pixel));
+ // Even though it is safe to call Memset with a size of 0, accessing
+      // top_row_src[top_limit + x - 1] is not allowed when this condition is
+ // false.
+ if (top_size - top_limit > 0) {
+ Memset(top_row + top_limit, top_row_src[top_limit + x - 1],
+ top_size - top_limit);
+ }
+ }
+ }
+ if (needs_left) {
+ // Compute left_column.
+ if (has_top || has_left) {
+ const int left_index = has_left ? x - 1 : x;
+ left_column[-1] =
+ has_top ? top_row_src[left_index] : buffer[y][left_index];
+ } else {
+ left_column[-1] = 1 << (bitdepth - 1);
+ }
+ if (!has_left && has_top) {
+ Memset(left_column, top_row_src[x], left_size);
+ } else if (!has_left && !has_top) {
+ Memset(left_column, (1 << (bitdepth - 1)) + 1, left_size);
+ } else {
+ const int left_limit = std::min(max_y - y + 1, bottom_left_size);
+ for (int i = 0; i < left_limit; ++i) {
+ left_column[i] = buffer[y + i][x - 1];
+ }
+ // Even though it is safe to call Memset with a size of 0, accessing
+      // buffer[left_limit + y - 1][x - 1] is not allowed when this condition is
+ // false.
+ if (left_size - left_limit > 0) {
+ Memset(left_column + left_limit, buffer[left_limit + y - 1][x - 1],
+ left_size - left_limit);
+ }
+ }
+ }
+ Pixel* const dest = &buffer[y][x];
+ const ptrdiff_t dest_stride = buffer_[plane].columns();
+ if (use_filter_intra) {
+ dsp_.filter_intra_predictor(dest, dest_stride, top_row, left_column,
+ prediction_parameters.filter_intra_mode, width,
+ height);
+ } else if (is_directional_mode) {
+ DirectionalPrediction(block, plane, x, y, has_left, has_top, needs_left,
+ needs_top, prediction_angle, width, height, max_x,
+ max_y, tx_size, top_row, left_column);
+ } else {
+ const dsp::IntraPredictor predictor =
+ GetIntraPredictor(mode, has_left, has_top);
+ assert(predictor != dsp::kNumIntraPredictors);
+ dsp_.intra_predictors[tx_size][predictor](dest, dest_stride, top_row,
+ left_column);
+ }
+}
+
+template void Tile::IntraPrediction<uint8_t>(const Block& block, Plane plane,
+ int x, int y, bool has_left,
+ bool has_top, bool has_top_right,
+ bool has_bottom_left,
+ PredictionMode mode,
+ TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::IntraPrediction<uint16_t>(const Block& block, Plane plane,
+ int x, int y, bool has_left,
+ bool has_top, bool has_top_right,
+ bool has_bottom_left,
+ PredictionMode mode,
+ TransformSize tx_size);
+#endif
+
+int Tile::GetIntraEdgeFilterType(const Block& block, Plane plane) const {
+ bool top;
+ bool left;
+ if (plane == kPlaneY) {
+ top = block.top_available[kPlaneY] &&
+ kPredictionModeSmoothMask.Contains(block.bp_top->y_mode);
+ left = block.left_available[kPlaneY] &&
+ kPredictionModeSmoothMask.Contains(block.bp_left->y_mode);
+ } else {
+ top = block.top_available[plane] &&
+ block.bp->prediction_parameters->chroma_top_uses_smooth_prediction;
+ left = block.left_available[plane] &&
+ block.bp->prediction_parameters->chroma_left_uses_smooth_prediction;
+ }
+ return static_cast<int>(top || left);
+}
+
+template <typename Pixel>
+void Tile::DirectionalPrediction(const Block& block, Plane plane, int x, int y,
+ bool has_left, bool has_top, bool needs_left,
+ bool needs_top, int prediction_angle,
+ int width, int height, int max_x, int max_y,
+ TransformSize tx_size, Pixel* const top_row,
+ Pixel* const left_column) {
+ Array2DView<Pixel> buffer(buffer_[plane].rows(),
+ buffer_[plane].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+ Pixel* const dest = &buffer[y][x];
+ const ptrdiff_t stride = buffer_[plane].columns();
+ if (prediction_angle == 90) {
+ dsp_.intra_predictors[tx_size][dsp::kIntraPredictorVertical](
+ dest, stride, top_row, left_column);
+ return;
+ }
+ if (prediction_angle == 180) {
+ dsp_.intra_predictors[tx_size][dsp::kIntraPredictorHorizontal](
+ dest, stride, top_row, left_column);
+ return;
+ }
+
+ bool upsampled_top = false;
+ bool upsampled_left = false;
+ if (sequence_header_.enable_intra_edge_filter) {
+ const int filter_type = GetIntraEdgeFilterType(block, plane);
+ if (prediction_angle > 90 && prediction_angle < 180 &&
+ (width + height) >= 24) {
+ // 7.11.2.7.
+ left_column[-1] = top_row[-1] = RightShiftWithRounding(
+ left_column[0] * 5 + top_row[-1] * 6 + top_row[0] * 5, 4);
+ }
+ if (has_top && needs_top) {
+ const int strength = GetIntraEdgeFilterStrength(
+ width, height, filter_type, prediction_angle - 90);
+ if (strength > 0) {
+ const int num_pixels = std::min(width, max_x - x + 1) +
+ ((prediction_angle < 90) ? height : 0) + 1;
+ dsp_.intra_edge_filter(top_row - 1, num_pixels, strength);
+ }
+ }
+ if (has_left && needs_left) {
+ const int strength = GetIntraEdgeFilterStrength(
+ width, height, filter_type, prediction_angle - 180);
+ if (strength > 0) {
+ const int num_pixels = std::min(height, max_y - y + 1) +
+ ((prediction_angle > 180) ? width : 0) + 1;
+ dsp_.intra_edge_filter(left_column - 1, num_pixels, strength);
+ }
+ }
+ upsampled_top = DoIntraEdgeUpsampling(width, height, filter_type,
+ prediction_angle - 90);
+ if (upsampled_top && needs_top) {
+ const int num_pixels = width + ((prediction_angle < 90) ? height : 0);
+ dsp_.intra_edge_upsampler(top_row, num_pixels);
+ }
+ upsampled_left = DoIntraEdgeUpsampling(width, height, filter_type,
+ prediction_angle - 180);
+ if (upsampled_left && needs_left) {
+ const int num_pixels = height + ((prediction_angle > 180) ? width : 0);
+ dsp_.intra_edge_upsampler(left_column, num_pixels);
+ }
+ }
+
+ if (prediction_angle < 90) {
+ const int dx = GetDirectionalIntraPredictorDerivative(prediction_angle);
+ dsp_.directional_intra_predictor_zone1(dest, stride, top_row, width, height,
+ dx, upsampled_top);
+ } else if (prediction_angle < 180) {
+ const int dx =
+ GetDirectionalIntraPredictorDerivative(180 - prediction_angle);
+ const int dy =
+ GetDirectionalIntraPredictorDerivative(prediction_angle - 90);
+ dsp_.directional_intra_predictor_zone2(dest, stride, top_row, left_column,
+ width, height, dx, dy, upsampled_top,
+ upsampled_left);
+ } else {
+ assert(prediction_angle < 270);
+ const int dy =
+ GetDirectionalIntraPredictorDerivative(270 - prediction_angle);
+ dsp_.directional_intra_predictor_zone3(dest, stride, left_column, width,
+ height, dy, upsampled_left);
+ }
+}
+
+template <typename Pixel>
+void Tile::PalettePrediction(const Block& block, const Plane plane,
+ const int start_x, const int start_y, const int x,
+ const int y, const TransformSize tx_size) {
+ const int tx_width = kTransformWidth[tx_size];
+ const int tx_height = kTransformHeight[tx_size];
+ const uint16_t* const palette =
+ block.bp->prediction_parameters->palette_mode_info.color[plane];
+ const PlaneType plane_type = GetPlaneType(plane);
+ const int x4 = MultiplyBy4(x);
+ const int y4 = MultiplyBy4(y);
+ Array2DView<Pixel> buffer(buffer_[plane].rows(),
+ buffer_[plane].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+ for (int row = 0; row < tx_height; ++row) {
+ assert(block.bp->prediction_parameters
+ ->color_index_map[plane_type][y4 + row] != nullptr);
+ for (int column = 0; column < tx_width; ++column) {
+ buffer[start_y + row][start_x + column] =
+ palette[block.bp->prediction_parameters
+ ->color_index_map[plane_type][y4 + row][x4 + column]];
+ }
+ }
+}
+
+template void Tile::PalettePrediction<uint8_t>(
+ const Block& block, const Plane plane, const int start_x, const int start_y,
+ const int x, const int y, const TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::PalettePrediction<uint16_t>(
+ const Block& block, const Plane plane, const int start_x, const int start_y,
+ const int x, const int y, const TransformSize tx_size);
+#endif
+
+template <typename Pixel>
+void Tile::ChromaFromLumaPrediction(const Block& block, const Plane plane,
+ const int start_x, const int start_y,
+ const TransformSize tx_size) {
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ Array2DView<Pixel> y_buffer(
+ buffer_[kPlaneY].rows(), buffer_[kPlaneY].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[kPlaneY][0][0]));
+ if (!block.scratch_buffer->cfl_luma_buffer_valid) {
+ const int luma_x = start_x << subsampling_x;
+ const int luma_y = start_y << subsampling_y;
+ dsp_.cfl_subsamplers[tx_size][subsampling_x + subsampling_y](
+ block.scratch_buffer->cfl_luma_buffer,
+ prediction_parameters.max_luma_width - luma_x,
+ prediction_parameters.max_luma_height - luma_y,
+ reinterpret_cast<uint8_t*>(&y_buffer[luma_y][luma_x]),
+ buffer_[kPlaneY].columns());
+ block.scratch_buffer->cfl_luma_buffer_valid = true;
+ }
+ Array2DView<Pixel> buffer(buffer_[plane].rows(),
+ buffer_[plane].columns() / sizeof(Pixel),
+ reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+ dsp_.cfl_intra_predictors[tx_size](
+ reinterpret_cast<uint8_t*>(&buffer[start_y][start_x]),
+ buffer_[plane].columns(), block.scratch_buffer->cfl_luma_buffer,
+ (plane == kPlaneU) ? prediction_parameters.cfl_alpha_u
+ : prediction_parameters.cfl_alpha_v);
+}
+
+template void Tile::ChromaFromLumaPrediction<uint8_t>(
+ const Block& block, const Plane plane, const int start_x, const int start_y,
+ const TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::ChromaFromLumaPrediction<uint16_t>(
+ const Block& block, const Plane plane, const int start_x, const int start_y,
+ const TransformSize tx_size);
+#endif
+
+void Tile::InterIntraPrediction(
+ uint16_t* const prediction_0, const uint8_t* const prediction_mask,
+ const ptrdiff_t prediction_mask_stride,
+ const PredictionParameters& prediction_parameters,
+ const int prediction_width, const int prediction_height,
+ const int subsampling_x, const int subsampling_y, uint8_t* const dest,
+ const ptrdiff_t dest_stride) {
+ assert(prediction_mask != nullptr);
+ assert(prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeIntra ||
+ prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeWedge);
+ // The first buffer of InterIntra is from inter prediction.
+ // The second buffer is from intra prediction.
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (sequence_header_.color_config.bitdepth > 8) {
+ GetMaskBlendFunc(dsp_, /*is_inter_intra=*/true,
+ prediction_parameters.is_wedge_inter_intra, subsampling_x,
+ subsampling_y)(
+ prediction_0, reinterpret_cast<uint16_t*>(dest),
+ dest_stride / sizeof(uint16_t), prediction_mask, prediction_mask_stride,
+ prediction_width, prediction_height, dest, dest_stride);
+ return;
+ }
+#endif
+ const int function_index = prediction_parameters.is_wedge_inter_intra
+ ? subsampling_x + subsampling_y
+ : 0;
+ // |is_inter_intra| prediction values are stored in a Pixel buffer but it is
+ // currently declared as a uint16_t buffer.
+ // TODO(johannkoenig): convert the prediction buffer to a uint8_t buffer and
+ // remove the reinterpret_cast.
+ dsp_.inter_intra_mask_blend_8bpp[function_index](
+ reinterpret_cast<uint8_t*>(prediction_0), dest, dest_stride,
+ prediction_mask, prediction_mask_stride, prediction_width,
+ prediction_height);
+}
+
+void Tile::CompoundInterPrediction(
+ const Block& block, const uint8_t* const prediction_mask,
+ const ptrdiff_t prediction_mask_stride, const int prediction_width,
+ const int prediction_height, const int subsampling_x,
+ const int subsampling_y, const int candidate_row,
+ const int candidate_column, uint8_t* dest, const ptrdiff_t dest_stride) {
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+
+ void* prediction[2];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ if (bitdepth > 8) {
+ prediction[0] = block.scratch_buffer->prediction_buffer[0];
+ prediction[1] = block.scratch_buffer->prediction_buffer[1];
+ } else {
+#endif
+ prediction[0] = block.scratch_buffer->compound_prediction_buffer_8bpp[0];
+ prediction[1] = block.scratch_buffer->compound_prediction_buffer_8bpp[1];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ }
+#endif
+
+ switch (prediction_parameters.compound_prediction_type) {
+ case kCompoundPredictionTypeWedge:
+ case kCompoundPredictionTypeDiffWeighted:
+ GetMaskBlendFunc(dsp_, /*is_inter_intra=*/false,
+ prediction_parameters.is_wedge_inter_intra,
+ subsampling_x, subsampling_y)(
+ prediction[0], prediction[1],
+ /*prediction_stride=*/prediction_width, prediction_mask,
+ prediction_mask_stride, prediction_width, prediction_height, dest,
+ dest_stride);
+ break;
+ case kCompoundPredictionTypeDistance:
+ DistanceWeightedPrediction(prediction[0], prediction[1], prediction_width,
+ prediction_height, candidate_row,
+ candidate_column, dest, dest_stride);
+ break;
+ default:
+ assert(prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeAverage);
+ dsp_.average_blend(prediction[0], prediction[1], prediction_width,
+ prediction_height, dest, dest_stride);
+ break;
+ }
+}
+
+GlobalMotion* Tile::GetWarpParams(
+ const Block& block, const Plane plane, const int prediction_width,
+ const int prediction_height,
+ const PredictionParameters& prediction_parameters,
+ const ReferenceFrameType reference_type, bool* const is_local_valid,
+ GlobalMotion* const global_motion_params,
+ GlobalMotion* const local_warp_params) const {
+ if (prediction_width < 8 || prediction_height < 8 ||
+ frame_header_.force_integer_mv == 1) {
+ return nullptr;
+ }
+ if (plane == kPlaneY) {
+ *is_local_valid =
+ prediction_parameters.motion_mode == kMotionModeLocalWarp &&
+ WarpEstimation(
+ prediction_parameters.num_warp_samples, DivideBy4(prediction_width),
+ DivideBy4(prediction_height), block.row4x4, block.column4x4,
+ block.bp->mv.mv[0], prediction_parameters.warp_estimate_candidates,
+ local_warp_params) &&
+ SetupShear(local_warp_params);
+ }
+ if (prediction_parameters.motion_mode == kMotionModeLocalWarp &&
+ *is_local_valid) {
+ return local_warp_params;
+ }
+ if (!IsScaled(reference_type)) {
+ GlobalMotionTransformationType global_motion_type =
+ (reference_type != kReferenceFrameIntra)
+ ? global_motion_params->type
+ : kNumGlobalMotionTransformationTypes;
+ const bool is_global_valid =
+ IsGlobalMvBlock(*block.bp, global_motion_type) &&
+ SetupShear(global_motion_params);
+ // Valid global motion type implies reference type can't be intra.
+ assert(!is_global_valid || reference_type != kReferenceFrameIntra);
+ if (is_global_valid) return global_motion_params;
+ }
+ return nullptr;
+}
+
+bool Tile::InterPrediction(const Block& block, const Plane plane, const int x,
+ const int y, const int prediction_width,
+ const int prediction_height, int candidate_row,
+ int candidate_column, bool* const is_local_valid,
+ GlobalMotion* const local_warp_params) {
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ const BlockParameters& bp = *block.bp;
+ const BlockParameters& bp_reference =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ const bool is_compound =
+ bp_reference.reference_frame[1] > kReferenceFrameIntra;
+ assert(bp.is_inter);
+ const bool is_inter_intra = bp.reference_frame[1] == kReferenceFrameIntra;
+
+ const PredictionParameters& prediction_parameters =
+ *block.bp->prediction_parameters;
+ uint8_t* const dest = GetStartPoint(buffer_, plane, x, y, bitdepth);
+ const ptrdiff_t dest_stride = buffer_[plane].columns(); // In bytes.
+ for (int index = 0; index < 1 + static_cast<int>(is_compound); ++index) {
+ const ReferenceFrameType reference_type =
+ bp_reference.reference_frame[index];
+ GlobalMotion global_motion_params =
+ frame_header_.global_motion[reference_type];
+ GlobalMotion* warp_params =
+ GetWarpParams(block, plane, prediction_width, prediction_height,
+ prediction_parameters, reference_type, is_local_valid,
+ &global_motion_params, local_warp_params);
+ if (warp_params != nullptr) {
+ if (!BlockWarpProcess(block, plane, index, x, y, prediction_width,
+ prediction_height, warp_params, is_compound,
+ is_inter_intra, dest, dest_stride)) {
+ return false;
+ }
+ } else {
+ const int reference_index =
+ prediction_parameters.use_intra_block_copy
+ ? -1
+ : frame_header_.reference_frame_index[reference_type -
+ kReferenceFrameLast];
+ if (!BlockInterPrediction(
+ block, plane, reference_index, bp_reference.mv.mv[index], x, y,
+ prediction_width, prediction_height, candidate_row,
+ candidate_column, block.scratch_buffer->prediction_buffer[index],
+ is_compound, is_inter_intra, dest, dest_stride)) {
+ return false;
+ }
+ }
+ }
+
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ ptrdiff_t prediction_mask_stride = 0;
+ const uint8_t* prediction_mask = nullptr;
+ if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeWedge) {
+ const Array2D<uint8_t>& wedge_mask =
+ wedge_masks_[GetWedgeBlockSizeIndex(block.size)]
+ [prediction_parameters.wedge_sign]
+ [prediction_parameters.wedge_index];
+ prediction_mask = wedge_mask[0];
+ prediction_mask_stride = wedge_mask.columns();
+ } else if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeIntra) {
+    // 7.11.3.13. The inter intra masks are precomputed and stored as a set of
+    // lookup tables.
+ assert(prediction_parameters.inter_intra_mode < kNumInterIntraModes);
+ prediction_mask =
+ kInterIntraMasks[prediction_parameters.inter_intra_mode]
+ [GetInterIntraMaskLookupIndex(prediction_width)]
+ [GetInterIntraMaskLookupIndex(prediction_height)];
+ prediction_mask_stride = prediction_width;
+ } else if (prediction_parameters.compound_prediction_type ==
+ kCompoundPredictionTypeDiffWeighted) {
+ if (plane == kPlaneY) {
+ assert(prediction_width >= 8);
+ assert(prediction_height >= 8);
+ dsp_.weight_mask[FloorLog2(prediction_width) - 3]
+ [FloorLog2(prediction_height) - 3]
+ [static_cast<int>(prediction_parameters.mask_is_inverse)](
+ block.scratch_buffer->prediction_buffer[0],
+ block.scratch_buffer->prediction_buffer[1],
+ block.scratch_buffer->weight_mask, block.width);
+ }
+ prediction_mask = block.scratch_buffer->weight_mask;
+ prediction_mask_stride = block.width;
+ }
+
+ if (is_compound) {
+ CompoundInterPrediction(block, prediction_mask, prediction_mask_stride,
+ prediction_width, prediction_height, subsampling_x,
+ subsampling_y, candidate_row, candidate_column,
+ dest, dest_stride);
+ } else if (prediction_parameters.motion_mode == kMotionModeObmc) {
+ // Obmc mode is allowed only for single reference (!is_compound).
+ return ObmcPrediction(block, plane, prediction_width, prediction_height);
+ } else if (is_inter_intra) {
+ // InterIntra and obmc must be mutually exclusive.
+ InterIntraPrediction(
+ block.scratch_buffer->prediction_buffer[0], prediction_mask,
+ prediction_mask_stride, prediction_parameters, prediction_width,
+ prediction_height, subsampling_x, subsampling_y, dest, dest_stride);
+ }
+ return true;
+}
+
+bool Tile::ObmcBlockPrediction(const Block& block, const MotionVector& mv,
+ const Plane plane,
+ const int reference_frame_index, const int width,
+ const int height, const int x, const int y,
+ const int candidate_row,
+ const int candidate_column,
+ const ObmcDirection blending_direction) {
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ // Obmc's prediction needs to be clipped before blending with above/left
+ // prediction blocks.
+ // Obmc prediction is used only when is_compound is false. So it is safe to
+ // use prediction_buffer[1] as a temporary buffer for the Obmc prediction.
+ static_assert(sizeof(block.scratch_buffer->prediction_buffer[1]) >=
+ 64 * 64 * sizeof(uint16_t),
+ "");
+ auto* const obmc_buffer =
+ reinterpret_cast<uint8_t*>(block.scratch_buffer->prediction_buffer[1]);
+ const ptrdiff_t obmc_buffer_stride =
+ (bitdepth == 8) ? width : width * sizeof(uint16_t);
+ if (!BlockInterPrediction(block, plane, reference_frame_index, mv, x, y,
+ width, height, candidate_row, candidate_column,
+ nullptr, false, false, obmc_buffer,
+ obmc_buffer_stride)) {
+ return false;
+ }
+
+ uint8_t* const prediction = GetStartPoint(buffer_, plane, x, y, bitdepth);
+ const ptrdiff_t prediction_stride = buffer_[plane].columns();
+ dsp_.obmc_blend[blending_direction](prediction, prediction_stride, width,
+ height, obmc_buffer, obmc_buffer_stride);
+ return true;
+}
+
+bool Tile::ObmcPrediction(const Block& block, const Plane plane,
+ const int width, const int height) {
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ if (block.top_available[kPlaneY] &&
+ !IsBlockSmallerThan8x8(block.residual_size[plane])) {
+ const int num_limit = std::min(uint8_t{4}, k4x4WidthLog2[block.size]);
+ const int column4x4_max =
+ std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4);
+ const int candidate_row = block.row4x4 - 1;
+ const int block_start_y = MultiplyBy4(block.row4x4) >> subsampling_y;
+ int column4x4 = block.column4x4;
+ const int prediction_height = std::min(height >> 1, 32 >> subsampling_y);
+ for (int i = 0, step; i < num_limit && column4x4 < column4x4_max;
+ column4x4 += step) {
+ const int candidate_column = column4x4 | 1;
+ const BlockParameters& bp_top =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ const int candidate_block_size = bp_top.size;
+ step = Clip3(kNum4x4BlocksWide[candidate_block_size], 2, 16);
+ if (bp_top.reference_frame[0] > kReferenceFrameIntra) {
+ i++;
+ const int candidate_reference_frame_index =
+ frame_header_.reference_frame_index[bp_top.reference_frame[0] -
+ kReferenceFrameLast];
+ const int prediction_width =
+ std::min(width, MultiplyBy4(step) >> subsampling_x);
+ if (!ObmcBlockPrediction(
+ block, bp_top.mv.mv[0], plane, candidate_reference_frame_index,
+ prediction_width, prediction_height,
+ MultiplyBy4(column4x4) >> subsampling_x, block_start_y,
+ candidate_row, candidate_column, kObmcDirectionVertical)) {
+ return false;
+ }
+ }
+ }
+ }
+
+ if (block.left_available[kPlaneY]) {
+ const int num_limit = std::min(uint8_t{4}, k4x4HeightLog2[block.size]);
+ const int row4x4_max =
+ std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4);
+ const int candidate_column = block.column4x4 - 1;
+ int row4x4 = block.row4x4;
+ const int block_start_x = MultiplyBy4(block.column4x4) >> subsampling_x;
+ const int prediction_width = std::min(width >> 1, 32 >> subsampling_x);
+ for (int i = 0, step; i < num_limit && row4x4 < row4x4_max;
+ row4x4 += step) {
+ const int candidate_row = row4x4 | 1;
+ const BlockParameters& bp_left =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ const int candidate_block_size = bp_left.size;
+ step = Clip3(kNum4x4BlocksHigh[candidate_block_size], 2, 16);
+ if (bp_left.reference_frame[0] > kReferenceFrameIntra) {
+ i++;
+ const int candidate_reference_frame_index =
+ frame_header_.reference_frame_index[bp_left.reference_frame[0] -
+ kReferenceFrameLast];
+ const int prediction_height =
+ std::min(height, MultiplyBy4(step) >> subsampling_y);
+ if (!ObmcBlockPrediction(
+ block, bp_left.mv.mv[0], plane, candidate_reference_frame_index,
+ prediction_width, prediction_height, block_start_x,
+ MultiplyBy4(row4x4) >> subsampling_y, candidate_row,
+ candidate_column, kObmcDirectionHorizontal)) {
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+}
+
+void Tile::DistanceWeightedPrediction(void* prediction_0, void* prediction_1,
+ const int width, const int height,
+ const int candidate_row,
+ const int candidate_column, uint8_t* dest,
+ ptrdiff_t dest_stride) {
+ int distance[2];
+ int weight[2];
+ for (int reference = 0; reference < 2; ++reference) {
+ const BlockParameters& bp =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ // Note: distance[0] and distance[1] correspond to relative distance
+ // between current frame and reference frame [1] and [0], respectively.
+ distance[1 - reference] = std::min(
+ std::abs(static_cast<int>(
+ current_frame_.reference_info()
+ ->relative_distance_from[bp.reference_frame[reference]])),
+ static_cast<int>(kMaxFrameDistance));
+ }
+ GetDistanceWeights(distance, weight);
+
+ dsp_.distance_weighted_blend(prediction_0, prediction_1, weight[0], weight[1],
+ width, height, dest, dest_stride);
+}
+
+void Tile::ScaleMotionVector(const MotionVector& mv, const Plane plane,
+ const int reference_frame_index, const int x,
+ const int y, int* const start_x,
+ int* const start_y, int* const step_x,
+ int* const step_y) {
+ const int reference_upscaled_width =
+ (reference_frame_index == -1)
+ ? frame_header_.upscaled_width
+ : reference_frames_[reference_frame_index]->upscaled_width();
+ const int reference_height =
+ (reference_frame_index == -1)
+ ? frame_header_.height
+ : reference_frames_[reference_frame_index]->frame_height();
+ assert(2 * frame_header_.width >= reference_upscaled_width &&
+ 2 * frame_header_.height >= reference_height &&
+ frame_header_.width <= 16 * reference_upscaled_width &&
+ frame_header_.height <= 16 * reference_height);
+ const bool is_scaled_x = reference_upscaled_width != frame_header_.width;
+ const bool is_scaled_y = reference_height != frame_header_.height;
+ const int half_sample = 1 << (kSubPixelBits - 1);
+ int orig_x = (x << kSubPixelBits) + ((2 * mv.mv[1]) >> subsampling_x_[plane]);
+ int orig_y = (y << kSubPixelBits) + ((2 * mv.mv[0]) >> subsampling_y_[plane]);
+ const int rounding_offset =
+ DivideBy2(1 << (kScaleSubPixelBits - kSubPixelBits));
+ if (is_scaled_x) {
+ const int scale_x = ((reference_upscaled_width << kReferenceScaleShift) +
+ DivideBy2(frame_header_.width)) /
+ frame_header_.width;
+ *step_x = RightShiftWithRoundingSigned(
+ scale_x, kReferenceScaleShift - kScaleSubPixelBits);
+ orig_x += half_sample;
+ // When frame size is 4k and above, orig_x can be above 16 bits, scale_x can
+ // be up to 15 bits. So we use int64_t to hold base_x.
+ const int64_t base_x = static_cast<int64_t>(orig_x) * scale_x -
+ (half_sample << kReferenceScaleShift);
+ *start_x =
+ RightShiftWithRoundingSigned(
+ base_x, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
+ rounding_offset;
+ } else {
+ *step_x = 1 << kScaleSubPixelBits;
+ *start_x = LeftShift(orig_x, 6) + rounding_offset;
+ }
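+  // For instance (illustrative sizes): a reference frame twice as wide as the
+  // current frame gives scale_x of roughly 2 << kReferenceScaleShift, so
+  // *step_x is roughly 2 << kScaleSubPixelBits, i.e. each destination pixel
+  // advances about two reference pixels in the high-precision step units.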
+ if (is_scaled_y) {
+ const int scale_y = ((reference_height << kReferenceScaleShift) +
+ DivideBy2(frame_header_.height)) /
+ frame_header_.height;
+ *step_y = RightShiftWithRoundingSigned(
+ scale_y, kReferenceScaleShift - kScaleSubPixelBits);
+ orig_y += half_sample;
+ const int64_t base_y = static_cast<int64_t>(orig_y) * scale_y -
+ (half_sample << kReferenceScaleShift);
+ *start_y =
+ RightShiftWithRoundingSigned(
+ base_y, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
+ rounding_offset;
+ } else {
+ *step_y = 1 << kScaleSubPixelBits;
+ *start_y = LeftShift(orig_y, 6) + rounding_offset;
+ }
+}
+
+// static.
+bool Tile::GetReferenceBlockPosition(
+ const int reference_frame_index, const bool is_scaled, const int width,
+ const int height, const int ref_start_x, const int ref_last_x,
+ const int ref_start_y, const int ref_last_y, const int start_x,
+ const int start_y, const int step_x, const int step_y,
+ const int left_border, const int right_border, const int top_border,
+ const int bottom_border, int* ref_block_start_x, int* ref_block_start_y,
+ int* ref_block_end_x, int* ref_block_end_y) {
+ *ref_block_start_x = GetPixelPositionFromHighScale(start_x, 0, 0);
+ *ref_block_start_y = GetPixelPositionFromHighScale(start_y, 0, 0);
+ if (reference_frame_index == -1) {
+ return false;
+ }
+ *ref_block_start_x -= kConvolveBorderLeftTop;
+ *ref_block_start_y -= kConvolveBorderLeftTop;
+ *ref_block_end_x = GetPixelPositionFromHighScale(start_x, step_x, width - 1) +
+ kConvolveBorderRight;
+ *ref_block_end_y =
+ GetPixelPositionFromHighScale(start_y, step_y, height - 1) +
+ kConvolveBorderBottom;
+ if (is_scaled) {
+ const int block_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ kSubPixelTaps;
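+    // For example, assuming kSubPixelTaps == 8 (the 8-tap filter length):
+    // with height == 16 and step_y == 2048 (a 2x vertical scale),
+    // block_height = ((15 * 2048 + 1023) >> 10) + 8 = 30 + 8 = 38 rows.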
+ *ref_block_end_x += kConvolveScaleBorderRight - kConvolveBorderRight;
+ *ref_block_end_y = *ref_block_start_y + block_height - 1;
+ }
+ // Determines if we need to extend beyond the left/right/top/bottom border.
+ return *ref_block_start_x < (ref_start_x - left_border) ||
+ *ref_block_end_x > (ref_last_x + right_border) ||
+ *ref_block_start_y < (ref_start_y - top_border) ||
+ *ref_block_end_y > (ref_last_y + bottom_border);
+}
+
+// Builds a block as the input for convolve by copying the content of the
+// reference frame (either a decoded reference frame or the current frame).
+// |block_extended_width| is the combined width of the block and its borders.
+template <typename Pixel>
+void Tile::BuildConvolveBlock(
+ const Plane plane, const int reference_frame_index, const bool is_scaled,
+ const int height, const int ref_start_x, const int ref_last_x,
+ const int ref_start_y, const int ref_last_y, const int step_y,
+ const int ref_block_start_x, const int ref_block_end_x,
+ const int ref_block_start_y, uint8_t* block_buffer,
+ ptrdiff_t convolve_buffer_stride, ptrdiff_t block_extended_width) {
+ const YuvBuffer* const reference_buffer =
+ (reference_frame_index == -1)
+ ? current_frame_.buffer()
+ : reference_frames_[reference_frame_index]->buffer();
+ Array2DView<const Pixel> reference_block(
+ reference_buffer->height(plane),
+ reference_buffer->stride(plane) / sizeof(Pixel),
+ reinterpret_cast<const Pixel*>(reference_buffer->data(plane)));
+ auto* const block_head = reinterpret_cast<Pixel*>(block_buffer);
+ convolve_buffer_stride /= sizeof(Pixel);
+ int block_height = height + kConvolveBorderLeftTop + kConvolveBorderBottom;
+ if (is_scaled) {
+ block_height = (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ kSubPixelTaps;
+ }
+ const int copy_start_x = Clip3(ref_block_start_x, ref_start_x, ref_last_x);
+ const int copy_start_y = Clip3(ref_block_start_y, ref_start_y, ref_last_y);
+ const int copy_end_x = Clip3(ref_block_end_x, copy_start_x, ref_last_x);
+ const int block_width = copy_end_x - copy_start_x + 1;
+ const bool extend_left = ref_block_start_x < ref_start_x;
+ const bool extend_right = ref_block_end_x > ref_last_x;
+ const bool out_of_left = copy_start_x > ref_block_end_x;
+ const bool out_of_right = copy_end_x < ref_block_start_x;
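+  // |out_of_left| (|out_of_right|) means the block lies entirely to the left
+  // (right) of the visible frame; in that case every row below is filled
+  // with the single nearest edge pixel.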
+ if (out_of_left || out_of_right) {
+ const int ref_x = out_of_left ? copy_start_x : copy_end_x;
+ Pixel* buf_ptr = block_head;
+ for (int y = 0, ref_y = copy_start_y; y < block_height; ++y) {
+ Memset(buf_ptr, reference_block[ref_y][ref_x], block_extended_width);
+ if (ref_block_start_y + y >= ref_start_y &&
+ ref_block_start_y + y < ref_last_y) {
+ ++ref_y;
+ }
+ buf_ptr += convolve_buffer_stride;
+ }
+ } else {
+ Pixel* buf_ptr = block_head;
+ const int left_width = copy_start_x - ref_block_start_x;
+ for (int y = 0, ref_y = copy_start_y; y < block_height; ++y) {
+ if (extend_left) {
+ Memset(buf_ptr, reference_block[ref_y][copy_start_x], left_width);
+ }
+ memcpy(buf_ptr + left_width, &reference_block[ref_y][copy_start_x],
+ block_width * sizeof(Pixel));
+ if (extend_right) {
+ Memset(buf_ptr + left_width + block_width,
+ reference_block[ref_y][copy_end_x],
+ block_extended_width - left_width - block_width);
+ }
+ if (ref_block_start_y + y >= ref_start_y &&
+ ref_block_start_y + y < ref_last_y) {
+ ++ref_y;
+ }
+ buf_ptr += convolve_buffer_stride;
+ }
+ }
+}
+
+bool Tile::BlockInterPrediction(
+ const Block& block, const Plane plane, const int reference_frame_index,
+ const MotionVector& mv, const int x, const int y, const int width,
+ const int height, const int candidate_row, const int candidate_column,
+ uint16_t* const prediction, const bool is_compound,
+ const bool is_inter_intra, uint8_t* const dest,
+ const ptrdiff_t dest_stride) {
+ const BlockParameters& bp =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ int start_x;
+ int start_y;
+ int step_x;
+ int step_y;
+ ScaleMotionVector(mv, plane, reference_frame_index, x, y, &start_x, &start_y,
+ &step_x, &step_y);
+ const int horizontal_filter_index = bp.interpolation_filter[1];
+ const int vertical_filter_index = bp.interpolation_filter[0];
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ // reference_frame_index equal to -1 indicates using current frame as
+ // reference.
+ const YuvBuffer* const reference_buffer =
+ (reference_frame_index == -1)
+ ? current_frame_.buffer()
+ : reference_frames_[reference_frame_index]->buffer();
+ const int reference_upscaled_width =
+ (reference_frame_index == -1)
+ ? MultiplyBy4(frame_header_.columns4x4)
+ : reference_frames_[reference_frame_index]->upscaled_width();
+ const int reference_height =
+ (reference_frame_index == -1)
+ ? MultiplyBy4(frame_header_.rows4x4)
+ : reference_frames_[reference_frame_index]->frame_height();
+ const int ref_start_x = 0;
+ const int ref_last_x =
+ SubsampledValue(reference_upscaled_width, subsampling_x) - 1;
+ const int ref_start_y = 0;
+ const int ref_last_y = SubsampledValue(reference_height, subsampling_y) - 1;
+
+ const bool is_scaled = (reference_frame_index != -1) &&
+ (frame_header_.width != reference_upscaled_width ||
+ frame_header_.height != reference_height);
+ const int bitdepth = sequence_header_.color_config.bitdepth;
+ const int pixel_size = (bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+ int ref_block_start_x;
+ int ref_block_start_y;
+ int ref_block_end_x;
+ int ref_block_end_y;
+ const bool extend_block = GetReferenceBlockPosition(
+ reference_frame_index, is_scaled, width, height, ref_start_x, ref_last_x,
+ ref_start_y, ref_last_y, start_x, start_y, step_x, step_y,
+ reference_buffer->left_border(plane),
+ reference_buffer->right_border(plane),
+ reference_buffer->top_border(plane),
+ reference_buffer->bottom_border(plane), &ref_block_start_x,
+ &ref_block_start_y, &ref_block_end_x, &ref_block_end_y);
+
+  // In frame parallel mode, ensure that the reference block has been decoded
+  // and is available for referencing.
+ if (reference_frame_index != -1 && frame_parallel_) {
+    // For U and V planes with subsampling, we need to multiply the value of
+    // ref_block_end_y by 2 since we only track the progress of the Y plane.
+ const int reference_y_max = LeftShift(
+ std::min(ref_block_end_y + kSubPixelTaps, ref_last_y), subsampling_y);
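+    // The |kSubPixelTaps| term is a conservative margin for the extra rows
+    // read by the interpolation filter.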
+ if (reference_frame_progress_cache_[reference_frame_index] <
+ reference_y_max &&
+ !reference_frames_[reference_frame_index]->WaitUntil(
+ reference_y_max,
+ &reference_frame_progress_cache_[reference_frame_index])) {
+ return false;
+ }
+ }
+
+ const uint8_t* block_start = nullptr;
+ ptrdiff_t convolve_buffer_stride;
+ if (!extend_block) {
+ const YuvBuffer* const reference_buffer =
+ (reference_frame_index == -1)
+ ? current_frame_.buffer()
+ : reference_frames_[reference_frame_index]->buffer();
+ convolve_buffer_stride = reference_buffer->stride(plane);
+ if (reference_frame_index == -1 || is_scaled) {
+ block_start = reference_buffer->data(plane) +
+ ref_block_start_y * reference_buffer->stride(plane) +
+ ref_block_start_x * pixel_size;
+ } else {
+ block_start = reference_buffer->data(plane) +
+ (ref_block_start_y + kConvolveBorderLeftTop) *
+ reference_buffer->stride(plane) +
+ (ref_block_start_x + kConvolveBorderLeftTop) * pixel_size;
+ }
+ } else {
+ const int border_right =
+ is_scaled ? kConvolveScaleBorderRight : kConvolveBorderRight;
+    // Due to scaling, the block width can be at most twice the current
+    // block's width.
+ auto block_extended_width = Align<ptrdiff_t>(
+ (2 * width + kConvolveBorderLeftTop + border_right) * pixel_size,
+ kMaxAlignment);
+ convolve_buffer_stride = block.scratch_buffer->convolve_block_buffer_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) {
+ BuildConvolveBlock<uint16_t>(
+ plane, reference_frame_index, is_scaled, height, ref_start_x,
+ ref_last_x, ref_start_y, ref_last_y, step_y, ref_block_start_x,
+ ref_block_end_x, ref_block_start_y,
+ block.scratch_buffer->convolve_block_buffer.get(),
+ convolve_buffer_stride, block_extended_width);
+ } else {
+#endif
+ BuildConvolveBlock<uint8_t>(
+ plane, reference_frame_index, is_scaled, height, ref_start_x,
+ ref_last_x, ref_start_y, ref_last_y, step_y, ref_block_start_x,
+ ref_block_end_x, ref_block_start_y,
+ block.scratch_buffer->convolve_block_buffer.get(),
+ convolve_buffer_stride, block_extended_width);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ }
+#endif
+ block_start = block.scratch_buffer->convolve_block_buffer.get() +
+ (is_scaled ? 0
+ : kConvolveBorderLeftTop * convolve_buffer_stride +
+ kConvolveBorderLeftTop * pixel_size);
+ }
+
+ void* const output =
+ (is_compound || is_inter_intra) ? prediction : static_cast<void*>(dest);
+ ptrdiff_t output_stride = (is_compound || is_inter_intra)
+ ? /*prediction_stride=*/width
+ : dest_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ // |is_inter_intra| calculations are written to the |prediction| buffer.
+ // Unlike the |is_compound| calculations the output is Pixel and not uint16_t.
+ // convolve_func() expects |output_stride| to be in bytes and not Pixels.
+ // |prediction_stride| is in units of uint16_t. Adjust |output_stride| to
+ // account for this.
+ if (is_inter_intra && sequence_header_.color_config.bitdepth > 8) {
+ output_stride *= 2;
+ }
+#endif
+ assert(output != nullptr);
+ if (is_scaled) {
+ dsp::ConvolveScaleFunc convolve_func = dsp_.convolve_scale[is_compound];
+ assert(convolve_func != nullptr);
+
+ convolve_func(block_start, convolve_buffer_stride, horizontal_filter_index,
+ vertical_filter_index, start_x, start_y, step_x, step_y,
+ width, height, output, output_stride);
+ } else {
+ const int horizontal_filter_id = (start_x >> 6) & kSubPixelMask;
+ const int vertical_filter_id = (start_y >> 6) & kSubPixelMask;
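+    // |start_x| and |start_y| are in 1/1024th-pel units; shifting by 6
+    // yields the 1/16th-pel phase (assuming kScaleSubPixelBits == 10 and
+    // kSubPixelBits == 4). A phase of 0 requires no filtering in that
+    // direction, hence the "!= 0" indexing of the convolve table below.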
+
+ dsp::ConvolveFunc convolve_func =
+ dsp_.convolve[reference_frame_index == -1][is_compound]
+ [vertical_filter_id != 0][horizontal_filter_id != 0];
+ assert(convolve_func != nullptr);
+
+ convolve_func(block_start, convolve_buffer_stride, horizontal_filter_index,
+ vertical_filter_index, horizontal_filter_id,
+ vertical_filter_id, width, height, output, output_stride);
+ }
+ return true;
+}
+
+bool Tile::BlockWarpProcess(const Block& block, const Plane plane,
+ const int index, const int block_start_x,
+ const int block_start_y, const int width,
+ const int height, GlobalMotion* const warp_params,
+ const bool is_compound, const bool is_inter_intra,
+ uint8_t* const dest, const ptrdiff_t dest_stride) {
+ assert(width >= 8 && height >= 8);
+ const BlockParameters& bp = *block.bp;
+ const int reference_frame_index =
+ frame_header_.reference_frame_index[bp.reference_frame[index] -
+ kReferenceFrameLast];
+ const uint8_t* const source =
+ reference_frames_[reference_frame_index]->buffer()->data(plane);
+ ptrdiff_t source_stride =
+ reference_frames_[reference_frame_index]->buffer()->stride(plane);
+ const int source_width =
+ reference_frames_[reference_frame_index]->buffer()->width(plane);
+ const int source_height =
+ reference_frames_[reference_frame_index]->buffer()->height(plane);
+ uint16_t* const prediction = block.scratch_buffer->prediction_buffer[index];
+
+  // In frame parallel mode, ensure that the reference block has been decoded
+  // and is available for referencing.
+ if (frame_parallel_) {
+ int reference_y_max = -1;
+ // Find out the maximum y-coordinate for warping.
+ for (int start_y = block_start_y; start_y < block_start_y + height;
+ start_y += 8) {
+ for (int start_x = block_start_x; start_x < block_start_x + width;
+ start_x += 8) {
+ const int src_x = (start_x + 4) << subsampling_x_[plane];
+ const int src_y = (start_y + 4) << subsampling_y_[plane];
+ const int64_t dst_y =
+ src_x * warp_params->params[4] +
+ static_cast<int64_t>(src_y) * warp_params->params[5] +
+ warp_params->params[1];
+ const int64_t y4 = dst_y >> subsampling_y_[plane];
+ const int iy4 = static_cast<int>(y4 >> kWarpedModelPrecisionBits);
+ reference_y_max = std::max(iy4 + 8, reference_y_max);
+ }
+ }
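+    // |iy4| above is the integer source row of each warped 8x8 block center
+    // (the model output is in kWarpedModelPrecisionBits == 16 fixed point);
+    // the +8 roughly covers the rows read below it by the warp filter.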
+    // For U and V planes with subsampling, we need to multiply reference_y_max
+    // by 2 since we only track the progress of the Y plane.
+ reference_y_max = LeftShift(reference_y_max, subsampling_y_[plane]);
+ if (reference_frame_progress_cache_[reference_frame_index] <
+ reference_y_max &&
+ !reference_frames_[reference_frame_index]->WaitUntil(
+ reference_y_max,
+ &reference_frame_progress_cache_[reference_frame_index])) {
+ return false;
+ }
+ }
+ if (is_compound) {
+ dsp_.warp_compound(source, source_stride, source_width, source_height,
+ warp_params->params, subsampling_x_[plane],
+ subsampling_y_[plane], block_start_x, block_start_y,
+ width, height, warp_params->alpha, warp_params->beta,
+ warp_params->gamma, warp_params->delta, prediction,
+ /*prediction_stride=*/width);
+ } else {
+ void* const output = is_inter_intra ? static_cast<void*>(prediction) : dest;
+ ptrdiff_t output_stride =
+ is_inter_intra ? /*prediction_stride=*/width : dest_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ // |is_inter_intra| calculations are written to the |prediction| buffer.
+ // Unlike the |is_compound| calculations the output is Pixel and not
+ // uint16_t. warp_clip() expects |output_stride| to be in bytes and not
+ // Pixels. |prediction_stride| is in units of uint16_t. Adjust
+ // |output_stride| to account for this.
+ if (is_inter_intra && sequence_header_.color_config.bitdepth > 8) {
+ output_stride *= 2;
+ }
+#endif
+ dsp_.warp(source, source_stride, source_width, source_height,
+ warp_params->params, subsampling_x_[plane], subsampling_y_[plane],
+ block_start_x, block_start_y, width, height, warp_params->alpha,
+ warp_params->beta, warp_params->gamma, warp_params->delta, output,
+ output_stride);
+ }
+ return true;
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/tile.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <numeric>
+#include <type_traits>
+#include <utility>
+
+#include "src/frame_scratch_buffer.h"
+#include "src/motion_vector.h"
+#include "src/reconstruction.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/stack.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/scan_tables.inc"
+
+// Range above kNumQuantizerBaseLevels for which the exponential Golomb
+// coding process is activated.
+constexpr int kQuantizerCoefficientBaseRange = 12;
+constexpr int kNumQuantizerBaseLevels = 2;
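+// kCoeffBaseRangeMaxIterations below: with kCoeffBaseRangeSymbolCount == 4
+// (BR_CDF_SIZE in the spec), it works out to 12 / 3 == 4 reads of coeff_br.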
+constexpr int kCoeffBaseRangeMaxIterations =
+ kQuantizerCoefficientBaseRange / (kCoeffBaseRangeSymbolCount - 1);
+constexpr int kEntropyContextLeft = 0;
+constexpr int kEntropyContextTop = 1;
+
+constexpr uint8_t kAllZeroContextsByTopLeft[5][5] = {{1, 2, 2, 2, 3},
+ {2, 4, 4, 4, 5},
+ {2, 4, 4, 4, 5},
+ {2, 4, 4, 4, 5},
+ {3, 5, 5, 5, 6}};
+
+// The space complexity of DFS is O(branching_factor * max_depth). For the
+// parameter tree, branching_factor = 4 (there could be up to 4 children for
+// every node) and max_depth (excluding the root) = 5 (to go from a 128x128
+// block all the way to a 4x4 block). The worst-case stack size is 16, by
+// counting the number of 'o' nodes in the diagram:
+//
+// | 128x128 The highest level (corresponding to the
+// | root of the tree) has no node in the stack.
+// |-----------------+
+// | | | |
+// | o o o 64x64
+// |
+// |-----------------+
+// | | | |
+// | o o o 32x32 Higher levels have three nodes in the stack,
+// | because we pop one node off the stack before
+// |-----------------+ pushing its four children onto the stack.
+// | | | |
+// | o o o 16x16
+// |
+// |-----------------+
+// | | | |
+// | o o o 8x8
+// |
+// |-----------------+
+// | | | |
+// o o o o 4x4 Only the lowest level has four nodes in the
+// stack.
+constexpr int kDfsStackSize = 16;
+
+// Mask indicating whether the transform sets contain a particular transform
+// type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set.
+constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = {
+ BitMaskSet(0x1), BitMaskSet(0xE0F), BitMaskSet(0x20F),
+ BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)};
+
+constexpr PredictionMode
+ kFilterIntraModeToIntraPredictor[kNumFilterIntraPredictors] = {
+ kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
+ kPredictionModeD157, kPredictionModeDc};
+
+// Mask used to determine the index for mode_deltas lookup.
+constexpr BitMaskSet kPredictionModeDeltasMask(
+ kPredictionModeNearestMv, kPredictionModeNearMv, kPredictionModeNewMv,
+ kPredictionModeNearestNearestMv, kPredictionModeNearNearMv,
+ kPredictionModeNearestNewMv, kPredictionModeNewNearestMv,
+ kPredictionModeNearNewMv, kPredictionModeNewNearMv,
+ kPredictionModeNewNewMv);
+
+// This is computed as:
+// min(transform_width_log2, 5) + min(transform_height_log2, 5) - 4.
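+// For example, kTransformSize4x4 gives min(2, 5) + min(2, 5) - 4 = 0 and
+// kTransformSize64x64 gives min(6, 5) + min(6, 5) - 4 = 6.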
+constexpr uint8_t kEobMultiSizeLookup[kNumTransformSizes] = {
+ 0, 1, 2, 1, 2, 3, 4, 2, 3, 4, 5, 5, 4, 5, 6, 6, 5, 6, 6};
+
+/* clang-format off */
+constexpr uint8_t kCoeffBaseContextOffset[kNumTransformSizes][5][5] = {
+ {{0, 1, 6, 6, 0}, {1, 6, 6, 21, 0}, {6, 6, 21, 21, 0}, {6, 21, 21, 21, 0},
+ {0, 0, 0, 0, 0}},
+ {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
+ {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
+ {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
+ {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
+ {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+ {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+ {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+ {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}};
+/* clang-format on */
+
+// The table size is extended from 3 to 16 by repeating the last element, so
+// that row or column indices need not be clamped.
+constexpr uint8_t kCoeffBasePositionContextOffset[16] = {
+ 26, 31, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36};
+
+constexpr PredictionMode kInterIntraToIntraMode[kNumInterIntraModes] = {
+ kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
+ kPredictionModeSmooth};
+
+// Number of horizontal luma samples before intra block copy can be used.
+constexpr int kIntraBlockCopyDelayPixels = 256;
+// Number of 64 by 64 blocks before intra block copy can be used.
+constexpr int kIntraBlockCopyDelay64x64Blocks = kIntraBlockCopyDelayPixels / 64;
+
+// Index [i][j] corresponds to the transform size of width 1 << (i + 2) and
+// height 1 << (j + 2).
+constexpr TransformSize k4x4SizeToTransformSize[5][5] = {
+ {kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kNumTransformSizes, kNumTransformSizes},
+ {kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kNumTransformSizes},
+ {kTransformSize16x4, kTransformSize16x8, kTransformSize16x16,
+ kTransformSize16x32, kTransformSize16x64},
+ {kNumTransformSizes, kTransformSize32x8, kTransformSize32x16,
+ kTransformSize32x32, kTransformSize32x64},
+ {kNumTransformSizes, kNumTransformSizes, kTransformSize64x16,
+ kTransformSize64x32, kTransformSize64x64}};
+
+// Defined in section 9.3 of the spec.
+constexpr TransformType kModeToTransformType[kIntraPredictionModesUV] = {
+ kTransformTypeDctDct, kTransformTypeDctAdst, kTransformTypeAdstDct,
+ kTransformTypeDctDct, kTransformTypeAdstAdst, kTransformTypeDctAdst,
+ kTransformTypeAdstDct, kTransformTypeAdstDct, kTransformTypeDctAdst,
+ kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct,
+ kTransformTypeAdstAdst, kTransformTypeDctDct};
+
+// Defined in section 5.11.47 of the spec. This array does not contain an entry
+// for kTransformSetDctOnly, so the first dimension needs to be
+// |kNumTransformSets| - 1.
+constexpr TransformType kInverseTransformTypeBySet[kNumTransformSets - 1][16] =
+ {{kTransformTypeIdentityIdentity, kTransformTypeDctDct,
+ kTransformTypeIdentityDct, kTransformTypeDctIdentity,
+ kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
+ {kTransformTypeIdentityIdentity, kTransformTypeDctDct,
+ kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
+ {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
+ kTransformTypeDctIdentity, kTransformTypeIdentityAdst,
+ kTransformTypeAdstIdentity, kTransformTypeIdentityFlipadst,
+ kTransformTypeFlipadstIdentity, kTransformTypeDctDct,
+ kTransformTypeDctAdst, kTransformTypeAdstDct, kTransformTypeDctFlipadst,
+ kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
+ kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
+ kTransformTypeAdstFlipadst},
+ {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
+ kTransformTypeDctIdentity, kTransformTypeDctDct, kTransformTypeDctAdst,
+ kTransformTypeAdstDct, kTransformTypeDctFlipadst,
+ kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
+ kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
+ kTransformTypeAdstFlipadst},
+ {kTransformTypeIdentityIdentity, kTransformTypeDctDct}};
+
+// Replaces all occurrences of 64x* and *x64 with 32x* and *x32 respectively.
+constexpr TransformSize kAdjustedTransformSize[kNumTransformSizes] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32};
+
+// This is the same as the Max_Tx_Size_Rect array in the spec but with *x64
+// and 64x* transforms replaced with *x32 and 32x* respectively.
+constexpr TransformSize kUVTransformSize[kMaxBlockSizes] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32, kTransformSize32x32, kTransformSize32x32,
+ kTransformSize32x32};
+
+// The ith entry of this array is computed as:
+// DivideBy2(TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[i]) +
+// TransformSizeToSquareTransformIndex(kTransformSizeSquareMax[i]) +
+// 1)
+constexpr uint8_t kTransformSizeContext[kNumTransformSizes] = {
+ 0, 1, 1, 1, 1, 2, 2, 1, 2, 2, 3, 3, 2, 3, 3, 4, 3, 4, 4};
+
+constexpr int8_t kSgrProjDefaultMultiplier[2] = {-32, 31};
+
+constexpr int8_t kWienerDefaultFilter[kNumWienerCoefficients] = {3, -7, 15};
+
+// Maps compound prediction modes into single modes. For example,
+// kPredictionModeNearestNewMv maps to kPredictionModeNearestMv for index 0
+// and kPredictionModeNewMv for index 1. It is used to simplify the logic in
+// AssignMv (and avoid duplicate code). This is section 5.11.30 of the spec.
+constexpr PredictionMode
+ kCompoundToSinglePredictionMode[kNumCompoundInterPredictionModes][2] = {
+ {kPredictionModeNearestMv, kPredictionModeNearestMv},
+ {kPredictionModeNearMv, kPredictionModeNearMv},
+ {kPredictionModeNearestMv, kPredictionModeNewMv},
+ {kPredictionModeNewMv, kPredictionModeNearestMv},
+ {kPredictionModeNearMv, kPredictionModeNewMv},
+ {kPredictionModeNewMv, kPredictionModeNearMv},
+ {kPredictionModeGlobalMv, kPredictionModeGlobalMv},
+ {kPredictionModeNewMv, kPredictionModeNewMv},
+};
+PredictionMode GetSinglePredictionMode(int index, PredictionMode y_mode) {
+ if (y_mode < kPredictionModeNearestNearestMv) {
+ return y_mode;
+ }
+ const int lookup_index = y_mode - kPredictionModeNearestNearestMv;
+ assert(lookup_index >= 0);
+ return kCompoundToSinglePredictionMode[lookup_index][index];
+}
+
+// log2(dqDenom) in section 7.12.3 of the spec. We use the log2 value because
+// dqDenom is always a power of two and hence a right shift can be used
+// instead of division.
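+// For example, kTransformSize32x32 has shift 1 (dqDenom == 2) and
+// kTransformSize64x64 has shift 2 (dqDenom == 4).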
+constexpr uint8_t kQuantizationShift[kNumTransformSizes] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 2, 1, 2, 2};
+
+// Returns the minimum of |length| and |max| - |start|. This is used to clamp
+// array indices when accessing arrays whose bound is equal to |max|.
+int GetNumElements(int length, int start, int max) {
+ return std::min(length, max - start);
+}
+
+template <typename T>
+void SetBlockValues(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+ // Specialize all columns cases (values in kTransformWidth4x4[]) for better
+ // performance.
+ switch (columns) {
+ case 1:
+ MemSetBlock<T>(rows, 1, value, dst, stride);
+ break;
+ case 2:
+ MemSetBlock<T>(rows, 2, value, dst, stride);
+ break;
+ case 4:
+ MemSetBlock<T>(rows, 4, value, dst, stride);
+ break;
+ case 8:
+ MemSetBlock<T>(rows, 8, value, dst, stride);
+ break;
+ default:
+ assert(columns == 16);
+ MemSetBlock<T>(rows, 16, value, dst, stride);
+ break;
+ }
+}
+
+void SetTransformType(const Tile::Block& block, int x4, int y4, int w4, int h4,
+ TransformType tx_type,
+ TransformType transform_types[32][32]) {
+ const int y_offset = y4 - block.row4x4;
+ const int x_offset = x4 - block.column4x4;
+ TransformType* const dst = &transform_types[y_offset][x_offset];
+ SetBlockValues<TransformType>(h4, w4, tx_type, dst, 32);
+}
+
+void StoreMotionFieldMvs(ReferenceFrameType reference_frame_to_store,
+ const MotionVector& mv_to_store, ptrdiff_t stride,
+ int rows, int columns,
+ ReferenceFrameType* reference_frame_row_start,
+ MotionVector* mv) {
+ static_assert(sizeof(*reference_frame_row_start) == sizeof(int8_t), "");
+ do {
+ // Don't switch the following two memory setting functions.
+ // Some ARM CPUs are quite sensitive to the order.
+ memset(reference_frame_row_start, reference_frame_to_store, columns);
+ std::fill(mv, mv + columns, mv_to_store);
+ reference_frame_row_start += stride;
+ mv += stride;
+ } while (--rows != 0);
+}
+
+// The inverse transform process assumes that the quantized coefficients are
+// stored as a virtual 2D array of size |tx_width| x |tx_height|. If the
+// transform width is 64, this assumption is broken because the scan order
+// used for populating the coefficients for such transforms is the same as
+// the one used for the corresponding transform with width 32 (e.g. the scan
+// order used for 64x16 is the same as the one used for 32x16). So we must
+// restore the coefficients to their correct positions and clean the
+// positions they occupied.
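+// For example, for a 64x16 transform (clamped_tx_height == 16), rows 1..15
+// of the 32-wide layout are moved to rows 1..15 of the 64-wide layout
+// (cleaning the positions they occupied); row 0 is already in place.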
+template <typename ResidualType>
+void MoveCoefficientsForTxWidth64(int clamped_tx_height, int tx_width,
+ ResidualType* residual) {
+ if (tx_width != 64) return;
+ const int rows = clamped_tx_height - 2;
+ auto* src = residual + 32 * rows;
+ residual += 64 * rows;
+  // Process 2 rows per iteration, iterating in reverse order to avoid
+  // overwriting.
+ int x = rows >> 1;
+ do {
+ // The 2 rows can be processed in order.
+ memcpy(residual, src, 32 * sizeof(src[0]));
+ memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
+ memset(src + 32, 0, 32 * sizeof(src[0]));
+ src -= 64;
+ residual -= 128;
+ } while (--x);
+ // Process the second row. The first row is already correct.
+ memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
+ memset(src + 32, 0, 32 * sizeof(src[0]));
+}
+
+void GetClampParameters(const Tile::Block& block, int min[2], int max[2]) {
+ // 7.10.2.14 (part 1). (also contains implementations of 5.11.53
+ // and 5.11.54).
+ constexpr int kMvBorder4x4 = 4;
+ const int row_border = kMvBorder4x4 + block.height4x4;
+ const int column_border = kMvBorder4x4 + block.width4x4;
+ const int macroblocks_to_top_edge = -block.row4x4;
+ const int macroblocks_to_bottom_edge =
+ block.tile.frame_header().rows4x4 - block.height4x4 - block.row4x4;
+ const int macroblocks_to_left_edge = -block.column4x4;
+ const int macroblocks_to_right_edge =
+ block.tile.frame_header().columns4x4 - block.width4x4 - block.column4x4;
+ min[0] = MultiplyBy32(macroblocks_to_top_edge - row_border);
+ min[1] = MultiplyBy32(macroblocks_to_left_edge - column_border);
+ max[0] = MultiplyBy32(macroblocks_to_bottom_edge + row_border);
+ max[1] = MultiplyBy32(macroblocks_to_right_edge + column_border);
+}
+
+// Section 8.3.2 in the spec, under coeff_base_eob.
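+// For example, for a 16x16 transform (256 coefficients), index 0 maps to
+// context 0, indices 1..32 to context 1, indices 33..64 to context 2, and
+// larger indices to context 3.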
+int GetCoeffBaseContextEob(TransformSize tx_size, int index) {
+ if (index == 0) return 0;
+ const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
+ const int tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
+ const int tx_height = kTransformHeight[adjusted_tx_size];
+ if (index <= DivideBy8(tx_height << tx_width_log2)) return 1;
+ if (index <= DivideBy4(tx_height << tx_width_log2)) return 2;
+ return 3;
+}
+
+// Section 8.3.2 in the spec, under coeff_br. Optimized for end of block based
+// on the fact that {0, 1}, {1, 0}, {1, 1}, {0, 2} and {2, 0} will all be 0 in
+// the end of block case.
+int GetCoeffBaseRangeContextEob(int adjusted_tx_width_log2, int pos,
+ TransformClass tx_class) {
+ if (pos == 0) return 0;
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ const int row = pos >> adjusted_tx_width_log2;
+ const int column = pos & (tx_width - 1);
+ // This return statement is equivalent to:
+ // return ((tx_class == kTransformClass2D && (row | column) < 2) ||
+ // (tx_class == kTransformClassHorizontal && column == 0) ||
+ // (tx_class == kTransformClassVertical && row == 0))
+ // ? 7
+ // : 14;
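+  // The branchless form relies on the enum values (kTransformClass2D == 0,
+  // kTransformClassHorizontal == 1, kTransformClassVertical == 2): |tx_class
+  // & 1| and |(tx_class >> 1) & 1| select the horizontal and vertical tests.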
+ return 14 >> ((static_cast<int>(tx_class == kTransformClass2D) &
+ static_cast<int>((row | column) < 2)) |
+ (tx_class & static_cast<int>(column == 0)) |
+ ((tx_class >> 1) & static_cast<int>(row == 0)));
+}
+
+} // namespace
+
+Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ RefCountedBuffer* const current_frame, const DecoderState& state,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ const WedgeMaskArray& wedge_masks,
+ const QuantizerMatrix& quantizer_matrix,
+ SymbolDecoderContext* const saved_symbol_decoder_context,
+ const SegmentationMap* prev_segment_ids,
+ PostFilter* const post_filter, const dsp::Dsp* const dsp,
+ ThreadPool* const thread_pool,
+ BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
+ bool use_intra_prediction_buffer)
+ : number_(tile_number),
+ row_(number_ / frame_header.tile_info.tile_columns),
+ column_(number_ % frame_header.tile_info.tile_columns),
+ data_(data),
+ size_(size),
+ read_deltas_(false),
+ subsampling_x_{0, sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_x},
+ subsampling_y_{0, sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.subsampling_y},
+ current_quantizer_index_(frame_header.quantizer.base_index),
+ sequence_header_(sequence_header),
+ frame_header_(frame_header),
+ reference_frame_sign_bias_(state.reference_frame_sign_bias),
+ reference_frames_(state.reference_frame),
+ motion_field_(frame_scratch_buffer->motion_field),
+ reference_order_hint_(state.reference_order_hint),
+ wedge_masks_(wedge_masks),
+ quantizer_matrix_(quantizer_matrix),
+ reader_(data_, size_, frame_header_.enable_cdf_update),
+ symbol_decoder_context_(frame_scratch_buffer->symbol_decoder_context),
+ saved_symbol_decoder_context_(saved_symbol_decoder_context),
+ prev_segment_ids_(prev_segment_ids),
+ dsp_(*dsp),
+ post_filter_(*post_filter),
+ block_parameters_holder_(frame_scratch_buffer->block_parameters_holder),
+ quantizer_(sequence_header_.color_config.bitdepth,
+ &frame_header_.quantizer),
+ residual_size_((sequence_header_.color_config.bitdepth == 8)
+ ? sizeof(int16_t)
+ : sizeof(int32_t)),
+ intra_block_copy_lag_(
+ frame_header_.allow_intrabc
+ ? (sequence_header_.use_128x128_superblock ? 3 : 5)
+ : 1),
+ current_frame_(*current_frame),
+ cdef_index_(frame_scratch_buffer->cdef_index),
+ cdef_skip_(frame_scratch_buffer->cdef_skip),
+ inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
+ thread_pool_(thread_pool),
+ residual_buffer_pool_(frame_scratch_buffer->residual_buffer_pool.get()),
+ tile_scratch_buffer_pool_(
+ &frame_scratch_buffer->tile_scratch_buffer_pool),
+ pending_tiles_(pending_tiles),
+ frame_parallel_(frame_parallel),
+ use_intra_prediction_buffer_(use_intra_prediction_buffer),
+ intra_prediction_buffer_(
+ use_intra_prediction_buffer_
+ ? &frame_scratch_buffer->intra_prediction_buffers.get()[row_]
+ : nullptr) {
+ row4x4_start_ = frame_header.tile_info.tile_row_start[row_];
+ row4x4_end_ = frame_header.tile_info.tile_row_start[row_ + 1];
+ column4x4_start_ = frame_header.tile_info.tile_column_start[column_];
+ column4x4_end_ = frame_header.tile_info.tile_column_start[column_ + 1];
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ const int block_width4x4_log2 = k4x4HeightLog2[SuperBlockSize()];
+ superblock_rows_ =
+ (row4x4_end_ - row4x4_start_ + block_width4x4 - 1) >> block_width4x4_log2;
+ superblock_columns_ =
+ (column4x4_end_ - column4x4_start_ + block_width4x4 - 1) >>
+ block_width4x4_log2;
+ // If |split_parse_and_decode_| is true, we do the necessary setup for
+ // splitting the parsing and the decoding steps. This is done in the following
+ // two cases:
+ // 1) If there is multi-threading within a tile (this is done if
+ // |thread_pool_| is not nullptr and if there are at least as many
+ // superblock columns as |intra_block_copy_lag_|).
+ // 2) If |frame_parallel| is true.
+ split_parse_and_decode_ = (thread_pool_ != nullptr &&
+ superblock_columns_ > intra_block_copy_lag_) ||
+ frame_parallel;
+ if (frame_parallel_) {
+ reference_frame_progress_cache_.fill(INT_MIN);
+ }
+ memset(delta_lf_, 0, sizeof(delta_lf_));
+ delta_lf_all_zero_ = true;
+ const YuvBuffer& buffer = post_filter_.frame_buffer();
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+ // Verify that the borders are big enough for Reconstruct(). max_tx_length
+ // is the maximum value of tx_width and tx_height for the plane.
+ const int max_tx_length = (plane == kPlaneY) ? 64 : 32;
+ // Reconstruct() may overwrite on the right. Since the right border of a
+ // row is followed in memory by the left border of the next row, the
+ // number of extra pixels to the right of a row is at least the sum of the
+ // left and right borders.
+ //
+ // Note: This assertion actually checks the sum of the left and right
+ // borders of post_filter_.GetUnfilteredBuffer(), which is a horizontally
+ // and vertically shifted version of |buffer|. Since the sum of the left and
+ // right borders is not changed by the shift, we can just check the sum of
+ // the left and right borders of |buffer|.
+ assert(buffer.left_border(plane) + buffer.right_border(plane) >=
+ max_tx_length - 1);
+ // Reconstruct() may overwrite on the bottom. We need an extra border row
+ // on the bottom because we need the left border of that row.
+ //
+ // Note: This assertion checks the bottom border of
+ // post_filter_.GetUnfilteredBuffer(). So we need to calculate the vertical
+ // shift that the PostFilter constructor applied to |buffer| and reduce the
+ // bottom border by that amount.
+#ifndef NDEBUG
+ const int vertical_shift = static_cast<int>(
+ (post_filter_.GetUnfilteredBuffer(plane) - buffer.data(plane)) /
+ buffer.stride(plane));
+ const int bottom_border = buffer.bottom_border(plane) - vertical_shift;
+ assert(bottom_border >= max_tx_length);
+#endif
+ // In AV1, a transform block of height H starts at a y coordinate that is
+ // a multiple of H. If a transform block at the bottom of the frame has
+ // height H, then Reconstruct() will write up to the row with index
+ // Align(buffer.height(plane), H) - 1. Therefore the maximum number of
+ // rows Reconstruct() may write to is
+ // Align(buffer.height(plane), max_tx_length).
+ buffer_[plane].Reset(Align(buffer.height(plane), max_tx_length),
+ buffer.stride(plane),
+ post_filter_.GetUnfilteredBuffer(plane));
+ }
+}
+
+bool Tile::Init() {
+ assert(coefficient_levels_.size() == dc_categories_.size());
+ for (size_t i = 0; i < coefficient_levels_.size(); ++i) {
+ const int contexts_per_plane = (i == kEntropyContextLeft)
+ ? frame_header_.rows4x4
+ : frame_header_.columns4x4;
+ if (!coefficient_levels_[i].Reset(PlaneCount(), contexts_per_plane)) {
+ LIBGAV1_DLOG(ERROR, "coefficient_levels_[%zu].Reset() failed.", i);
+ return false;
+ }
+ if (!dc_categories_[i].Reset(PlaneCount(), contexts_per_plane)) {
+ LIBGAV1_DLOG(ERROR, "dc_categories_[%zu].Reset() failed.", i);
+ return false;
+ }
+ }
+ if (split_parse_and_decode_) {
+ assert(residual_buffer_pool_ != nullptr);
+ if (!residual_buffer_threaded_.Reset(superblock_rows_, superblock_columns_,
+ /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR, "residual_buffer_threaded_.Reset() failed.");
+ return false;
+ }
+ } else {
+ // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary
+ // checks when parsing quantized coefficients.
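+    // (4096 == 64 * 64 covers the residual values of the largest transform.)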
+ residual_buffer_ = MakeAlignedUniquePtr<uint8_t>(
+ 32, (4096 + 32 * kResidualPaddingVertical) * residual_size_);
+ if (residual_buffer_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Allocation of residual_buffer_ failed.");
+ return false;
+ }
+ prediction_parameters_.reset(new (std::nothrow) PredictionParameters());
+ if (prediction_parameters_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Allocation of prediction_parameters_ failed.");
+ return false;
+ }
+ }
+ if (frame_header_.use_ref_frame_mvs) {
+ assert(sequence_header_.enable_order_hint);
+ SetupMotionField(frame_header_, current_frame_, reference_frames_,
+ row4x4_start_, row4x4_end_, column4x4_start_,
+ column4x4_end_, &motion_field_);
+ }
+ ResetLoopRestorationParams();
+ if (!top_context_.Resize(superblock_columns_)) {
+ LIBGAV1_DLOG(ERROR, "Allocation of top_context_ failed.");
+ return false;
+ }
+ return true;
+}
+
+template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
+bool Tile::ProcessSuperBlockRow(int row4x4,
+ TileScratchBuffer* const scratch_buffer) {
+ if (row4x4 < row4x4_start_ || row4x4 >= row4x4_end_) return true;
+ assert(scratch_buffer != nullptr);
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_;
+ column4x4 += block_width4x4) {
+ if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer,
+ processing_mode)) {
+ LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d",
+ row4x4, column4x4);
+ return false;
+ }
+ }
+ if (save_symbol_decoder_context && row4x4 + block_width4x4 >= row4x4_end_) {
+ SaveSymbolDecoderContext();
+ }
+ if (processing_mode == kProcessingModeDecodeOnly ||
+ processing_mode == kProcessingModeParseAndDecode) {
+ PopulateIntraPredictionBuffer(row4x4);
+ }
+ return true;
+}
+
+// Used in frame parallel mode. The symbol decoder context need not be saved
+// in this case since that was already done when parsing completed.
+template bool Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ int row4x4, TileScratchBuffer* scratch_buffer);
+// Used in non frame parallel mode.
+template bool Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ int row4x4, TileScratchBuffer* scratch_buffer);
+
+void Tile::SaveSymbolDecoderContext() {
+ if (frame_header_.enable_frame_end_update_cdf &&
+ number_ == frame_header_.tile_info.context_update_id) {
+ *saved_symbol_decoder_context_ = symbol_decoder_context_;
+ }
+}
+
+bool Tile::ParseAndDecode() {
+ if (split_parse_and_decode_) {
+ if (!ThreadedParseAndDecode()) return false;
+ SaveSymbolDecoderContext();
+ return true;
+ }
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ pending_tiles_->Decrement(false);
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
+ row4x4 += block_width4x4) {
+ if (!ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ row4x4, scratch_buffer.get())) {
+ pending_tiles_->Decrement(false);
+ return false;
+ }
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ pending_tiles_->Decrement(true);
+ return true;
+}
+
+bool Tile::Parse() {
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
+ row4x4 += block_width4x4) {
+ if (!ProcessSuperBlockRow<kProcessingModeParseOnly, false>(
+ row4x4, scratch_buffer.get())) {
+ return false;
+ }
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ SaveSymbolDecoderContext();
+ return true;
+}
+
+bool Tile::Decode(
+ std::mutex* const mutex, int* const superblock_row_progress,
+ std::condition_variable* const superblock_row_progress_condvar) {
+ const int block_width4x4 = sequence_header_.use_128x128_superblock ? 32 : 16;
+ const int block_width4x4_log2 =
+ sequence_header_.use_128x128_superblock ? 5 : 4;
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ for (int row4x4 = row4x4_start_, index = row4x4_start_ >> block_width4x4_log2;
+ row4x4 < row4x4_end_; row4x4 += block_width4x4, ++index) {
+ if (!ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ row4x4, scratch_buffer.get())) {
+ return false;
+ }
+ if (post_filter_.DoDeblock()) {
+ // Apply vertical deblock filtering for all the columns in this tile
+ // except for the first 64 columns.
+ post_filter_.ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4,
+ column4x4_start_ + kNum4x4InLoopFilterUnit, column4x4_end_,
+ block_width4x4);
+ // If this is the first superblock row of the tile, then we cannot apply
+ // horizontal deblocking here since we don't know if the top row is
+ // available. So it will be done by the calling thread in that case.
+ if (row4x4 != row4x4_start_) {
+ // Apply horizontal deblock filtering for all the columns in this tile
+ // except for the first and the last 64 columns.
+ // Note about the last tile of each row: For the last tile,
+ // column4x4_end may not be a multiple of 16. In that case it is still
+ // okay to simply subtract 16 since ApplyDeblockFilter() will only do
+ // the filters in increments of 64 columns (or 32 columns for chroma
+ // with subsampling).
+ post_filter_.ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4,
+ column4x4_start_ + kNum4x4InLoopFilterUnit,
+ column4x4_end_ - kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ }
+ bool notify;
+ {
+ std::unique_lock<std::mutex> lock(*mutex);
+ notify = ++superblock_row_progress[index] ==
+ frame_header_.tile_info.tile_columns;
+ }
+ if (notify) {
+ // We are done decoding this superblock row. Notify the post filtering
+ // thread.
+ superblock_row_progress_condvar[index].notify_one();
+ }
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ return true;
+}
+
+bool Tile::ThreadedParseAndDecode() {
+ {
+ std::lock_guard<std::mutex> lock(threading_.mutex);
+ if (!threading_.sb_state.Reset(superblock_rows_, superblock_columns_)) {
+ pending_tiles_->Decrement(false);
+ LIBGAV1_DLOG(ERROR, "threading.sb_state.Reset() failed.");
+ return false;
+ }
+ // Account for the parsing job.
+ ++threading_.pending_jobs;
+ }
+
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+
+ // Begin parsing.
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ pending_tiles_->Decrement(false);
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ for (int row4x4 = row4x4_start_, row_index = 0; row4x4 < row4x4_end_;
+ row4x4 += block_width4x4, ++row_index) {
+ for (int column4x4 = column4x4_start_, column_index = 0;
+ column4x4 < column4x4_end_;
+ column4x4 += block_width4x4, ++column_index) {
+ if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
+ kProcessingModeParseOnly)) {
+ std::lock_guard<std::mutex> lock(threading_.mutex);
+ threading_.abort = true;
+ break;
+ }
+ std::unique_lock<std::mutex> lock(threading_.mutex);
+ if (threading_.abort) break;
+ threading_.sb_state[row_index][column_index] = kSuperBlockStateParsed;
+ // Schedule the decoding of this superblock if it is allowed.
+ if (CanDecode(row_index, column_index)) {
+ ++threading_.pending_jobs;
+ threading_.sb_state[row_index][column_index] =
+ kSuperBlockStateScheduled;
+ lock.unlock();
+ thread_pool_->Schedule(
+ [this, row_index, column_index, block_width4x4]() {
+ DecodeSuperBlock(row_index, column_index, block_width4x4);
+ });
+ }
+ }
+ std::lock_guard<std::mutex> lock(threading_.mutex);
+ if (threading_.abort) break;
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+
+ // We are done parsing. We can return here since the calling thread will make
+ // sure that it waits for all the superblocks to be decoded.
+ //
+ // Finish using |threading_| before |pending_tiles_->Decrement()| because the
+ // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
+ // is called.
+ threading_.mutex.lock();
+ const bool no_pending_jobs = (--threading_.pending_jobs == 0);
+ const bool job_succeeded = !threading_.abort;
+ threading_.mutex.unlock();
+ if (no_pending_jobs) {
+ // We are done parsing and decoding this tile.
+ pending_tiles_->Decrement(job_succeeded);
+ }
+ return job_succeeded;
+}
+
+bool Tile::CanDecode(int row_index, int column_index) const {
+ assert(row_index >= 0);
+ assert(column_index >= 0);
+ // If |threading_.sb_state[row_index][column_index]| is not equal to
+ // kSuperBlockStateParsed, then return false. This is ok because if
+ // |threading_.sb_state[row_index][column_index]| is equal to:
+ // kSuperBlockStateNone - then the superblock is not yet parsed.
+ // kSuperBlockStateScheduled - then the superblock is already scheduled for
+ // decode.
+ // kSuperBlockStateDecoded - then the superblock has already been decoded.
+ if (row_index >= superblock_rows_ || column_index >= superblock_columns_ ||
+ threading_.sb_state[row_index][column_index] != kSuperBlockStateParsed) {
+ return false;
+ }
+ // First superblock has no dependencies.
+ if (row_index == 0 && column_index == 0) {
+ return true;
+ }
+  // Superblocks in the first row depend only on the superblock to their
+  // left.
+ if (row_index == 0) {
+ return threading_.sb_state[0][column_index - 1] == kSuperBlockStateDecoded;
+ }
+  // All other superblocks depend on the superblock to their left (if one
+  // exists) and the superblock to the top-right with a lag of
+  // |intra_block_copy_lag_| (if one exists).
+ const int top_right_column_index =
+ std::min(column_index + intra_block_copy_lag_, superblock_columns_ - 1);
+ return threading_.sb_state[row_index - 1][top_right_column_index] ==
+ kSuperBlockStateDecoded &&
+ (column_index == 0 ||
+ threading_.sb_state[row_index][column_index - 1] ==
+ kSuperBlockStateDecoded);
+}
+
+void Tile::DecodeSuperBlock(int row_index, int column_index,
+ int block_width4x4) {
+ const int row4x4 = row4x4_start_ + (row_index * block_width4x4);
+ const int column4x4 = column4x4_start_ + (column_index * block_width4x4);
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ bool ok = scratch_buffer != nullptr;
+ if (ok) {
+ ok = ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
+ kProcessingModeDecodeOnly);
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ }
+ std::unique_lock<std::mutex> lock(threading_.mutex);
+ if (ok) {
+ threading_.sb_state[row_index][column_index] = kSuperBlockStateDecoded;
+    // Candidate superblocks whose decoding could potentially begin now (if
+    // it is allowed to do so). The candidates are:
+    // 1) The superblock to the bottom-left of the current superblock with a
+    // lag of |intra_block_copy_lag_| (or the beginning of the next superblock
+    // row in case there are fewer than |intra_block_copy_lag_| superblock
+    // columns in the Tile).
+ // 2) The superblock to the right of the current superblock.
+ const int candidate_row_indices[] = {row_index + 1, row_index};
+ const int candidate_column_indices[] = {
+ std::max(0, column_index - intra_block_copy_lag_), column_index + 1};
+ for (size_t i = 0; i < std::extent<decltype(candidate_row_indices)>::value;
+ ++i) {
+ const int candidate_row_index = candidate_row_indices[i];
+ const int candidate_column_index = candidate_column_indices[i];
+ if (!CanDecode(candidate_row_index, candidate_column_index)) {
+ continue;
+ }
+ ++threading_.pending_jobs;
+ threading_.sb_state[candidate_row_index][candidate_column_index] =
+ kSuperBlockStateScheduled;
+ lock.unlock();
+ thread_pool_->Schedule([this, candidate_row_index, candidate_column_index,
+ block_width4x4]() {
+ DecodeSuperBlock(candidate_row_index, candidate_column_index,
+ block_width4x4);
+ });
+ lock.lock();
+ }
+ } else {
+ threading_.abort = true;
+ }
+ // Finish using |threading_| before |pending_tiles_->Decrement()| because the
+ // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
+ // is called.
+ const bool no_pending_jobs = (--threading_.pending_jobs == 0);
+ const bool job_succeeded = !threading_.abort;
+ lock.unlock();
+ if (no_pending_jobs) {
+ // We are done parsing and decoding this tile.
+ pending_tiles_->Decrement(job_succeeded);
+ }
+}
+
+void Tile::PopulateIntraPredictionBuffer(int row4x4) {
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ if (!use_intra_prediction_buffer_ || row4x4 + block_width4x4 >= row4x4_end_) {
+ return;
+ }
+ const size_t pixel_size =
+ (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
+ : sizeof(uint16_t));
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+ const int row_to_copy =
+ (MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1;
+ const size_t pixels_to_copy =
+ (MultiplyBy4(column4x4_end_ - column4x4_start_) >>
+ subsampling_x_[plane]) *
+ pixel_size;
+ const size_t column_start =
+ MultiplyBy4(column4x4_start_) >> subsampling_x_[plane];
+ void* start;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (sequence_header_.color_config.bitdepth > 8) {
+ Array2DView<uint16_t> buffer(
+ buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
+ reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
+ start = &buffer[row_to_copy][column_start];
+ } else // NOLINT
+#endif
+ {
+ start = &buffer_[plane][row_to_copy][column_start];
+ }
+ memcpy((*intra_prediction_buffer_)[plane].get() + column_start * pixel_size,
+ start, pixels_to_copy);
+ }
+}
+
+int Tile::GetTransformAllZeroContext(const Block& block, Plane plane,
+ TransformSize tx_size, int x4, int y4,
+ int w4, int h4) {
+ const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+ const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+
+ const int tx_width = kTransformWidth[tx_size];
+ const int tx_height = kTransformHeight[tx_size];
+ const BlockSize plane_size = block.residual_size[plane];
+ const int block_width = kBlockWidthPixels[plane_size];
+ const int block_height = kBlockHeightPixels[plane_size];
+
+ int top = 0;
+ int left = 0;
+ const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
+ const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
+ if (plane == kPlaneY) {
+ if (block_width == tx_width && block_height == tx_height) return 0;
+ const uint8_t* coefficient_levels =
+ &coefficient_levels_[kEntropyContextTop][plane][x4];
+ for (int i = 0; i < num_top_elements; ++i) {
+ top = std::max(top, static_cast<int>(coefficient_levels[i]));
+ }
+ coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
+ for (int i = 0; i < num_left_elements; ++i) {
+ left = std::max(left, static_cast<int>(coefficient_levels[i]));
+ }
+ assert(top <= 4);
+ assert(left <= 4);
+ // kAllZeroContextsByTopLeft is pre-computed based on the logic in the spec
+ // for top and left.
+ return kAllZeroContextsByTopLeft[top][left];
+ }
+ const uint8_t* coefficient_levels =
+ &coefficient_levels_[kEntropyContextTop][plane][x4];
+ const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
+ for (int i = 0; i < num_top_elements; ++i) {
+ top |= coefficient_levels[i];
+ top |= dc_categories[i];
+ }
+ coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
+ dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
+ for (int i = 0; i < num_left_elements; ++i) {
+ left |= coefficient_levels[i];
+ left |= dc_categories[i];
+ }
+ return static_cast<int>(top != 0) + static_cast<int>(left != 0) + 7 +
+ 3 * static_cast<int>(block_width * block_height >
+ tx_width * tx_height);
+}
+
+TransformSet Tile::GetTransformSet(TransformSize tx_size, bool is_inter) const {
+ const TransformSize tx_size_square_min = kTransformSizeSquareMin[tx_size];
+ const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+ if (tx_size_square_max == kTransformSize64x64) return kTransformSetDctOnly;
+ if (is_inter) {
+ if (frame_header_.reduced_tx_set ||
+ tx_size_square_max == kTransformSize32x32) {
+ return kTransformSetInter3;
+ }
+ if (tx_size_square_min == kTransformSize16x16) return kTransformSetInter2;
+ return kTransformSetInter1;
+ }
+ if (tx_size_square_max == kTransformSize32x32) return kTransformSetDctOnly;
+ if (frame_header_.reduced_tx_set ||
+ tx_size_square_min == kTransformSize16x16) {
+ return kTransformSetIntra2;
+ }
+ return kTransformSetIntra1;
+}
+
+TransformType Tile::ComputeTransformType(const Block& block, Plane plane,
+ TransformSize tx_size, int block_x,
+ int block_y) {
+ const BlockParameters& bp = *block.bp;
+ const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+ if (frame_header_.segmentation
+ .lossless[bp.prediction_parameters->segment_id] ||
+ tx_size_square_max == kTransformSize64x64) {
+ return kTransformTypeDctDct;
+ }
+ if (plane == kPlaneY) {
+ return transform_types_[block_y - block.row4x4][block_x - block.column4x4];
+ }
+ const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
+ TransformType tx_type;
+ if (bp.is_inter) {
+ const int x4 =
+ std::max(block.column4x4, block_x << subsampling_x_[kPlaneU]);
+ const int y4 = std::max(block.row4x4, block_y << subsampling_y_[kPlaneU]);
+ tx_type = transform_types_[y4 - block.row4x4][x4 - block.column4x4];
+ } else {
+ tx_type = kModeToTransformType[bp.prediction_parameters->uv_mode];
+ }
+ return kTransformTypeInSetMask[tx_set].Contains(tx_type)
+ ? tx_type
+ : kTransformTypeDctDct;
+}
+
+void Tile::ReadTransformType(const Block& block, int x4, int y4,
+ TransformSize tx_size) {
+ BlockParameters& bp = *block.bp;
+ const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
+
+ TransformType tx_type = kTransformTypeDctDct;
+ if (tx_set != kTransformSetDctOnly &&
+ frame_header_.segmentation.qindex[bp.prediction_parameters->segment_id] >
+ 0) {
+ const int cdf_index = SymbolDecoderContext::TxTypeIndex(tx_set);
+ const int cdf_tx_size_index =
+ TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[tx_size]);
+ uint16_t* cdf;
+ if (bp.is_inter) {
+ cdf = symbol_decoder_context_
+ .inter_tx_type_cdf[cdf_index][cdf_tx_size_index];
+ switch (tx_set) {
+ case kTransformSetInter1:
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol<16>(cdf));
+ break;
+ case kTransformSetInter2:
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol<12>(cdf));
+ break;
+ default:
+ assert(tx_set == kTransformSetInter3);
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol(cdf));
+ break;
+ }
+ } else {
+ const PredictionMode intra_direction =
+ block.bp->prediction_parameters->use_filter_intra
+ ? kFilterIntraModeToIntraPredictor[block.bp->prediction_parameters
+ ->filter_intra_mode]
+ : bp.y_mode;
+ cdf =
+ symbol_decoder_context_
+ .intra_tx_type_cdf[cdf_index][cdf_tx_size_index][intra_direction];
+ assert(tx_set == kTransformSetIntra1 || tx_set == kTransformSetIntra2);
+ tx_type = static_cast<TransformType>((tx_set == kTransformSetIntra1)
+ ? reader_.ReadSymbol<7>(cdf)
+ : reader_.ReadSymbol<5>(cdf));
+ }
+
+ // This array does not contain an entry for kTransformSetDctOnly, so the
+ // first dimension needs to be offset by 1.
+ tx_type = kInverseTransformTypeBySet[tx_set - 1][tx_type];
+ }
+ SetTransformType(block, x4, y4, kTransformWidth4x4[tx_size],
+ kTransformHeight4x4[tx_size], tx_type, transform_types_);
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// For a coefficient near the right boundary, the two right neighbors and the
+// one bottom-right neighbor may be out of boundary. We don't check the right
+// boundary for them, because the out of boundary neighbors project to positions
+// above the diagonal line which goes through the current coefficient and these
+// positions are still all 0s according to the diagonal scan order.
+template <typename ResidualType>
+void Tile::ReadCoeffBase2D(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ for (int i = eob - 2; i >= 1; --i) {
+ const uint16_t pos = scan[i];
+ const int row = pos >> adjusted_tx_width_log2;
+ const int column = pos & (tx_width - 1);
+ auto* const quantized = &quantized_buffer[pos];
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum = 1 + levels[1] + levels[tx_width] +
+ levels[tx_width + 1] + levels[2] +
+ levels[MultiplyBy2(tx_width)];
+ const int context =
+ ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBaseContextOffset[tx_size][std::min(row, 4)][std::min(column, 4)];
+ int level =
+ reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ int context = std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[tx_width + 1])); // {1, 1}
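+ // 14 >> 1 == 7, so this is equivalent to
+ //   context += ((row | column) < 2) ? 7 : 14;
+ // The same branchless trick is used in the horizontal and vertical variants
+ // below.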
+ context += 14 >> static_cast<int>((row | column) < 2);
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ }
+ // Read position 0.
+ {
+ auto* const quantized = &quantized_buffer[0];
+ int level = reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[0]);
+ level_buffer[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ const int context =
+ std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[tx_width + 1])); // {1, 1}
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ }
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// For a coefficient near the right boundary, the four right neighbors may be
+// out of boundary. We don't do the boundary check for the first three right
+// neighbors, because even for the transform blocks with smallest width 4, the
+// first three out of boundary neighbors project to positions left of the
+// current coefficient and these positions are still all 0s according to the
+// column scan order. However, when the transform block width is 4 and the
+// current coefficient is on the right boundary, its fourth right neighbor
+// projects to the position directly below it on the same column, which could
+// be nonzero. Therefore, we must skip the fourth right neighbor. To keep it
+// simple, we always do the boundary check for the fourth right neighbor of
+// any coefficient.
+template <typename ResidualType>
+void Tile::ReadCoeffBaseHorizontal(
+ const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ int i = eob - 2;
+ do {
+ const uint16_t pos = scan[i];
+ const int column = pos & (tx_width - 1);
+ auto* const quantized = &quantized_buffer[pos];
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum =
+ 1 + (levels[1] + // {0, 1}
+ levels[tx_width] + // {1, 0}
+ levels[2] + // {0, 2}
+ levels[3] + // {0, 3}
+ ((column + 4 < tx_width) ? levels[4] : 0)); // {0, 4}
+ const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBasePositionContextOffset[column];
+ int level =
+ reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ int context = std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[2])); // {0, 2}
+ if (pos != 0) {
+ context += 14 >> static_cast<int>(column == 0);
+ }
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ } while (--i >= 0);
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// Right boundary check is performed explicitly.
+template <typename ResidualType>
+void Tile::ReadCoeffBaseVertical(
+ const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ int i = eob - 2;
+ do {
+ const uint16_t pos = scan[i];
+ const int row = pos >> adjusted_tx_width_log2;
+ const int column = pos & (tx_width - 1);
+ auto* const quantized = &quantized_buffer[pos];
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum =
+ 1 + (((column + 1 < tx_width) ? levels[1] : 0) + // {0, 1}
+ levels[tx_width] + // {1, 0}
+ levels[MultiplyBy2(tx_width)] + // {2, 0}
+ levels[tx_width * 3] + // {3, 0}
+ levels[MultiplyBy4(tx_width)]); // {4, 0}
+ const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBasePositionContextOffset[row];
+ int level =
+ reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0;
+ int context =
+ std::min(6, DivideBy2(1 + quantized_column1 + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[MultiplyBy2(tx_width)])); // {2, 0}
+ if (pos != 0) {
+ context += 14 >> static_cast<int>(row == 0);
+ }
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ } while (--i >= 0);
+}
+
+int Tile::GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane) {
+ const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+ const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
+ // Keep |dc_sign| 8 bits wide so that std::accumulate() can skip sign
+ // extending each element.
+ int8_t dc_sign = std::accumulate(
+ dc_categories, dc_categories + GetNumElements(w4, x4, max_x4x4), 0);
+ const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+ dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
+ dc_sign = std::accumulate(
+ dc_categories, dc_categories + GetNumElements(h4, y4, max_y4x4), dc_sign);
+ // This return statement is equivalent to:
+ // if (dc_sign < 0) return 1;
+ // if (dc_sign > 0) return 2;
+ // return 0;
+ // And it is better than:
+ // return static_cast<int>(dc_sign != 0) + static_cast<int>(dc_sign > 0);
+ return static_cast<int>(dc_sign < 0) +
+ MultiplyBy2(static_cast<int>(dc_sign > 0));
+}
+
+void Tile::SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
+ uint8_t coefficient_level, int8_t dc_category) {
+ const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+ const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
+ memset(&coefficient_levels_[kEntropyContextTop][plane][x4], coefficient_level,
+ num_top_elements);
+ memset(&dc_categories_[kEntropyContextTop][plane][x4], dc_category,
+ num_top_elements);
+ const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+ const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
+ memset(&coefficient_levels_[kEntropyContextLeft][plane][y4],
+ coefficient_level, num_left_elements);
+ memset(&dc_categories_[kEntropyContextLeft][plane][y4], dc_category,
+ num_left_elements);
+}
+
+template <typename ResidualType, bool is_dc_coefficient>
+bool Tile::ReadSignAndApplyDequantization(
+ const uint16_t* const scan, int i, int q_value,
+ const uint8_t* const quantizer_matrix, int shift, int max_value,
+ uint16_t* const dc_sign_cdf, int8_t* const dc_category,
+ int* const coefficient_level, ResidualType* residual_buffer) {
+ const int pos = is_dc_coefficient ? 0 : scan[i];
+ // If residual_buffer[pos] is zero, then the rest of the function has no
+ // effect.
+ int level = residual_buffer[pos];
+ if (level == 0) return true;
+ const int sign = is_dc_coefficient
+ ? static_cast<int>(reader_.ReadSymbol(dc_sign_cdf))
+ : reader_.ReadBit();
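+ // Levels above kNumQuantizerBaseLevels + kQuantizerCoefficientBaseRange
+ // carry an Exp-Golomb coded remainder: a unary prefix (capped at 20 bits)
+ // gives the length, the next length - 1 bits form the value |x|, and x - 1
+ // is added to the level.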
+ if (level > kNumQuantizerBaseLevels + kQuantizerCoefficientBaseRange) {
+ int length = 0;
+ bool golomb_length_bit = false;
+ do {
+ golomb_length_bit = reader_.ReadBit() != 0;
+ ++length;
+ if (length > 20) {
+ LIBGAV1_DLOG(ERROR, "Invalid golomb_length %d", length);
+ return false;
+ }
+ } while (!golomb_length_bit);
+ int x = 1;
+ for (int i = length - 2; i >= 0; --i) {
+ x = (x << 1) | reader_.ReadBit();
+ }
+ level += x - 1;
+ }
+ if (is_dc_coefficient) {
+ *dc_category = (sign != 0) ? -1 : 1;
+ }
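+ // Clamp the coefficient level to 20 bits; the dequantized product below is
+ // likewise masked to 24 bits, matching the fixed bit widths used by the
+ // spec's dequantization process.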
+ level &= 0xfffff;
+ *coefficient_level += level;
+ // Apply dequantization. Step 1 of section 7.12.3 in the spec.
+ int q = q_value;
+ if (quantizer_matrix != nullptr) {
+ q = RightShiftWithRounding(q * quantizer_matrix[pos], 5);
+ }
+ // The intermediate multiplication can exceed 32 bits, so it has to be
+ // performed by promoting one of the values to int64_t.
+ int32_t dequantized_value = (static_cast<int64_t>(q) * level) & 0xffffff;
+ dequantized_value >>= shift;
+ // At this point:
+ // * |dequantized_value| is always non-negative.
+ // * |sign| can be either 0 or 1.
+ // * min_value = -(max_value + 1).
+ // We need to apply the following:
+ // dequantized_value = sign ? -dequantized_value : dequantized_value;
+ // dequantized_value = Clip3(dequantized_value, min_value, max_value);
+ //
+ // Note that -x == ~(x - 1).
+ //
+ // The above two lines can be done with a std::min and an xor as follows:
+ dequantized_value = std::min(dequantized_value - sign, max_value) ^ -sign;
+ residual_buffer[pos] = dequantized_value;
+ return true;
+}
+
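+// Decodes the coeff_br symbols for one coefficient. Each symbol adds at most
+// kCoeffBaseRangeSymbolCount - 1 to the level; a symbol below that maximum
+// ends the loop, and at most kCoeffBaseRangeMaxIterations symbols are read.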
+int Tile::ReadCoeffBaseRange(uint16_t* cdf) {
+ int level = 0;
+ for (int j = 0; j < kCoeffBaseRangeMaxIterations; ++j) {
+ const int coeff_base_range =
+ reader_.ReadSymbol<kCoeffBaseRangeSymbolCount>(cdf);
+ level += coeff_base_range;
+ if (coeff_base_range < (kCoeffBaseRangeSymbolCount - 1)) break;
+ }
+ return level;
+}
+
+template <typename ResidualType>
+int Tile::ReadTransformCoefficients(const Block& block, Plane plane,
+ int start_x, int start_y,
+ TransformSize tx_size,
+ TransformType* const tx_type) {
+ const int x4 = DivideBy4(start_x);
+ const int y4 = DivideBy4(start_y);
+ const int w4 = kTransformWidth4x4[tx_size];
+ const int h4 = kTransformHeight4x4[tx_size];
+ const int tx_size_context = kTransformSizeContext[tx_size];
+ int context =
+ GetTransformAllZeroContext(block, plane, tx_size, x4, y4, w4, h4);
+ const bool all_zero = reader_.ReadSymbol(
+ symbol_decoder_context_.all_zero_cdf[tx_size_context][context]);
+ if (all_zero) {
+ if (plane == kPlaneY) {
+ SetTransformType(block, x4, y4, w4, h4, kTransformTypeDctDct,
+ transform_types_);
+ }
+ SetEntropyContexts(x4, y4, w4, h4, plane, 0, 0);
+ // This is not used in this case, so it can be set to any value.
+ *tx_type = kNumTransformTypes;
+ return 0;
+ }
+ const int tx_width = kTransformWidth[tx_size];
+ const int tx_height = kTransformHeight[tx_size];
+ const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
+ const int adjusted_tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
+ const int tx_padding =
+ (1 << adjusted_tx_width_log2) * kResidualPaddingVertical;
+ auto* residual = reinterpret_cast<ResidualType*>(*block.residual);
+ // Clear padding to avoid bottom boundary checks when parsing quantized
+ // coefficients.
+ memset(residual, 0, (tx_width * tx_height + tx_padding) * residual_size_);
+ uint8_t level_buffer[(32 + kResidualPaddingVertical) * 32];
+ memset(
+ level_buffer, 0,
+ kTransformWidth[adjusted_tx_size] * kTransformHeight[adjusted_tx_size] +
+ tx_padding);
+ const int clamped_tx_height = std::min(tx_height, 32);
+ if (plane == kPlaneY) {
+ ReadTransformType(block, x4, y4, tx_size);
+ }
+ BlockParameters& bp = *block.bp;
+ *tx_type = ComputeTransformType(block, plane, tx_size, x4, y4);
+ const int eob_multi_size = kEobMultiSizeLookup[tx_size];
+ const PlaneType plane_type = GetPlaneType(plane);
+ const TransformClass tx_class = GetTransformClass(*tx_type);
+ context = static_cast<int>(tx_class != kTransformClass2D);
+ int eob_pt = 1;
+ switch (eob_multi_size) {
+ case 0:
+ eob_pt += reader_.ReadSymbol<kEobPt16SymbolCount>(
+ symbol_decoder_context_.eob_pt_16_cdf[plane_type][context]);
+ break;
+ case 1:
+ eob_pt += reader_.ReadSymbol<kEobPt32SymbolCount>(
+ symbol_decoder_context_.eob_pt_32_cdf[plane_type][context]);
+ break;
+ case 2:
+ eob_pt += reader_.ReadSymbol<kEobPt64SymbolCount>(
+ symbol_decoder_context_.eob_pt_64_cdf[plane_type][context]);
+ break;
+ case 3:
+ eob_pt += reader_.ReadSymbol<kEobPt128SymbolCount>(
+ symbol_decoder_context_.eob_pt_128_cdf[plane_type][context]);
+ break;
+ case 4:
+ eob_pt += reader_.ReadSymbol<kEobPt256SymbolCount>(
+ symbol_decoder_context_.eob_pt_256_cdf[plane_type][context]);
+ break;
+ case 5:
+ eob_pt += reader_.ReadSymbol<kEobPt512SymbolCount>(
+ symbol_decoder_context_.eob_pt_512_cdf[plane_type]);
+ break;
+ case 6:
+ default:
+ eob_pt += reader_.ReadSymbol<kEobPt1024SymbolCount>(
+ symbol_decoder_context_.eob_pt_1024_cdf[plane_type]);
+ break;
+ }
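+ // |eob_pt| selects the interval containing the end-of-block position:
+ // eob_pt of 1, 2, 3, 4, 5, ... maps to a base eob of 1, 2, 3, 5, 9, ....
+ // For eob_pt >= 3 the interval is (2^(eob_pt - 2), 2^(eob_pt - 1)] and the
+ // extra bits read below select the exact position within it.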
+ int eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1);
+ if (eob_pt >= 3) {
+ context = eob_pt - 3;
+ const bool eob_extra = reader_.ReadSymbol(
+ symbol_decoder_context_
+ .eob_extra_cdf[tx_size_context][plane_type][context]);
+ if (eob_extra) eob += 1 << (eob_pt - 3);
+ for (int i = 1; i < eob_pt - 2; ++i) {
+ assert(eob_pt - i >= 3);
+ assert(eob_pt <= kEobPt1024SymbolCount);
+ if (reader_.ReadBit() != 0) {
+ eob += 1 << (eob_pt - i - 3);
+ }
+ }
+ }
+ const uint16_t* scan = kScan[tx_class][tx_size];
+ const int clamped_tx_size_context = std::min(tx_size_context, 3);
+ auto coeff_base_range_cdf =
+ symbol_decoder_context_
+ .coeff_base_range_cdf[clamped_tx_size_context][plane_type];
+ // Read the last coefficient.
+ {
+ context = GetCoeffBaseContextEob(tx_size, eob - 1);
+ const uint16_t pos = scan[eob - 1];
+ int level =
+ 1 + reader_.ReadSymbol<kCoeffBaseEobSymbolCount>(
+ symbol_decoder_context_
+ .coeff_base_eob_cdf[tx_size_context][plane_type][context]);
+ level_buffer[pos] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ level +=
+ ReadCoeffBaseRange(coeff_base_range_cdf[GetCoeffBaseRangeContextEob(
+ adjusted_tx_width_log2, pos, tx_class)]);
+ }
+ residual[pos] = level;
+ }
+ if (eob > 1) {
+ // Read all the other coefficients.
+ // Lookup used to call the right variant of ReadCoeffBase*() based on the
+ // transform class.
+ static constexpr void (Tile::*kGetCoeffBaseFunc[])(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer,
+ uint8_t* level_buffer) = {&Tile::ReadCoeffBase2D<ResidualType>,
+ &Tile::ReadCoeffBaseHorizontal<ResidualType>,
+ &Tile::ReadCoeffBaseVertical<ResidualType>};
+ (this->*kGetCoeffBaseFunc[tx_class])(
+ scan, tx_size, adjusted_tx_width_log2, eob,
+ symbol_decoder_context_.coeff_base_cdf[tx_size_context][plane_type],
+ coeff_base_range_cdf, residual, level_buffer);
+ }
+ const int max_value = (1 << (7 + sequence_header_.color_config.bitdepth)) - 1;
+ const int current_quantizer_index =
+ GetQIndex(frame_header_.segmentation,
+ bp.prediction_parameters->segment_id, current_quantizer_index_);
+ const int dc_q_value = quantizer_.GetDcValue(plane, current_quantizer_index);
+ const int ac_q_value = quantizer_.GetAcValue(plane, current_quantizer_index);
+ const int shift = kQuantizationShift[tx_size];
+ const uint8_t* const quantizer_matrix =
+ (frame_header_.quantizer.use_matrix &&
+ *tx_type < kTransformTypeIdentityIdentity &&
+ !frame_header_.segmentation
+ .lossless[bp.prediction_parameters->segment_id] &&
+ frame_header_.quantizer.matrix_level[plane] < 15)
+ ? quantizer_matrix_[frame_header_.quantizer.matrix_level[plane]]
+ [plane_type][adjusted_tx_size]
+ .get()
+ : nullptr;
+ int coefficient_level = 0;
+ int8_t dc_category = 0;
+ uint16_t* const dc_sign_cdf =
+ (residual[0] != 0)
+ ? symbol_decoder_context_.dc_sign_cdf[plane_type][GetDcSignContext(
+ x4, y4, w4, h4, plane)]
+ : nullptr;
+ assert(scan[0] == 0);
+ if (!ReadSignAndApplyDequantization<ResidualType, /*is_dc_coefficient=*/true>(
+ scan, 0, dc_q_value, quantizer_matrix, shift, max_value, dc_sign_cdf,
+ &dc_category, &coefficient_level, residual)) {
+ return -1;
+ }
+ if (eob > 1) {
+ int i = 1;
+ do {
+ if (!ReadSignAndApplyDequantization<ResidualType,
+ /*is_dc_coefficient=*/false>(
+ scan, i, ac_q_value, quantizer_matrix, shift, max_value, nullptr,
+ nullptr, &coefficient_level, residual)) {
+ return -1;
+ }
+ } while (++i < eob);
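+ // For 64-wide transforms only 32 coefficients per row are coded; this call
+ // spreads the contiguously parsed rows out to the 64-wide layout expected by
+ // reconstruction (and is presumably a no-op for other widths).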
+ MoveCoefficientsForTxWidth64(clamped_tx_height, tx_width, residual);
+ }
+ SetEntropyContexts(x4, y4, w4, h4, plane, std::min(4, coefficient_level),
+ dc_category);
+ if (split_parse_and_decode_) {
+ *block.residual += tx_width * tx_height * residual_size_;
+ }
+ return eob;
+}
+
+// CALL_BITDEPTH_FUNCTION is a macro that calls the appropriate template
+// |function| depending on the value of |sequence_header_.color_config.bitdepth|
+// with the variadic arguments.
+#if LIBGAV1_MAX_BITDEPTH >= 10
+#define CALL_BITDEPTH_FUNCTION(function, ...) \
+ do { \
+ if (sequence_header_.color_config.bitdepth > 8) { \
+ function<uint16_t>(__VA_ARGS__); \
+ } else { \
+ function<uint8_t>(__VA_ARGS__); \
+ } \
+ } while (false)
+#else
+#define CALL_BITDEPTH_FUNCTION(function, ...) \
+ do { \
+ function<uint8_t>(__VA_ARGS__); \
+ } while (false)
+#endif
+
+bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
+ int base_y, TransformSize tx_size, int x, int y,
+ ProcessingMode mode) {
+ BlockParameters& bp = *block.bp;
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ const int start_x = base_x + MultiplyBy4(x);
+ const int start_y = base_y + MultiplyBy4(y);
+ const int max_x = MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
+ const int max_y = MultiplyBy4(frame_header_.rows4x4) >> subsampling_y;
+ if (start_x >= max_x || start_y >= max_y) return true;
+ const int row = DivideBy4(start_y << subsampling_y);
+ const int column = DivideBy4(start_x << subsampling_x);
+ const int mask = sequence_header_.use_128x128_superblock ? 31 : 15;
+ const int sub_block_row4x4 = row & mask;
+ const int sub_block_column4x4 = column & mask;
+ const int step_x = kTransformWidth4x4[tx_size];
+ const int step_y = kTransformHeight4x4[tx_size];
+ const bool do_decode = mode == kProcessingModeDecodeOnly ||
+ mode == kProcessingModeParseAndDecode;
+ if (do_decode && !bp.is_inter) {
+ if (bp.prediction_parameters->palette_mode_info.size[GetPlaneType(plane)] >
+ 0) {
+ CALL_BITDEPTH_FUNCTION(PalettePrediction, block, plane, start_x, start_y,
+ x, y, tx_size);
+ } else {
+ const PredictionMode mode =
+ (plane == kPlaneY) ? bp.y_mode
+ : (bp.prediction_parameters->uv_mode ==
+ kPredictionModeChromaFromLuma
+ ? kPredictionModeDc
+ : bp.prediction_parameters->uv_mode);
+ const int tr_row4x4 = (sub_block_row4x4 >> subsampling_y);
+ const int tr_column4x4 =
+ (sub_block_column4x4 >> subsampling_x) + step_x + 1;
+ const int bl_row4x4 = (sub_block_row4x4 >> subsampling_y) + step_y + 1;
+ const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x);
+ const bool has_left = x > 0 || block.left_available[plane];
+ const bool has_top = y > 0 || block.top_available[plane];
+
+ CALL_BITDEPTH_FUNCTION(
+ IntraPrediction, block, plane, start_x, start_y, has_left, has_top,
+ block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
+ block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
+ mode, tx_size);
+ if (plane != kPlaneY &&
+ bp.prediction_parameters->uv_mode == kPredictionModeChromaFromLuma) {
+ CALL_BITDEPTH_FUNCTION(ChromaFromLumaPrediction, block, plane, start_x,
+ start_y, tx_size);
+ }
+ }
+ if (plane == kPlaneY) {
+ block.bp->prediction_parameters->max_luma_width =
+ start_x + MultiplyBy4(step_x);
+ block.bp->prediction_parameters->max_luma_height =
+ start_y + MultiplyBy4(step_y);
+ block.scratch_buffer->cfl_luma_buffer_valid = false;
+ }
+ }
+ if (!bp.skip) {
+ const int sb_row_index = SuperBlockRowIndex(block.row4x4);
+ const int sb_column_index = SuperBlockColumnIndex(block.column4x4);
+ if (mode == kProcessingModeDecodeOnly) {
+ Queue<TransformParameters>& tx_params =
+ *residual_buffer_threaded_[sb_row_index][sb_column_index]
+ ->transform_parameters();
+ ReconstructBlock(block, plane, start_x, start_y, tx_size,
+ tx_params.Front().type,
+ tx_params.Front().non_zero_coeff_count);
+ tx_params.Pop();
+ } else {
+ TransformType tx_type;
+ int non_zero_coeff_count;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (sequence_header_.color_config.bitdepth > 8) {
+ non_zero_coeff_count = ReadTransformCoefficients<int32_t>(
+ block, plane, start_x, start_y, tx_size, &tx_type);
+ } else // NOLINT
+#endif
+ {
+ non_zero_coeff_count = ReadTransformCoefficients<int16_t>(
+ block, plane, start_x, start_y, tx_size, &tx_type);
+ }
+ if (non_zero_coeff_count < 0) return false;
+ if (mode == kProcessingModeParseAndDecode) {
+ ReconstructBlock(block, plane, start_x, start_y, tx_size, tx_type,
+ non_zero_coeff_count);
+ } else {
+ assert(mode == kProcessingModeParseOnly);
+ residual_buffer_threaded_[sb_row_index][sb_column_index]
+ ->transform_parameters()
+ ->Push(TransformParameters(tx_type, non_zero_coeff_count));
+ }
+ }
+ }
+ if (do_decode) {
+ bool* block_decoded =
+ &block.scratch_buffer
+ ->block_decoded[plane][(sub_block_row4x4 >> subsampling_y) + 1]
+ [(sub_block_column4x4 >> subsampling_x) + 1];
+ SetBlockValues<bool>(step_y, step_x, true, block_decoded,
+ TileScratchBuffer::kBlockDecodedStride);
+ }
+ return true;
+}
+
+bool Tile::TransformTree(const Block& block, int start_x, int start_y,
+ BlockSize plane_size, ProcessingMode mode) {
+ assert(plane_size <= kBlock64x64);
+ // The branching factor is 4 and the maximum depth is 4, so the maximum
+ // stack size required is (4 - 1) * 4 + 1 = 13.
+ Stack<TransformTreeNode, 13> stack;
+ // It is okay to cast BlockSize to TransformSize here since the enums are
+ // equivalent for all BlockSize values <= kBlock64x64.
+ stack.Push(TransformTreeNode(start_x, start_y,
+ static_cast<TransformSize>(plane_size)));
+
+ do {
+ TransformTreeNode node = stack.Pop();
+ const int row = DivideBy4(node.y);
+ const int column = DivideBy4(node.x);
+ if (row >= frame_header_.rows4x4 || column >= frame_header_.columns4x4) {
+ continue;
+ }
+ const TransformSize inter_tx_size = inter_transform_sizes_[row][column];
+ const int width = kTransformWidth[node.tx_size];
+ const int height = kTransformHeight[node.tx_size];
+ if (width <= kTransformWidth[inter_tx_size] &&
+ height <= kTransformHeight[inter_tx_size]) {
+ if (!TransformBlock(block, kPlaneY, node.x, node.y, node.tx_size, 0, 0,
+ mode)) {
+ return false;
+ }
+ continue;
+ }
+ // The split transform size lookup gives the transform size that we should
+ // push onto the stack:
+ // if (width > height) => transform size whose width is half.
+ // if (width < height) => transform size whose height is half.
+ // if (width == height) => transform size whose width and height are half.
+ const TransformSize split_tx_size = kSplitTransformSize[node.tx_size];
+ const int half_width = DivideBy2(width);
+ if (width > height) {
+ stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
+ stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+ continue;
+ }
+ const int half_height = DivideBy2(height);
+ if (width < height) {
+ stack.Push(
+ TransformTreeNode(node.x, node.y + half_height, split_tx_size));
+ stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+ continue;
+ }
+ stack.Push(TransformTreeNode(node.x + half_width, node.y + half_height,
+ split_tx_size));
+ stack.Push(TransformTreeNode(node.x, node.y + half_height, split_tx_size));
+ stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
+ stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+ } while (!stack.Empty());
+ return true;
+}
+
+void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x,
+ int start_y, TransformSize tx_size,
+ TransformType tx_type, int non_zero_coeff_count) {
+ // Reconstruction process. Steps 2 and 3 of Section 7.12.3 in the spec.
+ assert(non_zero_coeff_count >= 0);
+ if (non_zero_coeff_count == 0) return;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (sequence_header_.color_config.bitdepth > 8) {
+ Array2DView<uint16_t> buffer(
+ buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
+ reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
+ Reconstruct(dsp_, tx_type, tx_size,
+ frame_header_.segmentation
+ .lossless[block.bp->prediction_parameters->segment_id],
+ reinterpret_cast<int32_t*>(*block.residual), start_x, start_y,
+ &buffer, non_zero_coeff_count);
+ } else // NOLINT
+#endif
+ {
+ Reconstruct(dsp_, tx_type, tx_size,
+ frame_header_.segmentation
+ .lossless[block.bp->prediction_parameters->segment_id],
+ reinterpret_cast<int16_t*>(*block.residual), start_x, start_y,
+ &buffer_[plane], non_zero_coeff_count);
+ }
+ if (split_parse_and_decode_) {
+ *block.residual +=
+ kTransformWidth[tx_size] * kTransformHeight[tx_size] * residual_size_;
+ }
+}
+
+bool Tile::Residual(const Block& block, ProcessingMode mode) {
+ const int width_chunks = std::max(1, block.width >> 6);
+ const int height_chunks = std::max(1, block.height >> 6);
+ const BlockSize size_chunk4x4 =
+ (width_chunks > 1 || height_chunks > 1) ? kBlock64x64 : block.size;
+ const BlockParameters& bp = *block.bp;
+ for (int chunk_y = 0; chunk_y < height_chunks; ++chunk_y) {
+ for (int chunk_x = 0; chunk_x < width_chunks; ++chunk_x) {
+ const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+ int plane = kPlaneY;
+ do {
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ // For the Y plane, when lossless is true the transform size is always
+ // kTransformSize4x4, so the per-block entry in |inter_transform_sizes_| can
+ // be used directly as the Y plane's transform size (part of Section 5.11.37
+ // in the spec).
+ const TransformSize tx_size =
+ (plane == kPlaneY)
+ ? inter_transform_sizes_[block.row4x4][block.column4x4]
+ : bp.uv_transform_size;
+ const BlockSize plane_size =
+ kPlaneResidualSize[size_chunk4x4][subsampling_x][subsampling_y];
+ assert(plane_size != kBlockInvalid);
+ if (bp.is_inter &&
+ !frame_header_.segmentation
+ .lossless[bp.prediction_parameters->segment_id] &&
+ plane == kPlaneY) {
+ const int row_chunk4x4 = block.row4x4 + MultiplyBy16(chunk_y);
+ const int column_chunk4x4 = block.column4x4 + MultiplyBy16(chunk_x);
+ const int base_x = MultiplyBy4(column_chunk4x4 >> subsampling_x);
+ const int base_y = MultiplyBy4(row_chunk4x4 >> subsampling_y);
+ if (!TransformTree(block, base_x, base_y, plane_size, mode)) {
+ return false;
+ }
+ } else {
+ const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
+ const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
+ const int step_x = kTransformWidth4x4[tx_size];
+ const int step_y = kTransformHeight4x4[tx_size];
+ const int num4x4_wide = kNum4x4BlocksWide[plane_size];
+ const int num4x4_high = kNum4x4BlocksHigh[plane_size];
+ for (int y = 0; y < num4x4_high; y += step_y) {
+ for (int x = 0; x < num4x4_wide; x += step_x) {
+ if (!TransformBlock(
+ block, static_cast<Plane>(plane), base_x, base_y, tx_size,
+ x + (MultiplyBy16(chunk_x) >> subsampling_x),
+ y + (MultiplyBy16(chunk_y) >> subsampling_y), mode)) {
+ return false;
+ }
+ }
+ }
+ }
+ } while (++plane < num_planes);
+ }
+ }
+ return true;
+}
+
+// The purpose of this function is to limit the maximum size of motion vectors
+// and also, if use_intra_block_copy is true, to additionally constrain the
+// motion vector so that the data is fetched from parts of the tile that have
+// already been decoded and are not too close to the current block (in order to
+// make a pipelined decoder implementation feasible).
+bool Tile::IsMvValid(const Block& block, bool is_compound) const {
+ const BlockParameters& bp = *block.bp;
+ for (int i = 0; i < 1 + static_cast<int>(is_compound); ++i) {
+ for (int mv_component : bp.mv.mv[i].mv) {
+ if (std::abs(mv_component) >= (1 << 14)) {
+ return false;
+ }
+ }
+ }
+ if (!block.bp->prediction_parameters->use_intra_block_copy) {
+ return true;
+ }
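+ // Motion vectors are stored in 1/8-pel units. Intra block copy requires
+ // whole-pel vectors, so the low 3 bits of both 16-bit components must be
+ // zero.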
+ if ((bp.mv.mv[0].mv32 & 0x00070007) != 0) {
+ return false;
+ }
+ const int delta_row = bp.mv.mv[0].mv[0] >> 3;
+ const int delta_column = bp.mv.mv[0].mv[1] >> 3;
+ int src_top_edge = MultiplyBy4(block.row4x4) + delta_row;
+ int src_left_edge = MultiplyBy4(block.column4x4) + delta_column;
+ const int src_bottom_edge = src_top_edge + block.height;
+ const int src_right_edge = src_left_edge + block.width;
+ if (block.HasChroma()) {
+ if (block.width < 8 && subsampling_x_[kPlaneU] != 0) {
+ src_left_edge -= 4;
+ }
+ if (block.height < 8 && subsampling_y_[kPlaneU] != 0) {
+ src_top_edge -= 4;
+ }
+ }
+ if (src_top_edge < MultiplyBy4(row4x4_start_) ||
+ src_left_edge < MultiplyBy4(column4x4_start_) ||
+ src_bottom_edge > MultiplyBy4(row4x4_end_) ||
+ src_right_edge > MultiplyBy4(column4x4_end_)) {
+ return false;
+ }
+ // sb_height_log2 = use_128x128_superblock ? log2(128) : log2(64)
+ const int sb_height_log2 =
+ 6 + static_cast<int>(sequence_header_.use_128x128_superblock);
+ const int active_sb_row = MultiplyBy4(block.row4x4) >> sb_height_log2;
+ const int active_64x64_block_column = MultiplyBy4(block.column4x4) >> 6;
+ const int src_sb_row = (src_bottom_edge - 1) >> sb_height_log2;
+ const int src_64x64_block_column = (src_right_edge - 1) >> 6;
+ const int total_64x64_blocks_per_row =
+ ((column4x4_end_ - column4x4_start_ - 1) >> 4) + 1;
+ const int active_64x64_block =
+ active_sb_row * total_64x64_blocks_per_row + active_64x64_block_column;
+ const int src_64x64_block =
+ src_sb_row * total_64x64_blocks_per_row + src_64x64_block_column;
+ if (src_64x64_block >= active_64x64_block - kIntraBlockCopyDelay64x64Blocks) {
+ return false;
+ }
+
+ // Wavefront constraint: use only top left area of frame for reference.
+ if (src_sb_row > active_sb_row) return false;
+ const int gradient =
+ 1 + kIntraBlockCopyDelay64x64Blocks +
+ static_cast<int>(sequence_header_.use_128x128_superblock);
+ const int wavefront_offset = gradient * (active_sb_row - src_sb_row);
+ return src_64x64_block_column < active_64x64_block_column -
+ kIntraBlockCopyDelay64x64Blocks +
+ wavefront_offset;
+}
+
+bool Tile::AssignInterMv(const Block& block, bool is_compound) {
+ int min[2];
+ int max[2];
+ GetClampParameters(block, min, max);
+ BlockParameters& bp = *block.bp;
+ const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+ bp.mv.mv64 = 0;
+ if (is_compound) {
+ for (int i = 0; i < 2; ++i) {
+ const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode);
+ MotionVector predicted_mv;
+ if (mode == kPredictionModeGlobalMv) {
+ predicted_mv = prediction_parameters.global_mv[i];
+ } else {
+ const int ref_mv_index = (mode == kPredictionModeNearestMv ||
+ (mode == kPredictionModeNewMv &&
+ prediction_parameters.ref_mv_count <= 1))
+ ? 0
+ : prediction_parameters.ref_mv_index;
+ predicted_mv = prediction_parameters.reference_mv(ref_mv_index, i);
+ if (ref_mv_index < prediction_parameters.ref_mv_count) {
+ predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
+ predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
+ }
+ }
+ if (mode == kPredictionModeNewMv) {
+ ReadMotionVector(block, i);
+ bp.mv.mv[i].mv[0] += predicted_mv.mv[0];
+ bp.mv.mv[i].mv[1] += predicted_mv.mv[1];
+ } else {
+ bp.mv.mv[i] = predicted_mv;
+ }
+ }
+ } else {
+ const PredictionMode mode = GetSinglePredictionMode(0, bp.y_mode);
+ MotionVector predicted_mv;
+ if (mode == kPredictionModeGlobalMv) {
+ predicted_mv = prediction_parameters.global_mv[0];
+ } else {
+ const int ref_mv_index = (mode == kPredictionModeNearestMv ||
+ (mode == kPredictionModeNewMv &&
+ prediction_parameters.ref_mv_count <= 1))
+ ? 0
+ : prediction_parameters.ref_mv_index;
+ predicted_mv = prediction_parameters.reference_mv(ref_mv_index);
+ if (ref_mv_index < prediction_parameters.ref_mv_count) {
+ predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
+ predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
+ }
+ }
+ if (mode == kPredictionModeNewMv) {
+ ReadMotionVector(block, 0);
+ bp.mv.mv[0].mv[0] += predicted_mv.mv[0];
+ bp.mv.mv[0].mv[1] += predicted_mv.mv[1];
+ } else {
+ bp.mv.mv[0] = predicted_mv;
+ }
+ }
+ return IsMvValid(block, is_compound);
+}
+
+bool Tile::AssignIntraMv(const Block& block) {
+ // TODO(linfengz): Check if the clamping process is necessary.
+ int min[2];
+ int max[2];
+ GetClampParameters(block, min, max);
+ BlockParameters& bp = *block.bp;
+ const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+ const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0);
+ bp.mv.mv64 = 0;
+ ReadMotionVector(block, 0);
+ if (ref_mv_0.mv32 == 0) {
+ const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1);
+ if (ref_mv_1.mv32 == 0) {
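+ // Both candidate reference MVs are zero, so apply a default displacement:
+ // when there is no full superblock row above this block within the tile,
+ // point one superblock width (plus the intra block copy delay) to the left;
+ // otherwise point one superblock height up.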
+ const int super_block_size4x4 = kNum4x4BlocksHigh[SuperBlockSize()];
+ if (block.row4x4 - super_block_size4x4 < row4x4_start_) {
+ bp.mv.mv[0].mv[1] -= MultiplyBy32(super_block_size4x4);
+ bp.mv.mv[0].mv[1] -= MultiplyBy8(kIntraBlockCopyDelayPixels);
+ } else {
+ bp.mv.mv[0].mv[0] -= MultiplyBy32(super_block_size4x4);
+ }
+ } else {
+ bp.mv.mv[0].mv[0] += Clip3(ref_mv_1.mv[0], min[0], max[0]);
+ bp.mv.mv[0].mv[1] += Clip3(ref_mv_1.mv[1], min[1], max[1]);
+ }
+ } else {
+ bp.mv.mv[0].mv[0] += Clip3(ref_mv_0.mv[0], min[0], max[0]);
+ bp.mv.mv[0].mv[1] += Clip3(ref_mv_0.mv[1], min[1], max[1]);
+ }
+ return IsMvValid(block, /*is_compound=*/false);
+}
+
+void Tile::ResetEntropyContext(const Block& block) {
+ const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+ int plane = kPlaneY;
+ do {
+ const int subsampling_x = subsampling_x_[plane];
+ const int start_x = block.column4x4 >> subsampling_x;
+ const int end_x =
+ std::min((block.column4x4 + block.width4x4) >> subsampling_x,
+ frame_header_.columns4x4);
+ memset(&coefficient_levels_[kEntropyContextTop][plane][start_x], 0,
+ end_x - start_x);
+ memset(&dc_categories_[kEntropyContextTop][plane][start_x], 0,
+ end_x - start_x);
+ const int subsampling_y = subsampling_y_[plane];
+ const int start_y = block.row4x4 >> subsampling_y;
+ const int end_y =
+ std::min((block.row4x4 + block.height4x4) >> subsampling_y,
+ frame_header_.rows4x4);
+ memset(&coefficient_levels_[kEntropyContextLeft][plane][start_y], 0,
+ end_y - start_y);
+ memset(&dc_categories_[kEntropyContextLeft][plane][start_y], 0,
+ end_y - start_y);
+ } while (++plane < num_planes);
+}
+
+bool Tile::ComputePrediction(const Block& block) {
+ const BlockParameters& bp = *block.bp;
+ if (!bp.is_inter) return true;
+ const int mask =
+ (1 << (4 + static_cast<int>(sequence_header_.use_128x128_superblock))) -
+ 1;
+ const int sub_block_row4x4 = block.row4x4 & mask;
+ const int sub_block_column4x4 = block.column4x4 & mask;
+ const int plane_count = block.HasChroma() ? PlaneCount() : 1;
+ // Tracks whether this block applies local warping. The state is determined
+ // for the Y plane and carried over for the U/V planes. However, the U/V
+ // planes will not apply warping when the block size is smaller than 8x8,
+ // even if this variable is true.
+ bool is_local_valid = false;
+ // Local warping parameters, similar usage as is_local_valid.
+ GlobalMotion local_warp_params;
+ int plane = kPlaneY;
+ do {
+ const int8_t subsampling_x = subsampling_x_[plane];
+ const int8_t subsampling_y = subsampling_y_[plane];
+ const BlockSize plane_size = block.residual_size[plane];
+ const int block_width4x4 = kNum4x4BlocksWide[plane_size];
+ const int block_height4x4 = kNum4x4BlocksHigh[plane_size];
+ const int block_width = MultiplyBy4(block_width4x4);
+ const int block_height = MultiplyBy4(block_height4x4);
+ const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
+ const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
+ if (bp.reference_frame[1] == kReferenceFrameIntra) {
+ const int tr_row4x4 = sub_block_row4x4 >> subsampling_y;
+ const int tr_column4x4 =
+ (sub_block_column4x4 >> subsampling_x) + block_width4x4 + 1;
+ const int bl_row4x4 =
+ (sub_block_row4x4 >> subsampling_y) + block_height4x4;
+ const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x) + 1;
+ const TransformSize tx_size =
+ k4x4SizeToTransformSize[k4x4WidthLog2[plane_size]]
+ [k4x4HeightLog2[plane_size]];
+ const bool has_left = block.left_available[plane];
+ const bool has_top = block.top_available[plane];
+ CALL_BITDEPTH_FUNCTION(
+ IntraPrediction, block, static_cast<Plane>(plane), base_x, base_y,
+ has_left, has_top,
+ block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
+ block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
+ kInterIntraToIntraMode[block.bp->prediction_parameters
+ ->inter_intra_mode],
+ tx_size);
+ }
+ int candidate_row = block.row4x4;
+ int candidate_column = block.column4x4;
+ bool some_use_intra = bp.reference_frame[0] == kReferenceFrameIntra;
+ if (!some_use_intra && plane != 0) {
+ candidate_row = (candidate_row >> subsampling_y) << subsampling_y;
+ candidate_column = (candidate_column >> subsampling_x) << subsampling_x;
+ if (candidate_row != block.row4x4) {
+ // Top block.
+ const BlockParameters& bp_top =
+ *block_parameters_holder_.Find(candidate_row, block.column4x4);
+ some_use_intra = bp_top.reference_frame[0] == kReferenceFrameIntra;
+ if (!some_use_intra && candidate_column != block.column4x4) {
+ // Top-left block.
+ const BlockParameters& bp_top_left =
+ *block_parameters_holder_.Find(candidate_row, candidate_column);
+ some_use_intra =
+ bp_top_left.reference_frame[0] == kReferenceFrameIntra;
+ }
+ }
+ if (!some_use_intra && candidate_column != block.column4x4) {
+ // Left block.
+ const BlockParameters& bp_left =
+ *block_parameters_holder_.Find(block.row4x4, candidate_column);
+ some_use_intra = bp_left.reference_frame[0] == kReferenceFrameIntra;
+ }
+ }
+ int prediction_width;
+ int prediction_height;
+ if (some_use_intra) {
+ candidate_row = block.row4x4;
+ candidate_column = block.column4x4;
+ prediction_width = block_width;
+ prediction_height = block_height;
+ } else {
+ prediction_width = block.width >> subsampling_x;
+ prediction_height = block.height >> subsampling_y;
+ }
+ int r = 0;
+ int y = 0;
+ do {
+ int c = 0;
+ int x = 0;
+ do {
+ if (!InterPrediction(block, static_cast<Plane>(plane), base_x + x,
+ base_y + y, prediction_width, prediction_height,
+ candidate_row + r, candidate_column + c,
+ &is_local_valid, &local_warp_params)) {
+ return false;
+ }
+ ++c;
+ x += prediction_width;
+ } while (x < block_width);
+ ++r;
+ y += prediction_height;
+ } while (y < block_height);
+ } while (++plane < plane_count);
+ return true;
+}
+
+#undef CALL_BITDEPTH_FUNCTION
+
+void Tile::PopulateDeblockFilterLevel(const Block& block) {
+ if (!post_filter_.DoDeblock()) return;
+ BlockParameters& bp = *block.bp;
+ const int mode_id =
+ static_cast<int>(kPredictionModeDeltasMask.Contains(bp.y_mode));
+ for (int i = 0; i < kFrameLfCount; ++i) {
+ if (delta_lf_all_zero_) {
+ bp.deblock_filter_level[i] = post_filter_.GetZeroDeltaDeblockFilterLevel(
+ bp.prediction_parameters->segment_id, i, bp.reference_frame[0],
+ mode_id);
+ } else {
+ bp.deblock_filter_level[i] =
+ deblock_filter_levels_[bp.prediction_parameters->segment_id][i]
+ [bp.reference_frame[0]][mode_id];
+ }
+ }
+}
+
+void Tile::PopulateCdefSkip(const Block& block) {
+ if (!post_filter_.DoCdef() || block.bp->skip ||
+ (frame_header_.cdef.bits > 0 &&
+ cdef_index_[DivideBy16(block.row4x4)][DivideBy16(block.column4x4)] ==
+ -1)) {
+ return;
+ }
+ // The rest of this function is an efficient version of the following code:
+ // for (int y = block.row4x4; y < block.row4x4 + block.height4x4; y++) {
+ // for (int x = block.column4x4; x < block.column4x4 + block.width4x4;
+ // x++) {
+ // const uint8_t mask = uint8_t{1} << ((x >> 1) & 0x7);
+ // cdef_skip_[y >> 1][x >> 4] |= mask;
+ // }
+ // }
+
+ // For all block widths other than 32, the mask fits in uint8_t. For block
+ // width == 32 the conceptual mask is 0xFFFF, which is handled below by also
+ // writing 0xFF to the adjacent byte.
+ const int bw4 =
+ std::max(DivideBy2(block.width4x4) + (block.column4x4 & 1), 1);
+ const uint8_t mask = (block.width4x4 == 32)
+ ? 0xFF
+ : (uint8_t{0xFF} >> (8 - bw4))
+ << (DivideBy2(block.column4x4) & 0x7);
+ uint8_t* cdef_skip = &cdef_skip_[block.row4x4 >> 1][block.column4x4 >> 4];
+ const int stride = cdef_skip_.columns();
+ int row = 0;
+ do {
+ *cdef_skip |= mask;
+ if (block.width4x4 == 32) {
+ *(cdef_skip + 1) = 0xFF;
+ }
+ cdef_skip += stride;
+ row += 2;
+ } while (row < block.height4x4);
+}
+
+bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
+ TileScratchBuffer* const scratch_buffer,
+ ResidualPtr* residual) {
+ // Do not process the block if the starting point is beyond the visible frame.
+ // This is equivalent to the has_row/has_column check in the
+ // decode_partition() section of the spec when partition equals
+ // kPartitionHorizontal or kPartitionVertical.
+ if (row4x4 >= frame_header_.rows4x4 ||
+ column4x4 >= frame_header_.columns4x4) {
+ return true;
+ }
+
+ if (split_parse_and_decode_) {
+ // Push block ordering info to the queue. DecodeBlock() will use this queue
+ // to decode the blocks in the correct order.
+ const int sb_row_index = SuperBlockRowIndex(row4x4);
+ const int sb_column_index = SuperBlockColumnIndex(column4x4);
+ residual_buffer_threaded_[sb_row_index][sb_column_index]
+ ->partition_tree_order()
+ ->Push(PartitionTreeNode(row4x4, column4x4, block_size));
+ }
+
+ BlockParameters* bp_ptr =
+ block_parameters_holder_.Get(row4x4, column4x4, block_size);
+ if (bp_ptr == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get BlockParameters.");
+ return false;
+ }
+ BlockParameters& bp = *bp_ptr;
+ Block block(this, block_size, row4x4, column4x4, scratch_buffer, residual);
+ bp.size = block_size;
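+ // In split parse/decode mode every block needs its own PredictionParameters
+ // instance, since decoding happens later on another thread. Otherwise a
+ // single instance is reused: it is moved back into |prediction_parameters_|
+ // at the end of this function.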
+ bp.prediction_parameters =
+ split_parse_and_decode_ ? std::unique_ptr<PredictionParameters>(
+ new (std::nothrow) PredictionParameters())
+ : std::move(prediction_parameters_);
+ if (bp.prediction_parameters == nullptr) return false;
+ if (!DecodeModeInfo(block)) return false;
+ PopulateDeblockFilterLevel(block);
+ if (!ReadPaletteTokens(block)) return false;
+ DecodeTransformSize(block);
+ // Part of Section 5.11.37 in the spec (implemented as a simple lookup).
+ bp.uv_transform_size =
+ frame_header_.segmentation.lossless[bp.prediction_parameters->segment_id]
+ ? kTransformSize4x4
+ : kUVTransformSize[block.residual_size[kPlaneU]];
+ if (bp.skip) ResetEntropyContext(block);
+ PopulateCdefSkip(block);
+ if (split_parse_and_decode_) {
+ if (!Residual(block, kProcessingModeParseOnly)) return false;
+ } else {
+ if (!ComputePrediction(block) ||
+ !Residual(block, kProcessingModeParseAndDecode)) {
+ return false;
+ }
+ }
+ // If frame_header_.segmentation.enabled is false,
+ // bp.prediction_parameters->segment_id is 0 for all blocks. We don't need to
+ // save bp.prediction_parameters->segment_id in the current frame because the
+ // current frame's segmentation map will be cleared to all 0s.
+ //
+ // If frame_header_.segmentation.enabled is true and
+ // frame_header_.segmentation.update_map is false, we will copy the previous
+ // frame's segmentation map to the current frame, so again we don't need to
+ // save bp.prediction_parameters->segment_id here.
+ if (frame_header_.segmentation.enabled &&
+ frame_header_.segmentation.update_map) {
+ const int x_limit = std::min(frame_header_.columns4x4 - column4x4,
+ static_cast<int>(block.width4x4));
+ const int y_limit = std::min(frame_header_.rows4x4 - row4x4,
+ static_cast<int>(block.height4x4));
+ current_frame_.segmentation_map()->FillBlock(
+ row4x4, column4x4, x_limit, y_limit,
+ bp.prediction_parameters->segment_id);
+ }
+ StoreMotionFieldMvsIntoCurrentFrame(block);
+ if (!split_parse_and_decode_) {
+ prediction_parameters_ = std::move(bp.prediction_parameters);
+ }
+ return true;
+}
+
+bool Tile::DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
+ TileScratchBuffer* const scratch_buffer,
+ ResidualPtr* residual) {
+ if (row4x4 >= frame_header_.rows4x4 ||
+ column4x4 >= frame_header_.columns4x4) {
+ return true;
+ }
+ Block block(this, block_size, row4x4, column4x4, scratch_buffer, residual);
+ if (!ComputePrediction(block) ||
+ !Residual(block, kProcessingModeDecodeOnly)) {
+ return false;
+ }
+ block.bp->prediction_parameters.reset(nullptr);
+ return true;
+}
+
+bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
+ TileScratchBuffer* const scratch_buffer,
+ ResidualPtr* residual) {
+ Stack<PartitionTreeNode, kDfsStackSize> stack;
+
+ // Set up the first iteration.
+ stack.Push(
+ PartitionTreeNode(row4x4_start, column4x4_start, SuperBlockSize()));
+
+ // DFS loop. If it sees a terminal node (leaf node), ProcessBlock is invoked.
+ // Otherwise, the children are pushed into the stack for future processing.
+ do {
+ PartitionTreeNode node = stack.Pop();
+ int row4x4 = node.row4x4;
+ int column4x4 = node.column4x4;
+ BlockSize block_size = node.block_size;
+
+ if (row4x4 >= frame_header_.rows4x4 ||
+ column4x4 >= frame_header_.columns4x4) {
+ continue;
+ }
+ const int block_width4x4 = kNum4x4BlocksWide[block_size];
+ assert(block_width4x4 == kNum4x4BlocksHigh[block_size]);
+ const int half_block4x4 = block_width4x4 >> 1;
+ const bool has_rows = (row4x4 + half_block4x4) < frame_header_.rows4x4;
+ const bool has_columns =
+ (column4x4 + half_block4x4) < frame_header_.columns4x4;
+ Partition partition;
+ if (!ReadPartition(row4x4, column4x4, block_size, has_rows, has_columns,
+ &partition)) {
+ LIBGAV1_DLOG(ERROR, "Failed to read partition for row: %d column: %d",
+ row4x4, column4x4);
+ return false;
+ }
+ const BlockSize sub_size = kSubSize[partition][block_size];
+ // Section 6.10.4: It is a requirement of bitstream conformance that
+ // get_plane_residual_size( subSize, 1 ) is not equal to BLOCK_INVALID
+ // every time subSize is computed.
+ if (sub_size == kBlockInvalid ||
+ kPlaneResidualSize[sub_size]
+ [sequence_header_.color_config.subsampling_x]
+ [sequence_header_.color_config.subsampling_y] ==
+ kBlockInvalid) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Invalid sub-block/plane size for row: %d column: %d partition: "
+ "%d block_size: %d sub_size: %d subsampling_x/y: %d, %d",
+ row4x4, column4x4, partition, block_size, sub_size,
+ sequence_header_.color_config.subsampling_x,
+ sequence_header_.color_config.subsampling_y);
+ return false;
+ }
+
+ const int quarter_block4x4 = half_block4x4 >> 1;
+ const BlockSize split_size = kSubSize[kPartitionSplit][block_size];
+ assert(partition == kPartitionNone || sub_size != kBlockInvalid);
+ switch (partition) {
+ case kPartitionNone:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+ residual)) {
+ return false;
+ }
+ break;
+ case kPartitionSplit:
+ // The children must be added in reverse order since a stack is being
+ // used.
+ stack.Push(PartitionTreeNode(row4x4 + half_block4x4,
+ column4x4 + half_block4x4, sub_size));
+ stack.Push(
+ PartitionTreeNode(row4x4 + half_block4x4, column4x4, sub_size));
+ stack.Push(
+ PartitionTreeNode(row4x4, column4x4 + half_block4x4, sub_size));
+ stack.Push(PartitionTreeNode(row4x4, column4x4, sub_size));
+ break;
+ case kPartitionHorizontal:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ break;
+ case kPartitionVertical:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ break;
+ case kPartitionHorizontalWithTopSplit:
+ if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
+ scratch_buffer, residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ break;
+ case kPartitionHorizontalWithBottomSplit:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
+ scratch_buffer, residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
+ split_size, scratch_buffer, residual)) {
+ return false;
+ }
+ break;
+ case kPartitionVerticalWithLeftSplit:
+ if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
+ scratch_buffer, residual) ||
+ !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ break;
+ case kPartitionVerticalWithRightSplit:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
+ scratch_buffer, residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
+ split_size, scratch_buffer, residual)) {
+ return false;
+ }
+ break;
+ case kPartitionHorizontal4:
+ for (int i = 0; i < 4; ++i) {
+ if (!ProcessBlock(row4x4 + i * quarter_block4x4, column4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ }
+ break;
+ case kPartitionVertical4:
+ for (int i = 0; i < 4; ++i) {
+ if (!ProcessBlock(row4x4, column4x4 + i * quarter_block4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ }
+ break;
+ }
+ } while (!stack.Empty());
+ return true;
+}
+
+void Tile::ResetLoopRestorationParams() {
+ for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+ for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+ reference_unit_info_[plane].sgr_proj_info.multiplier[i] =
+ kSgrProjDefaultMultiplier[i];
+ for (int j = 0; j < kNumWienerCoefficients; ++j) {
+ reference_unit_info_[plane].wiener_info.filter[i][j] =
+ kWienerDefaultFilter[j];
+ }
+ }
+ }
+}
+
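+// |cdef_index_| is stored per 64x64 unit; -1 marks a unit whose CDEF index
+// has not been read yet. With 128x128 superblocks, all four 64x64 quadrants
+// of the superblock are reset.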
+void Tile::ResetCdef(const int row4x4, const int column4x4) {
+ if (frame_header_.cdef.bits == 0) return;
+ const int row = DivideBy16(row4x4);
+ const int column = DivideBy16(column4x4);
+ cdef_index_[row][column] = -1;
+ if (sequence_header_.use_128x128_superblock) {
+ const int cdef_size4x4 = kNum4x4BlocksWide[kBlock64x64];
+ const int border_row = DivideBy16(row4x4 + cdef_size4x4);
+ const int border_column = DivideBy16(column4x4 + cdef_size4x4);
+ cdef_index_[row][border_column] = -1;
+ cdef_index_[border_row][column] = -1;
+ cdef_index_[border_row][border_column] = -1;
+ }
+}
+
+void Tile::ClearBlockDecoded(TileScratchBuffer* const scratch_buffer,
+ int row4x4, int column4x4) {
+ // Set everything to false.
+ memset(scratch_buffer->block_decoded, 0,
+ sizeof(scratch_buffer->block_decoded));
+ // Set specific edge cases to true.
+ const int sb_size4 = sequence_header_.use_128x128_superblock ? 32 : 16;
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ const int sb_width4 = (column4x4_end_ - column4x4) >> subsampling_x;
+ const int sb_height4 = (row4x4_end_ - row4x4) >> subsampling_y;
+ // The memset is equivalent to the following lines in the spec:
+ // for ( x = -1; x <= ( sbSize4 >> subX ); x++ ) {
+ // if ( y < 0 && x < sbWidth4 ) {
+ // BlockDecoded[plane][y][x] = 1
+ // }
+ // }
+ const int num_elements =
+ std::min((sb_size4 >> subsampling_x_[plane]) + 1, sb_width4) + 1;
+ memset(&scratch_buffer->block_decoded[plane][0][0], 1, num_elements);
+ // The for loop is equivalent to the following lines in the spec:
+ // for ( y = -1; y <= ( sbSize4 >> subY ); y++ ) {
+ // if ( x < 0 && y < sbHeight4 ) {
+ // BlockDecoded[plane][y][x] = 1
+ // }
+ // }
+ // BlockDecoded[plane][sbSize4 >> subY][-1] = 0
+ for (int y = -1; y < std::min((sb_size4 >> subsampling_y), sb_height4);
+ ++y) {
+ scratch_buffer->block_decoded[plane][y + 1][0] = true;
+ }
+ }
+}
+
+bool Tile::ProcessSuperBlock(int row4x4, int column4x4,
+ TileScratchBuffer* const scratch_buffer,
+ ProcessingMode mode) {
+ const bool parsing =
+ mode == kProcessingModeParseOnly || mode == kProcessingModeParseAndDecode;
+ const bool decoding = mode == kProcessingModeDecodeOnly ||
+ mode == kProcessingModeParseAndDecode;
+ if (parsing) {
+ read_deltas_ = frame_header_.delta_q.present;
+ ResetCdef(row4x4, column4x4);
+ }
+ if (decoding) {
+ ClearBlockDecoded(scratch_buffer, row4x4, column4x4);
+ }
+ const BlockSize block_size = SuperBlockSize();
+ if (parsing) {
+ ReadLoopRestorationCoefficients(row4x4, column4x4, block_size);
+ }
+ if (parsing && decoding) {
+ uint8_t* residual_buffer = residual_buffer_.get();
+ if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
+ &residual_buffer)) {
+ LIBGAV1_DLOG(ERROR, "Error decoding partition row: %d column: %d", row4x4,
+ column4x4);
+ return false;
+ }
+ return true;
+ }
+ const int sb_row_index = SuperBlockRowIndex(row4x4);
+ const int sb_column_index = SuperBlockColumnIndex(column4x4);
+ if (parsing) {
+ residual_buffer_threaded_[sb_row_index][sb_column_index] =
+ residual_buffer_pool_->Get();
+ if (residual_buffer_threaded_[sb_row_index][sb_column_index] == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get residual buffer.");
+ return false;
+ }
+ uint8_t* residual_buffer =
+ residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
+ if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
+ &residual_buffer)) {
+ LIBGAV1_DLOG(ERROR, "Error parsing partition row: %d column: %d", row4x4,
+ column4x4);
+ return false;
+ }
+ } else {
+ if (!DecodeSuperBlock(sb_row_index, sb_column_index, scratch_buffer)) {
+ LIBGAV1_DLOG(ERROR, "Error decoding superblock row: %d column: %d",
+ row4x4, column4x4);
+ return false;
+ }
+ residual_buffer_pool_->Release(
+ std::move(residual_buffer_threaded_[sb_row_index][sb_column_index]));
+ }
+ return true;
+}
+
+bool Tile::DecodeSuperBlock(int sb_row_index, int sb_column_index,
+ TileScratchBuffer* const scratch_buffer) {
+ uint8_t* residual_buffer =
+ residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
+ Queue<PartitionTreeNode>& partition_tree_order =
+ *residual_buffer_threaded_[sb_row_index][sb_column_index]
+ ->partition_tree_order();
+ while (!partition_tree_order.Empty()) {
+ PartitionTreeNode block = partition_tree_order.Front();
+ if (!DecodeBlock(block.row4x4, block.column4x4, block.block_size,
+ scratch_buffer, &residual_buffer)) {
+ LIBGAV1_DLOG(ERROR, "Error decoding block row: %d column: %d",
+ block.row4x4, block.column4x4);
+ return false;
+ }
+ partition_tree_order.Pop();
+ }
+ return true;
+}
+
+void Tile::ReadLoopRestorationCoefficients(int row4x4, int column4x4,
+ BlockSize block_size) {
+ if (frame_header_.allow_intrabc) return;
+ LoopRestorationInfo* const restoration_info = post_filter_.restoration_info();
+ const bool is_superres_scaled =
+ frame_header_.width != frame_header_.upscaled_width;
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+ LoopRestorationUnitInfo unit_info;
+ if (restoration_info->PopulateUnitInfoForSuperBlock(
+ static_cast<Plane>(plane), block_size, is_superres_scaled,
+ frame_header_.superres_scale_denominator, row4x4, column4x4,
+ &unit_info)) {
+ for (int unit_row = unit_info.row_start; unit_row < unit_info.row_end;
+ ++unit_row) {
+ for (int unit_column = unit_info.column_start;
+ unit_column < unit_info.column_end; ++unit_column) {
+ const int unit_id = unit_row * restoration_info->num_horizontal_units(
+ static_cast<Plane>(plane)) +
+ unit_column;
+ restoration_info->ReadUnitCoefficients(
+ &reader_, &symbol_decoder_context_, static_cast<Plane>(plane),
+ unit_id, &reference_unit_info_);
+ }
+ }
+ }
+ }
+}
+
+void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
+ if (frame_header_.refresh_frame_flags == 0 ||
+ IsIntraFrame(frame_header_.frame_type)) {
+ return;
+ }
+ // Iterate over odd rows/columns beginning at the first odd row/column for
+ // the block. It is done this way because motion field mvs are only needed
+ // at an 8x8 granularity.
+ const int row_start4x4 = block.row4x4 | 1;
+ const int row_limit4x4 =
+ std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4);
+ if (row_start4x4 >= row_limit4x4) return;
+ const int column_start4x4 = block.column4x4 | 1;
+ const int column_limit4x4 =
+ std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4);
+ if (column_start4x4 >= column_limit4x4) return;
+
+ // The largest reference MV component that can be saved.
+ constexpr int kRefMvsLimit = (1 << 12) - 1;
+ const BlockParameters& bp = *block.bp;
+ ReferenceInfo* reference_info = current_frame_.reference_info();
+ for (int i = 1; i >= 0; --i) {
+ const ReferenceFrameType reference_frame_to_store = bp.reference_frame[i];
+ if (reference_frame_to_store <= kReferenceFrameIntra) continue;
+ // Must make a local copy so that StoreMotionFieldMvs() knows there is no
+ // overlap between load and store.
+ const MotionVector mv_to_store = bp.mv.mv[i];
+ const int mv_row = std::abs(mv_to_store.mv[0]);
+ const int mv_column = std::abs(mv_to_store.mv[1]);
+ // kRefMvsLimit equals 0x0FFF, so we can first bitwise OR the two absolute
+ // values and then compare with kRefMvsLimit to save a branch.
+ // The next line is equivalent to:
+ // mv_row <= kRefMvsLimit && mv_column <= kRefMvsLimit
+ if ((mv_row | mv_column) <= kRefMvsLimit &&
+ reference_info->relative_distance_from[reference_frame_to_store] < 0) {
+ const int row_start8x8 = DivideBy2(row_start4x4);
+ const int row_limit8x8 = DivideBy2(row_limit4x4);
+ const int column_start8x8 = DivideBy2(column_start4x4);
+ const int column_limit8x8 = DivideBy2(column_limit4x4);
+ const int rows = row_limit8x8 - row_start8x8;
+ const int columns = column_limit8x8 - column_start8x8;
+ const ptrdiff_t stride = DivideBy2(current_frame_.columns4x4());
+ ReferenceFrameType* const reference_frame_row_start =
+ &reference_info
+ ->motion_field_reference_frame[row_start8x8][column_start8x8];
+ MotionVector* const mv =
+ &reference_info->motion_field_mv[row_start8x8][column_start8x8];
+
+ // Specialize the columns cases 1, 2, 4, 8 and 16. This allows memset() to
+ // be inlined and simplifies std::fill() for these cases.
+ if (columns <= 1) {
+ // Don't change the above condition to (columns == 1).
+ // Condition (columns <= 1) may help the compiler simplify the inlining
+ // of the general case of StoreMotionFieldMvs() by eliminating the
+ // (columns == 0) case.
+ assert(columns == 1);
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 1, reference_frame_row_start, mv);
+ } else if (columns == 2) {
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 2, reference_frame_row_start, mv);
+ } else if (columns == 4) {
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 4, reference_frame_row_start, mv);
+ } else if (columns == 8) {
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 8, reference_frame_row_start, mv);
+ } else if (columns == 16) {
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ 16, reference_frame_row_start, mv);
+ } else if (columns < 16) {
+ // This always-true condition (columns < 16) may help the compiler
+ // simplify the inlining of the following function.
+ // This general case is rare and usually only happens for blocks
+ // that contain the right boundary of the frame.
+ StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+ columns, reference_frame_row_start, mv);
+ } else {
+ assert(false);
+ }
+ return;
+ }
+ }
+}
+
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/tile_scratch_buffer.h"
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+#if !LIBGAV1_CXX17
+// static
+constexpr int TileScratchBuffer::kBlockDecodedStride;
+#endif
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
+#define LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
+#include <new>
+#include <utility>
+
+#include "src/dsp/constants.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/stack.h"
+
+namespace libgav1 {
+
+// Buffer to facilitate decoding a superblock.
+struct TileScratchBuffer : public MaxAlignedAllocable {
+ static constexpr int kBlockDecodedStride = 34;
+
+ LIBGAV1_MUST_USE_RESULT bool Init(int bitdepth) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ const int pixel_size = (bitdepth == 8) ? 1 : 2;
+#else
+ assert(bitdepth == 8);
+ static_cast<void>(bitdepth);
+ const int pixel_size = 1;
+#endif
+
+ static_assert(kConvolveScaleBorderRight >= kConvolveBorderRight, "");
+ constexpr int unaligned_convolve_buffer_stride =
+ kMaxScaledSuperBlockSizeInPixels + kConvolveBorderLeftTop +
+ kConvolveScaleBorderRight;
+ convolve_block_buffer_stride = Align<ptrdiff_t>(
+ unaligned_convolve_buffer_stride * pixel_size, kMaxAlignment);
+ constexpr int convolve_buffer_height = kMaxScaledSuperBlockSizeInPixels +
+ kConvolveBorderLeftTop +
+ kConvolveBorderBottom;
+
+ convolve_block_buffer = MakeAlignedUniquePtr<uint8_t>(
+ kMaxAlignment, convolve_buffer_height * convolve_block_buffer_stride);
+#if LIBGAV1_MSAN
+ // Quiet msan warnings in ConvolveScale2D_NEON(). Set to a random non-zero
+ // value to aid in future debugging.
+ memset(convolve_block_buffer.get(), 0x66,
+ convolve_buffer_height * convolve_block_buffer_stride);
+#endif
+
+ return convolve_block_buffer != nullptr;
+ }
+
+ // kCompoundPredictionTypeDiffWeighted prediction mode needs a mask of the
+ // prediction block size. This buffer is used to store that mask. The masks
+ // will be created for the Y plane and will be re-used for the U & V planes.
+ alignas(kMaxAlignment) uint8_t weight_mask[kMaxSuperBlockSizeSquareInPixels];
+
+ // For each instance of the TileScratchBuffer, only one of the following
+ // buffers will be used at any given time, so it is ok to share them in a
+ // union.
+ union {
+ // Buffers used for prediction process.
+ // Compound prediction calculations always output 16-bit values. Depending
+ // on the bitdepth the values may be treated as int16_t or uint16_t. See
+ // src/dsp/convolve.cc and src/dsp/warp.cc for explanations.
+ // Inter/intra calculations output Pixel values.
+ // These buffers always use width as the stride. This enables packing the
+ // values in and simplifies loads/stores for small values.
+
+ // 10/12 bit compound prediction and 10/12 bit inter/intra prediction.
+ alignas(kMaxAlignment) uint16_t
+ prediction_buffer[2][kMaxSuperBlockSizeSquareInPixels];
+ // 8 bit compound prediction buffer.
+ alignas(kMaxAlignment) int16_t
+ compound_prediction_buffer_8bpp[2][kMaxSuperBlockSizeSquareInPixels];
+
+ // Union usage note: This is used only by functions in the "intra"
+ // prediction path.
+ //
+ // Buffer used for storing subsampled luma samples needed for CFL
+ // prediction. This buffer is used to avoid repetition of the subsampling
+ // for the V plane when it is already done for the U plane.
+ int16_t cfl_luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
+ };
+
+ // Buffer used for convolve. The maximum size required for this buffer is:
+ // maximum block height (with scaling and border) = 2 * 128 + 3 + 4 = 263.
+ // maximum block stride (with scaling and border aligned to 16) =
+ // (2 * 128 + 3 + 8 + 5) * pixel_size = 272 * pixel_size.
+ // Where pixel_size is (bitdepth == 8) ? 1 : 2.
+ // Has an alignment of kMaxAlignment when allocated.
+ AlignedUniquePtr<uint8_t> convolve_block_buffer;
+ ptrdiff_t convolve_block_buffer_stride;
+
+ // Flag indicating whether the data in |cfl_luma_buffer| is valid.
+ bool cfl_luma_buffer_valid;
+
+ // Equivalent to BlockDecoded array in the spec. This stores the decoded
+ // state of every 4x4 block in a superblock. It has 1 row/column border on
+ // all 4 sides (hence the 34x34 dimension instead of 32x32). Note that the
+ // spec uses "-1" as an index to access the left and top borders. In the
+ // code, we treat the index (1, 1) as equivalent to the spec's (0, 0). So
+ // all accesses into this array will be offset by +1 when compared with the
+ // spec.
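+ // For example, the spec's BlockDecoded[ plane ][ -1 ][ -1 ] corresponds to
+ // block_decoded[plane][0][0] here (an illustrative note).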
+ bool block_decoded[kMaxPlanes][kBlockDecodedStride][kBlockDecodedStride];
+};
+
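+// A pool of TileScratchBuffer objects shared by the tile workers. An
+// illustrative (not original) round trip through the pool:
+//   TileScratchBufferPool pool;
+//   pool.Reset(bitdepth);
+//   std::unique_ptr<TileScratchBuffer> buffer = pool.Get();
+//   if (buffer != nullptr) {
+//     // ... decode a superblock using |buffer| ...
+//     pool.Release(std::move(buffer));
+//   }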
+class TileScratchBufferPool {
+ public:
+ void Reset(int bitdepth) {
+ if (bitdepth_ == bitdepth) return;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ == 8 && bitdepth != 8) {
+ // We are going from a pixel size of 1 to a pixel size of 2. So invalidate
+ // the stack.
+ std::lock_guard<std::mutex> lock(mutex_);
+ while (!buffers_.Empty()) {
+ buffers_.Pop();
+ }
+ }
+#endif
+ bitdepth_ = bitdepth;
+ }
+
+ std::unique_ptr<TileScratchBuffer> Get() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (buffers_.Empty()) {
+ std::unique_ptr<TileScratchBuffer> scratch_buffer(new (std::nothrow)
+ TileScratchBuffer);
+ if (scratch_buffer == nullptr || !scratch_buffer->Init(bitdepth_)) {
+ return nullptr;
+ }
+ return scratch_buffer;
+ }
+ return buffers_.Pop();
+ }
+
+ void Release(std::unique_ptr<TileScratchBuffer> scratch_buffer) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ buffers_.Push(std::move(scratch_buffer));
+ }
+
+ private:
+ std::mutex mutex_;
+ // We will never need more than kMaxThreads scratch buffers since that is the
+ // maximum amount of work that will be done at any given time.
+ Stack<std::unique_ptr<TileScratchBuffer>, kMaxThreads> buffers_
+ LIBGAV1_GUARDED_BY(mutex_);
+ int bitdepth_ = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_ARRAY_2D_H_
+#define LIBGAV1_SRC_UTILS_ARRAY_2D_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// Exposes a 1D allocated memory buffer as a 2D array.
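+//
+// An illustrative sketch (not from the original source):
+//   int16_t buffer[4 * 8];
+//   Array2DView<int16_t> view(/*rows=*/4, /*columns=*/8, buffer);
+//   view[2][3] = 9;  // Writes buffer[2 * 8 + 3].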
+template <typename T>
+class Array2DView {
+ public:
+ Array2DView() = default;
+ Array2DView(int rows, int columns, T* const data) {
+ Reset(rows, columns, data);
+ }
+
+ // Copyable and Movable.
+ Array2DView(const Array2DView& rhs) = default;
+ Array2DView& operator=(const Array2DView& rhs) = default;
+
+ void Reset(int rows, int columns, T* const data) {
+ rows_ = rows;
+ columns_ = columns;
+ data_ = data;
+ }
+
+ int rows() const { return rows_; }
+ int columns() const { return columns_; }
+
+ T* operator[](int row) { return const_cast<T*>(GetRow(row)); }
+
+ const T* operator[](int row) const { return GetRow(row); }
+
+ private:
+ const T* GetRow(int row) const {
+ assert(row < rows_);
+ const ptrdiff_t offset = static_cast<ptrdiff_t>(row) * columns_;
+ return data_ + offset;
+ }
+
+ int rows_ = 0;
+ int columns_ = 0;
+ T* data_ = nullptr;
+};
+
+// Allocates and owns the contiguous memory and exposes an Array2DView of
+// dimension |rows| x |columns|.
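+//
+// Typical usage (an illustrative sketch):
+//   Array2D<uint8_t> array;
+//   if (array.Reset(rows, columns, /*zero_initialize=*/true)) {
+//     array[row][column] = value;
+//   }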
+template <typename T>
+class Array2D {
+ public:
+ Array2D() = default;
+
+ // Copyable and Movable.
+ Array2D(const Array2D& rhs) = default;
+ Array2D& operator=(const Array2D& rhs) = default;
+
+ LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns,
+ bool zero_initialize = true) {
+ size_ = rows * columns;
+ // If T is not a trivial type, we should always reallocate the data_
+ // buffer, so that the destructors of any existing objects are invoked.
+ if (!std::is_trivial<T>::value || allocated_size_ < size_) {
+ // Note: This invokes the global operator new if T is a non-class type,
+ // such as integer or enum types, or a class type that is not derived
+ // from libgav1::Allocable, such as std::unique_ptr. If we enforce a
+ // maximum allocation size or keep track of our own heap memory
+ // consumption, we will need to handle the allocations here that use the
+ // global operator new.
+ if (zero_initialize) {
+ data_.reset(new (std::nothrow) T[size_]());
+ } else {
+ data_.reset(new (std::nothrow) T[size_]);
+ }
+ if (data_ == nullptr) {
+ allocated_size_ = 0;
+ return false;
+ }
+ allocated_size_ = size_;
+ } else if (zero_initialize) {
+ // Cast the data_ pointer to void* to avoid the GCC -Wclass-memaccess
+ // warning. The memset is safe because T is a trivial type.
+ void* dest = data_.get();
+ memset(dest, 0, sizeof(T) * size_);
+ }
+ data_view_.Reset(rows, columns, data_.get());
+ return true;
+ }
+
+ int rows() const { return data_view_.rows(); }
+ int columns() const { return data_view_.columns(); }
+ size_t size() const { return size_; }
+ T* data() { return data_.get(); }
+ const T* data() const { return data_.get(); }
+
+ T* operator[](int row) { return data_view_[row]; }
+
+ const T* operator[](int row) const { return data_view_[row]; }
+
+ private:
+ std::unique_ptr<T[]> data_;
+ size_t allocated_size_ = 0;
+ size_t size_ = 0;
+ Array2DView<T> data_view_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_ARRAY_2D_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/array_2d.h"
+
+#include <cstdint>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+#include "gtest/gtest.h"
+#include "src/utils/compiler_attributes.h"
+
+#if LIBGAV1_MSAN
+#include <sanitizer/msan_interface.h>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr int kRows = 50;
+constexpr int kColumns = 200;
+
+TEST(Array2dViewTest, TestUint8) {
+ uint8_t data[kRows * kColumns] = {};
+ Array2DView<uint8_t> data2d(kRows, kColumns, data);
+
+ // Verify data.
+ data[kColumns] = 100;
+ data[kColumns + 1] = 101;
+ data[kColumns * 2 + 10] = 210;
+ data[kColumns * 2 + 40] = 240;
+ EXPECT_EQ(data2d[1][0], 100);
+ EXPECT_EQ(data2d[1][1], 101);
+ EXPECT_EQ(data2d[2][10], 210);
+ EXPECT_EQ(data2d[2][40], 240);
+
+ // Verify pointers.
+ EXPECT_EQ(data2d[10], data + 10 * kColumns);
+}
+
+TEST(Array2dViewTest, TestUint16) {
+ uint16_t data[kRows * kColumns] = {};
+ Array2DView<uint16_t> data2d(kRows, kColumns, data);
+
+ // Verify data.
+ data[kColumns] = 100;
+ data[kColumns + 1] = 101;
+ data[kColumns * 2 + 10] = 210;
+ data[kColumns * 2 + 40] = 240;
+ EXPECT_EQ(data2d[1][0], 100);
+ EXPECT_EQ(data2d[1][1], 101);
+ EXPECT_EQ(data2d[2][10], 210);
+ EXPECT_EQ(data2d[2][40], 240);
+
+ // Verify pointers.
+ EXPECT_EQ(data2d[10], data + 10 * kColumns);
+}
+
+TEST(Array2dViewTest, TestUint8Const) {
+ uint8_t data[kRows * kColumns] = {};
+ // Declared as const to provide a read-only view of |data|.
+ const Array2DView<uint8_t> data2d(kRows, kColumns, data);
+
+ // Verify data.
+ data[kColumns] = 100;
+ data[kColumns + 1] = 101;
+ data[kColumns * 2 + 10] = 210;
+ data[kColumns * 2 + 40] = 240;
+ EXPECT_EQ(data2d[1][0], 100);
+ EXPECT_EQ(data2d[1][1], 101);
+ EXPECT_EQ(data2d[2][10], 210);
+ EXPECT_EQ(data2d[2][40], 240);
+
+ // Verify pointers.
+ EXPECT_EQ(data2d[10], data + 10 * kColumns);
+}
+
+TEST(Array2dTest, TestUint8) {
+ Array2D<uint8_t> data2d;
+ ASSERT_TRUE(data2d.Reset(kRows, kColumns, true));
+
+ EXPECT_EQ(data2d.rows(), kRows);
+ EXPECT_EQ(data2d.columns(), kColumns);
+
+ // Verify pointers.
+ for (int i = 0; i < kRows; ++i) {
+ EXPECT_NE(data2d[i], nullptr);
+ }
+
+ // Verify data (must be zero initialized).
+ for (int i = 0; i < kRows; ++i) {
+ for (int j = 0; j < kColumns; ++j) {
+ EXPECT_EQ(data2d[i][j], 0) << "Mismatch in [" << i << "][" << j << "]";
+ }
+ }
+
+ // Reset to a 2d array of smaller size with zero_initialize == false.
+ data2d[0][0] = 10;
+ ASSERT_TRUE(data2d.Reset(kRows - 1, kColumns - 1, false));
+
+ EXPECT_EQ(data2d.rows(), kRows - 1);
+ EXPECT_EQ(data2d.columns(), kColumns - 1);
+
+ // Verify pointers.
+ for (int i = 0; i < kRows - 1; ++i) {
+ EXPECT_NE(data2d[i], nullptr);
+ }
+
+ // Verify data (must be zero except for 0,0 because it was zero initialized in
+ // the previous call to Reset).
+ for (int i = 0; i < kRows - 1; ++i) {
+ for (int j = 0; j < kColumns - 1; ++j) {
+ if (i == 0 && j == 0) {
+ EXPECT_EQ(data2d[i][j], 10) << "Mismatch in [" << i << "][" << j << "]";
+ } else {
+ EXPECT_EQ(data2d[i][j], 0) << "Mismatch in [" << i << "][" << j << "]";
+ }
+ }
+ }
+
+ // Reset to a 2d array of smaller size with zero_initialize == true.
+ ASSERT_TRUE(data2d.Reset(kRows - 2, kColumns - 2, true));
+
+ EXPECT_EQ(data2d.rows(), kRows - 2);
+ EXPECT_EQ(data2d.columns(), kColumns - 2);
+
+ // Verify pointers.
+ for (int i = 0; i < kRows - 2; ++i) {
+ EXPECT_NE(data2d[i], nullptr);
+ }
+
+ // Verify data (must be zero initialized).
+ for (int i = 0; i < kRows - 2; ++i) {
+ for (int j = 0; j < kColumns - 2; ++j) {
+ EXPECT_EQ(data2d[i][j], 0) << "Mismatch in [" << i << "][" << j << "]";
+ }
+ }
+}
+
+TEST(Array2dTest, TestUniquePtr1) {
+ // A simple class that sets an int value to 0 in the destructor.
+ class Cleaner {
+ public:
+ explicit Cleaner(int* value) : value_(value) {}
+ ~Cleaner() { *value_ = 0; }
+
+ private:
+ int* value_;
+ };
+ int value = 100;
+ Array2D<std::unique_ptr<Cleaner>> data2d;
+ ASSERT_TRUE(data2d.Reset(4, 4, true));
+ data2d[0][0].reset(new (std::nothrow) Cleaner(&value));
+ EXPECT_EQ(value, 100);
+ // Reset to a smaller size. Depending on the implementation, the data_ buffer
+ // may or may not be reused.
+ ASSERT_TRUE(data2d.Reset(2, 2, true));
+ // Reset to a much larger size. The data_ buffer will be reallocated.
+ ASSERT_TRUE(data2d.Reset(32, 32, true));
+ // The destructors of all elements in the former data_ buffer should have
+ // been invoked.
+ EXPECT_EQ(value, 0);
+}
+
+TEST(Array2dTest, TestUniquePtr2) {
+ // A simple class that sets an int value to 0 in the destructor.
+ class Cleaner {
+ public:
+ explicit Cleaner(int* value) : value_(value) {}
+ ~Cleaner() { *value_ = 0; }
+
+ private:
+ int* value_;
+ };
+ int value1 = 100;
+ int value2 = 200;
+ Array2D<std::unique_ptr<Cleaner>> data2d;
+ ASSERT_TRUE(data2d.Reset(4, 4, false));
+ data2d[0][0].reset(new (std::nothrow) Cleaner(&value1));
+ data2d[3][3].reset(new (std::nothrow) Cleaner(&value2));
+ EXPECT_EQ(value1, 100);
+ EXPECT_EQ(value2, 200);
+ // Reset to a smaller size. Whether or not the data_ buffer is reused, the
+ // destructors of all existing elements should be invoked.
+ ASSERT_TRUE(data2d.Reset(2, 2, false));
+ EXPECT_EQ(value1, 0);
+ EXPECT_EQ(value2, 0);
+}
+
+// Shows that std::is_standard_layout is not relevant to the default
+// initialization vs. value initialization issue, but std::is_trivial is.
+TEST(Array2dTest, TestStructInit) {
+ // Make one data member private so that this struct does not have a standard
+ // layout. This also makes the struct not a POD type.
+ struct Point {
+ int x;
+ int Y() const { return y; }
+
+ private:
+ int y;
+ };
+
+ EXPECT_TRUE(std::is_trivial<Point>::value);
+ EXPECT_FALSE(std::is_standard_layout<Point>::value);
+
+ // The Point structs in this array are default initialized.
+ Array2D<Point> data2d_default_init;
+ ASSERT_TRUE(data2d_default_init.Reset(kRows, kColumns, false));
+ // The Point structs in this array are value initialized (i.e., zero
+ // initialized).
+ Array2D<Point> data2d;
+ ASSERT_TRUE(data2d.Reset(kRows, kColumns, true));
+
+#if LIBGAV1_MSAN
+ // Use MemorySanitizer to check Reset(rows, columns, false) does not
+ // initialize the memory while Reset(rows, columns, true) does.
+ //
+ // __msan_test_shadow(const void *x, uptr size) returns the offset of the
+ // first (at least partially) poisoned byte in the range, or -1 if the whole
+ // range is good.
+ for (int i = 0; i < kRows; ++i) {
+ EXPECT_EQ(__msan_test_shadow(data2d_default_init[i],
+ sizeof(data2d_default_init[0][0]) * kColumns),
+ 0);
+ EXPECT_EQ(__msan_test_shadow(data2d[i], sizeof(data2d[0][0]) * kColumns),
+ -1);
+ for (int j = 0; j < kColumns; ++j) {
+ EXPECT_EQ(data2d[i][j].x, 0);
+ EXPECT_EQ(data2d[i][j].Y(), 0);
+ }
+ }
+#endif
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
+#define LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+// This class is used to check if a given value is equal to one of the several
+// predetermined values using a bit mask instead of a chain of comparisons and
+// ||s. This usually results in fewer instructions.
+//
+// Usage:
+// constexpr BitMaskSet set(value1, value2);
+// set.Contains(value1) => returns true.
+// set.Contains(value3) => returns false.
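+//
+// As an illustrative note on the mask arithmetic: BitMaskSet(1, 3) builds the
+// mask 0b1010, and Contains(3) tests bit 3 of that mask, so the membership
+// check reduces to a shift and an AND.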
+class BitMaskSet {
+ public:
+ explicit constexpr BitMaskSet(uint32_t mask) : mask_(mask) {}
+
+ constexpr BitMaskSet(int v1, int v2) : mask_((1U << v1) | (1U << v2)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+ (1U << v6)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+ (1U << v6) | (1U << v7)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
+ int v8, int v9)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+ (1U << v6) | (1U << v7) | (1U << v8) | (1U << v9)) {}
+
+ constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
+ int v8, int v9, int v10)
+ : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) |
+ (1U << v6) | (1U << v7) | (1U << v8) | (1U << v9) | (1U << v10)) {
+ }
+
+ constexpr bool Contains(uint8_t value) const {
+ return MaskContainsValue(mask_, value);
+ }
+
+ static constexpr bool MaskContainsValue(uint32_t mask, uint8_t value) {
+ return ((mask >> value) & 1) != 0;
+ }
+
+ private:
+ const uint32_t mask_;
+};
+
+} // namespace libgav1
+#endif // LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/bit_reader.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace {
+
+bool Assign(int* const value, int assignment, bool return_value) {
+ *value = assignment;
+ return return_value;
+}
+
+// 5.9.29.
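+// As an illustrative note, with r = 3 the codes v = 0, 1, 2, 3, 4, 5 decode
+// to 3, 2, 4, 1, 5, 0: the mapping alternates around the reference so that
+// smaller codes are assigned to values closer to |r|.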
+int InverseRecenter(int r, int v) {
+ if (v > (r << 1)) {
+ return v;
+ }
+ if ((v & 1) != 0) {
+ return r - ((v + 1) >> 1);
+ }
+ return r + (v >> 1);
+}
+
+} // namespace
+
+bool BitReader::DecodeSignedSubexpWithReference(int low, int high,
+ int reference, int control,
+ int* const value) {
+ if (!DecodeUnsignedSubexpWithReference(high - low, reference - low, control,
+ value)) {
+ return false;
+ }
+ *value += low;
+ return true;
+}
+
+bool BitReader::DecodeUniform(int n, int* const value) {
+ if (n <= 1) {
+ return Assign(value, 0, true);
+ }
+ const int w = FloorLog2(n) + 1;
+ const int m = (1 << w) - n;
+ assert(w - 1 < 32);
+ const int v = static_cast<int>(ReadLiteral(w - 1));
+ if (v == -1) {
+ return Assign(value, 0, false);
+ }
+ if (v < m) {
+ return Assign(value, v, true);
+ }
+ const int extra_bit = ReadBit();
+ if (extra_bit == -1) {
+ return Assign(value, 0, false);
+ }
+ return Assign(value, (v << 1) - m + extra_bit, true);
+}
+
+bool BitReader::DecodeUnsignedSubexpWithReference(int mx, int reference,
+ int control,
+ int* const value) {
+ int v;
+ if (!DecodeSubexp(mx, control, &v)) return false;
+ if ((reference << 1) <= mx) {
+ *value = InverseRecenter(reference, v);
+ } else {
+ *value = mx - 1 - InverseRecenter(mx - 1 - reference, v);
+ }
+ return true;
+}
+
+bool BitReader::DecodeSubexp(int num_symbols, int control, int* const value) {
+ int i = 0;
+ int mk = 0;
+ while (true) {
+ const int b = (i != 0) ? control + i - 1 : control;
+ if (b >= 32) {
+ return Assign(value, 0, false);
+ }
+ const int a = 1 << b;
+ if (num_symbols <= mk + 3 * a) {
+ if (!DecodeUniform(num_symbols - mk, value)) return false;
+ *value += mk;
+ return true;
+ }
+ const int8_t subexp_more_bits = ReadBit();
+ if (subexp_more_bits == -1) return false;
+ if (subexp_more_bits != 0) {
+ ++i;
+ mk += a;
+ } else {
+ const int subexp_bits = static_cast<int>(ReadLiteral(b));
+ if (subexp_bits == -1) {
+ return Assign(value, 0, false);
+ }
+ return Assign(value, subexp_bits + mk, true);
+ }
+ }
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BIT_READER_H_
+#define LIBGAV1_SRC_UTILS_BIT_READER_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+class BitReader {
+ public:
+ virtual ~BitReader() = default;
+
+ virtual int ReadBit() = 0;
+ // |num_bits| has to be <= 32. The function returns a value in the range [0,
+ // 2^num_bits - 1] (inclusive) on success and -1 on failure.
+ virtual int64_t ReadLiteral(int num_bits) = 0;
+
+ bool DecodeSignedSubexpWithReference(int low, int high, int reference,
+ int control, int* value); // 5.9.26.
+ // Decodes a nonnegative integer with maximum number of values |n| (i.e.,
+ // output in range 0..n-1) by following the process specified in Section
+ // 4.10.7 ns(n) and Section 4.10.10 NS(n) of the spec.
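+ // As a worked example (derived from the implementation): for n = 5, w = 3
+ // and m = (1 << 3) - 5 = 3, so the values 0..2 are coded with 2 bits and
+ // the values 3..4 with 3 bits.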
+ bool DecodeUniform(int n, int* value);
+
+ private:
+ // Helper functions for DecodeSignedSubexpWithReference.
+ bool DecodeUnsignedSubexpWithReference(int mx, int reference, int control,
+ int* value); // 5.9.27.
+ bool DecodeSubexp(int num_symbols, int control, int* value); // 5.9.28.
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_BIT_READER_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/block_parameters_holder.h"
+
+#include <algorithm>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+bool BlockParametersHolder::Reset(int rows4x4, int columns4x4) {
+ rows4x4_ = rows4x4;
+ columns4x4_ = columns4x4;
+ index_ = 0;
+ return block_parameters_cache_.Reset(rows4x4_, columns4x4_) &&
+ block_parameters_.Resize(rows4x4_ * columns4x4_);
+}
+
+BlockParameters* BlockParametersHolder::Get(int row4x4, int column4x4,
+ BlockSize block_size) {
+ const size_t index = index_.fetch_add(1, std::memory_order_relaxed);
+ if (index >= block_parameters_.size()) return nullptr;
+ auto& bp = block_parameters_.get()[index];
+ if (bp == nullptr) {
+ bp.reset(new (std::nothrow) BlockParameters);
+ if (bp == nullptr) return nullptr;
+ }
+ FillCache(row4x4, column4x4, block_size, bp.get());
+ return bp.get();
+}
+
+void BlockParametersHolder::FillCache(int row4x4, int column4x4,
+ BlockSize block_size,
+ BlockParameters* const bp) {
+ int rows = std::min(static_cast<int>(kNum4x4BlocksHigh[block_size]),
+ rows4x4_ - row4x4);
+ const int columns = std::min(static_cast<int>(kNum4x4BlocksWide[block_size]),
+ columns4x4_ - column4x4);
+ auto* bp_dst = &block_parameters_cache_[row4x4][column4x4];
+ // Specialize columns cases (values in kNum4x4BlocksWide[]) for better
+ // performance.
+ if (columns == 1) {
+ SetBlock<BlockParameters*>(rows, 1, bp, bp_dst, columns4x4_);
+ } else if (columns == 2) {
+ SetBlock<BlockParameters*>(rows, 2, bp, bp_dst, columns4x4_);
+ } else if (columns == 4) {
+ SetBlock<BlockParameters*>(rows, 4, bp, bp_dst, columns4x4_);
+ } else if (columns == 8) {
+ SetBlock<BlockParameters*>(rows, 8, bp, bp_dst, columns4x4_);
+ } else if (columns == 16) {
+ SetBlock<BlockParameters*>(rows, 16, bp, bp_dst, columns4x4_);
+ } else if (columns == 32) {
+ SetBlock<BlockParameters*>(rows, 32, bp, bp_dst, columns4x4_);
+ } else {
+ do {
+ // The following loop has better performance than using std::fill().
+ // std::fill() has some overhead in checking zero loop count.
+ int x = columns;
+ auto* d = bp_dst;
+ do {
+ *d++ = bp;
+ } while (--x != 0);
+ bp_dst += columns4x4_;
+ } while (--rows != 0);
+ }
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
+#define LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
+
+#include <atomic>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Holds the BlockParameters pointers to each 4x4 block in the frame.
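+//
+// Illustrative usage (a sketch; see the unit test for complete patterns):
+//   BlockParametersHolder holder;
+//   if (holder.Reset(rows4x4, columns4x4)) {
+//     BlockParameters* bp = holder.Get(row4x4, column4x4, kBlock32x32);
+//     // holder.Find(row4x4, column4x4) now returns |bp|.
+//   }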
+class BlockParametersHolder {
+ public:
+ BlockParametersHolder() = default;
+
+ // Not copyable or movable.
+ BlockParametersHolder(const BlockParametersHolder&) = delete;
+ BlockParametersHolder& operator=(const BlockParametersHolder&) = delete;
+
+ LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4);
+
+ // Returns a pointer to a BlockParameters object that can be used safely until
+ // the next call to Reset(). Returns nullptr on memory allocation failure. It
+ // also fills the cache matrix for the block starting at |row4x4|, |column4x4|
+ // of size |block_size| with the returned pointer.
+ BlockParameters* Get(int row4x4, int column4x4, BlockSize block_size);
+
+ // Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This
+ // is done as a simple look up of the |block_parameters_cache_| matrix.
+ // Returns nullptr if the BlockParameters cannot be found.
+ BlockParameters* Find(int row4x4, int column4x4) const {
+ return block_parameters_cache_[row4x4][column4x4];
+ }
+
+ BlockParameters** Address(int row4x4, int column4x4) {
+ return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4;
+ }
+
+ BlockParameters* const* Address(int row4x4, int column4x4) const {
+ return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4;
+ }
+
+ int columns4x4() const { return columns4x4_; }
+
+ private:
+ // Needs access to FillCache for testing Cdef.
+ template <int bitdepth, typename Pixel>
+ friend class PostFilterApplyCdefTest;
+
+ void FillCache(int row4x4, int column4x4, BlockSize block_size,
+ BlockParameters* bp);
+
+ int rows4x4_ = 0;
+ int columns4x4_ = 0;
+
+ // Owns the memory of BlockParameters pointers for the entire frame. It can
+ // hold up to |rows4x4_| * |columns4x4_| objects. Each object will be
+ // allocated on demand and re-used across frames.
+ DynamicBuffer<std::unique_ptr<BlockParameters>> block_parameters_;
+
+ // Points to the next available index of |block_parameters_|.
+ std::atomic<int> index_;
+
+ // This is a 2d array of size |rows4x4_| * |columns4x4_|. It is filled in by
+ // FillCache() and used by Find() so that each query takes exactly one lookup
+ // (instead of traversing the entire partition tree).
+ Array2D<BlockParameters*> block_parameters_cache_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/block_parameters_holder.h"
+
+#include "gtest/gtest.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(BlockParametersHolder, TestBasic) {
+ BlockParametersHolder holder;
+ ASSERT_TRUE(holder.Reset(20, 20));
+
+ // Get a BlockParameters object.
+ BlockParameters* const bp1 = holder.Get(10, 10, kBlock32x32);
+ ASSERT_NE(bp1, nullptr);
+ // Ensure that the cache was filled appropriately. Entries (10, 10) through
+ // (17, 17) should all be bp1 (a 32x32 block spans eight 4x4 units, so the
+ // exclusive end is 10 + 8 = 18).
+ for (int i = 10; i < 18; ++i) {
+ for (int j = 10; j < 18; ++j) {
+ EXPECT_EQ(holder.Find(i, j), bp1)
+ << "Mismatch in (" << i << ", " << j << ")";
+ }
+ }
+
+ // Get the remaining 399 (20 * 20 - 1) BlockParameters objects.
+ for (int i = 0; i < 399; ++i) {
+ EXPECT_NE(holder.Get(10, 10, kBlock32x32), nullptr)
+ << "Mismatch in index " << i;
+ }
+
+ // Get() should now return nullptr since there are no more BlockParameters
+ // objects available.
+ EXPECT_EQ(holder.Get(10, 10, kBlock32x32), nullptr);
+
+ // Reset the holder to the same size.
+ ASSERT_TRUE(holder.Reset(20, 20));
+
+ // Get a BlockParameters object. This should be the same as bp1 since the
+ // holder was Reset to the same size.
+ BlockParameters* const bp2 = holder.Get(10, 10, kBlock32x32);
+ EXPECT_EQ(bp2, bp1);
+
+ // Reset the holder to a smaller size.
+ ASSERT_TRUE(holder.Reset(20, 10));
+
+ // Get a BlockParameters object. This should be the same as bp1 since the
+ // holder was Reset to a smaller size.
+ BlockParameters* const bp3 = holder.Get(0, 0, kBlock32x32);
+ EXPECT_EQ(bp3, bp1);
+
+ // Reset the holder to a larger size.
+ ASSERT_TRUE(holder.Reset(30, 30));
+
+ // Get a BlockParameters object. This may or may not be the same as bp1 since
+ // the holder was Reset to a larger size.
+ BlockParameters* const bp4 = holder.Get(0, 0, kBlock32x32);
+ EXPECT_NE(bp4, nullptr);
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
+#define LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
+
+#include <cassert>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <mutex> // NOLINT (unapproved c++11 header)
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// Implementation of a Blocking Counter that is used for the "fork-join"
+// use case. Typical usage would be as follows:
+// BlockingCounter counter(num_jobs);
+// - spawn the jobs.
+// - call counter.Wait() on the master thread.
+// - worker threads will call counter.Decrement().
+// - master thread will return from counter.Wait() when all workers are
+// complete.
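+//
+// A minimal sketch of the pattern above (assuming a ThreadPool like the one
+// used in the tests; DoJob() is a hypothetical work item):
+//   BlockingCounter counter(num_jobs);
+//   for (int i = 0; i < num_jobs; ++i) {
+//     pool->Schedule([&counter]() {
+//       DoJob();
+//       counter.Decrement();
+//     });
+//   }
+//   counter.Wait();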
+template <bool has_failure_status>
+class BlockingCounterImpl {
+ public:
+ explicit BlockingCounterImpl(int initial_count)
+ : count_(initial_count), job_failed_(false) {}
+
+ // Increment the counter by |count|. This must be called before Wait() is
+ // called. This must be called from the same thread that will call Wait().
+ void IncrementBy(int count) {
+ assert(count >= 0);
+ std::unique_lock<std::mutex> lock(mutex_);
+ count_ += count;
+ }
+
+ // Decrement the counter by 1. This function can be called only when
+ // |has_failure_status| is false (i.e., when this class is being used with
+ // the |BlockingCounter| alias).
+ void Decrement() {
+ static_assert(!has_failure_status, "");
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (--count_ == 0) {
+ condition_.notify_one();
+ }
+ }
+
+ // Decrement the counter by 1. This function can be called only when
+ // |has_failure_status| is true (i.e., when this class is being used with the
+ // |BlockingCounterWithStatus| alias). |job_succeeded| is used to update the
+ // state of |job_failed_|.
+ void Decrement(bool job_succeeded) {
+ static_assert(has_failure_status, "");
+ std::unique_lock<std::mutex> lock(mutex_);
+ job_failed_ |= !job_succeeded;
+ if (--count_ == 0) {
+ condition_.notify_one();
+ }
+ }
+
+ // Block until the counter becomes 0. This function can be called only once
+ // per object. If |has_failure_status| is true, true is returned if all the
+ // jobs succeeded and false is returned if any of the jobs failed. If
+ // |has_failure_status| is false, this function always returns true.
+ bool Wait() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ condition_.wait(lock, [this]() { return count_ == 0; });
+ // If |has_failure_status| is false, we simply return true.
+ return has_failure_status ? !job_failed_ : true;
+ }
+
+ private:
+ std::mutex mutex_;
+ std::condition_variable condition_;
+ int count_ LIBGAV1_GUARDED_BY(mutex_);
+ bool job_failed_ LIBGAV1_GUARDED_BY(mutex_);
+};
+
+using BlockingCounterWithStatus = BlockingCounterImpl<true>;
+using BlockingCounter = BlockingCounterImpl<false>;
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/blocking_counter.h"
+
+#include <array>
+#include <memory>
+
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/utils/threadpool.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kNumWorkers = 10;
+constexpr int kNumJobs = 20;
+
+TEST(BlockingCounterTest, BasicFunctionality) {
+ std::unique_ptr<ThreadPool> pool = ThreadPool::Create(kNumWorkers);
+ BlockingCounter counter(kNumJobs);
+ std::array<bool, kNumJobs> done = {};
+
+ // Schedule the jobs.
+ for (int i = 0; i < kNumJobs; ++i) {
+ pool->Schedule([&counter, &done, i]() {
+ absl::SleepFor(absl::Seconds(1));
+ done[i] = true;
+ counter.Decrement();
+ });
+ }
+
+ // Wait for the jobs to complete. This should always return true.
+ ASSERT_TRUE(counter.Wait());
+
+ // Make sure the jobs were actually complete.
+ for (const auto& job_done : done) {
+ EXPECT_TRUE(job_done);
+ }
+}
+
+TEST(BlockingCounterTest, IncrementBy) {
+ std::unique_ptr<ThreadPool> pool = ThreadPool::Create(kNumWorkers);
+ BlockingCounter counter(0);
+ std::array<bool, kNumJobs> done = {};
+
+ // Schedule the jobs.
+ for (int i = 0; i < kNumJobs; ++i) {
+ counter.IncrementBy(1);
+ pool->Schedule([&counter, &done, i]() {
+ absl::SleepFor(absl::Seconds(1));
+ done[i] = true;
+ counter.Decrement();
+ });
+ }
+
+ // Wait for the jobs to complete. This should always return true.
+ ASSERT_TRUE(counter.Wait());
+
+ // Make sure the jobs were actually complete.
+ for (const auto& job_done : done) {
+ EXPECT_TRUE(job_done);
+ }
+}
+
+TEST(BlockingCounterWithStatusTest, BasicFunctionality) {
+ std::unique_ptr<ThreadPool> pool = ThreadPool::Create(kNumWorkers);
+ BlockingCounterWithStatus counter(kNumJobs);
+ std::array<bool, kNumJobs> done = {};
+
+ // Schedule the jobs.
+ for (int i = 0; i < kNumJobs; ++i) {
+ pool->Schedule([&counter, &done, i]() {
+ absl::SleepFor(absl::Seconds(1));
+ done[i] = true;
+ counter.Decrement(true);
+ });
+ }
+
+ // Wait for the jobs to complete. This should return true since all the jobs
+ // reported |job_succeeded| as true.
+ ASSERT_TRUE(counter.Wait());
+
+ // Make sure the jobs were actually complete.
+ for (const auto& job_done : done) {
+ EXPECT_TRUE(job_done);
+ }
+}
+
+TEST(BlockingCounterWithStatusTest, BasicFunctionalityWithStatus) {
+ std::unique_ptr<ThreadPool> pool = ThreadPool::Create(kNumWorkers);
+ BlockingCounterWithStatus counter(kNumJobs);
+ std::array<bool, kNumJobs> done = {};
+
+ // Schedule the jobs.
+ for (int i = 0; i < kNumJobs; ++i) {
+ pool->Schedule([&counter, &done, i]() {
+ absl::SleepFor(absl::Seconds(1));
+ done[i] = true;
+ counter.Decrement(i != 10);
+ });
+ }
+
+ // Wait for the jobs to complete. This should return false since one of the
+ // jobs reported |job_succeeded| as false.
+ ASSERT_FALSE(counter.Wait());
+
+ // Make sure the jobs were actually complete.
+ for (const auto& job_done : done) {
+ EXPECT_TRUE(job_done);
+ }
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_COMMON_H_
+#define LIBGAV1_SRC_UTILS_COMMON_H_
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#pragma intrinsic(_BitScanForward)
+#pragma intrinsic(_BitScanReverse)
+#if defined(_M_X64) || defined(_M_ARM64)
+#pragma intrinsic(_BitScanReverse64)
+#define HAVE_BITSCANREVERSE64
+#endif // defined(_M_X64) || defined(_M_ARM64)
+#endif // defined(_MSC_VER)
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <type_traits>
+
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// LIBGAV1_RESTRICT
+// Declares a pointer with the restrict type qualifier if available.
+// This allows code to hint to the compiler that only this pointer references a
+// particular object or memory region within the scope of the block in which it
+// is declared. This may allow for improved optimizations due to the lack of
+// pointer aliasing. See also:
+// https://en.cppreference.com/w/c/language/restrict
+// Note a template alias is not used for compatibility with older compilers
+// (e.g., gcc < 10) that do not expand the type when instantiating a template
+// function, either explicitly or in an assignment to a function pointer as is
+// done within the dsp code. RestrictPtr<T>::type is an alternative to this,
+// similar to std::add_const, but for conciseness the macro is preferred.
+#ifdef __GNUC__
+#define LIBGAV1_RESTRICT __restrict__
+#elif defined(_MSC_VER)
+#define LIBGAV1_RESTRICT __restrict
+#else
+#define LIBGAV1_RESTRICT
+#endif
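+
+// For example (an illustrative declaration, not from the original source), a
+// dsp-style function can promise the compiler that its pointers do not alias:
+//   void Copy(uint8_t* LIBGAV1_RESTRICT dst,
+//             const uint8_t* LIBGAV1_RESTRICT src, int width);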
+
+// Aligns |value| to the desired |alignment|. |alignment| must be a power of 2.
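+// For example, Align(13, 8) == 16 and Align(16, 8) == 16.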
+template <typename T>
+inline T Align(T value, T alignment) {
+ assert(alignment != 0);
+ const T alignment_mask = alignment - 1;
+ return (value + alignment_mask) & ~alignment_mask;
+}
+
+// Aligns |addr| to the desired |alignment|. |alignment| must be a power of 2.
+inline uint8_t* AlignAddr(uint8_t* const addr, const uintptr_t alignment) {
+ const auto value = reinterpret_cast<uintptr_t>(addr);
+ return reinterpret_cast<uint8_t*>(Align(value, alignment));
+}
+
+inline int32_t Clip3(int32_t value, int32_t low, int32_t high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
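+// Replicates a line's edge pixels into its left and right borders. As an
+// illustrative example: with width = 4, left = 2, right = 1 and a line
+// holding "abcd", the memory starting at |line_start| - 2 becomes "aaabcdd".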
+template <typename Pixel>
+void ExtendLine(void* const line_start, const int width, const int left,
+ const int right) {
+ auto* const start = static_cast<Pixel*>(line_start);
+ const Pixel* src = start;
+ Pixel* dst = start - left;
+ // Copy to left and right borders.
+ Memset(dst, src[0], left);
+ Memset(dst + left + width, src[width - 1], right);
+}
+
+// The following 2 templates set a block of data, whose rows may not be
+// contiguous in memory, to |value|. The compilers usually generate several
+// branches to handle different
+// cases of |columns| when inlining memset() and std::fill(), and these branches
+// are unfortunately within the loop of |rows|. So calling these templates
+// directly could be inefficient. It is recommended to specialize common cases
+// of |columns|, such as 1, 2, 4, 8, 16 and 32, etc. in advance before
+// processing the generic case of |columns|. The code size may be larger, but
+// there would be big speed gains.
+// Call template MemSetBlock<> when sizeof(|T|) is 1.
+// Call template SetBlock<> when sizeof(|T|) is larger than 1.
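+// For examples of this specialization pattern in this codebase, see
+// BlockParametersHolder::FillCache() and
+// Tile::StoreMotionFieldMvsIntoCurrentFrame(), which dispatch on common
+// |columns| values before falling back to the generic loop.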
+template <typename T>
+void MemSetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+ static_assert(sizeof(T) == 1, "");
+ do {
+ memset(dst, value, columns);
+ dst += stride;
+ } while (--rows != 0);
+}
+
+template <typename T>
+void SetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+ do {
+ std::fill(dst, dst + columns, value);
+ dst += stride;
+ } while (--rows != 0);
+}
+
+#if defined(__GNUC__)
+
+inline int CountLeadingZeros(uint32_t n) {
+ assert(n != 0);
+ return __builtin_clz(n);
+}
+
+inline int CountLeadingZeros(uint64_t n) {
+ assert(n != 0);
+ return __builtin_clzll(n);
+}
+
+inline int CountTrailingZeros(uint32_t n) {
+ assert(n != 0);
+ return __builtin_ctz(n);
+}
+
+#elif defined(_MSC_VER)
+
+inline int CountLeadingZeros(uint32_t n) {
+ assert(n != 0);
+ unsigned long first_set_bit; // NOLINT(runtime/int)
+ const unsigned char bit_set = _BitScanReverse(&first_set_bit, n);
+ assert(bit_set != 0);
+ static_cast<void>(bit_set);
+ return 31 ^ static_cast<int>(first_set_bit);
+}
+
+inline int CountLeadingZeros(uint64_t n) {
+ assert(n != 0);
+ unsigned long first_set_bit; // NOLINT(runtime/int)
+#if defined(HAVE_BITSCANREVERSE64)
+ const unsigned char bit_set =
+ _BitScanReverse64(&first_set_bit, static_cast<unsigned __int64>(n));
+#else // !defined(HAVE_BITSCANREVERSE64)
+ const auto n_hi = static_cast<unsigned long>(n >> 32); // NOLINT(runtime/int)
+ if (n_hi != 0) {
+ const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi);
+ assert(bit_set != 0);
+ static_cast<void>(bit_set);
+ return 31 ^ static_cast<int>(first_set_bit);
+ }
+ const unsigned char bit_set = _BitScanReverse(
+ &first_set_bit, static_cast<unsigned long>(n)); // NOLINT(runtime/int)
+#endif // defined(HAVE_BITSCANREVERSE64)
+ assert(bit_set != 0);
+ static_cast<void>(bit_set);
+ return 63 ^ static_cast<int>(first_set_bit);
+}
+
+#undef HAVE_BITSCANREVERSE64
+
+inline int CountTrailingZeros(uint32_t n) {
+ assert(n != 0);
+ unsigned long first_set_bit; // NOLINT(runtime/int)
+ const unsigned char bit_set = _BitScanForward(&first_set_bit, n);
+ assert(bit_set != 0);
+ static_cast<void>(bit_set);
+ return static_cast<int>(first_set_bit);
+}
+
+#else // !defined(__GNUC__) && !defined(_MSC_VER)
+
+template <const int kMSB, typename T>
+inline int CountLeadingZeros(T n) {
+ assert(n != 0);
+ const T msb = T{1} << kMSB;
+ int count = 0;
+ while ((n & msb) == 0) {
+ ++count;
+ n <<= 1;
+ }
+ return count;
+}
+
+inline int CountLeadingZeros(uint32_t n) { return CountLeadingZeros<31>(n); }
+
+inline int CountLeadingZeros(uint64_t n) { return CountLeadingZeros<63>(n); }
+
+// This is the algorithm on the left in Figure 5-23, Hacker's Delight, Second
+// Edition, page 109. The book says:
+// If the number of trailing 0's is expected to be small or large, then the
+// simple loops shown in Figure 5-23 are quite fast.
+inline int CountTrailingZeros(uint32_t n) {
+ assert(n != 0);
+ // Create a word with 1's at the positions of the trailing 0's in |n|, and
+ // 0's elsewhere (e.g., 01011000 => 00000111).
+ n = ~n & (n - 1);
+ int count = 0;
+ while (n != 0) {
+ ++count;
+ n >>= 1;
+ }
+ return count;
+}
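+// For example, for n == 0b01011000: ~n & (n - 1) == 0b00000111, and the loop
+// counts its three 1's, so CountTrailingZeros(0x58) == 3.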
+
+#endif // defined(__GNUC__)
+
+inline int FloorLog2(int32_t n) {
+ assert(n > 0);
+ return 31 ^ CountLeadingZeros(static_cast<uint32_t>(n));
+}
+
+inline int FloorLog2(uint32_t n) {
+ assert(n > 0);
+ return 31 ^ CountLeadingZeros(n);
+}
+
+inline int FloorLog2(int64_t n) {
+ assert(n > 0);
+ return 63 ^ CountLeadingZeros(static_cast<uint64_t>(n));
+}
+
+inline int FloorLog2(uint64_t n) {
+ assert(n > 0);
+ return 63 ^ CountLeadingZeros(n);
+}
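+// Note on the implementations above: for nonzero n, CountLeadingZeros(n) is
+// in [0, 31] (respectively [0, 63]), and for x in [0, 31], 31 ^ x == 31 - x.
+// For example, FloorLog2(uint32_t{64}) == 31 ^ CountLeadingZeros(64u) ==
+// 31 ^ 25 == 6.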
+
+inline int CeilLog2(unsigned int n) {
+ // The expression FloorLog2(n - 1) + 1 is undefined not only for n == 0 but
+ // also for n == 1, so this expression must be guarded by the n < 2 test. An
+ // alternative implementation is:
+ // return (n == 0) ? 0 : FloorLog2(n) + static_cast<int>((n & (n - 1)) != 0);
+ return (n < 2) ? 0 : FloorLog2(n - 1) + 1;
+}
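+// For example, CeilLog2(9) == FloorLog2(8) + 1 == 4, while
+// CeilLog2(8) == FloorLog2(7) + 1 == 3.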
+
+inline int RightShiftWithCeiling(int value, int bits) {
+ assert(bits > 0);
+ return (value + (1 << bits) - 1) >> bits;
+}
+
+inline int32_t RightShiftWithRounding(int32_t value, int bits) {
+ assert(bits >= 0);
+ return (value + ((1 << bits) >> 1)) >> bits;
+}
+
+inline uint32_t RightShiftWithRounding(uint32_t value, int bits) {
+ assert(bits >= 0);
+ return (value + ((1 << bits) >> 1)) >> bits;
+}
+
+// This variant is used when |value| can exceed 32 bits, although the final
+// result must always fit into int32_t.
+inline int32_t RightShiftWithRounding(int64_t value, int bits) {
+ assert(bits >= 0);
+ return static_cast<int32_t>((value + ((int64_t{1} << bits) >> 1)) >> bits);
+}
+
+inline int32_t RightShiftWithRoundingSigned(int32_t value, int bits) {
+ assert(bits > 0);
+ // The next line is equivalent to:
+ // return (value >= 0) ? RightShiftWithRounding(value, bits)
+ // : -RightShiftWithRounding(-value, bits);
+ return RightShiftWithRounding(value + (value >> 31), bits);
+}
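+// For example, with value == -6 and bits == 2: value >> 31 == -1, so the call
+// becomes RightShiftWithRounding(-7, 2) == (-7 + 2) >> 2 == -2, which matches
+// -RightShiftWithRounding(6, 2) == -((6 + 2) >> 2) == -2.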
+
+// This variant is used when |value| can exceed 32 bits, although the final
+// result must always fit into int32_t.
+inline int32_t RightShiftWithRoundingSigned(int64_t value, int bits) {
+ assert(bits > 0);
+ // The next line is equivalent to:
+ // return (value >= 0) ? RightShiftWithRounding(value, bits)
+ // : -RightShiftWithRounding(-value, bits);
+ return RightShiftWithRounding(value + (value >> 63), bits);
+}
+
+constexpr int DivideBy2(int n) { return n >> 1; }
+constexpr int DivideBy4(int n) { return n >> 2; }
+constexpr int DivideBy8(int n) { return n >> 3; }
+constexpr int DivideBy16(int n) { return n >> 4; }
+constexpr int DivideBy32(int n) { return n >> 5; }
+constexpr int DivideBy64(int n) { return n >> 6; }
+constexpr int DivideBy128(int n) { return n >> 7; }
+
+// Convert |value| to unsigned before shifting to avoid undefined behavior with
+// negative values.
+inline int LeftShift(int value, int bits) {
+ assert(bits >= 0);
+ assert(value >= -(int64_t{1} << (31 - bits)));
+ assert(value <= (int64_t{1} << (31 - bits)) - ((bits == 0) ? 1 : 0));
+ return static_cast<int>(static_cast<uint32_t>(value) << bits);
+}
+inline int MultiplyBy2(int n) { return LeftShift(n, 1); }
+inline int MultiplyBy4(int n) { return LeftShift(n, 2); }
+inline int MultiplyBy8(int n) { return LeftShift(n, 3); }
+inline int MultiplyBy16(int n) { return LeftShift(n, 4); }
+inline int MultiplyBy32(int n) { return LeftShift(n, 5); }
+inline int MultiplyBy64(int n) { return LeftShift(n, 6); }
+
+constexpr int Mod32(int n) { return n & 0x1f; }
+constexpr int Mod64(int n) { return n & 0x3f; }
+
+//------------------------------------------------------------------------------
+// Bitstream functions
+
+constexpr bool IsIntraFrame(FrameType type) {
+ return type == kFrameKey || type == kFrameIntraOnly;
+}
+
+inline TransformClass GetTransformClass(TransformType tx_type) {
+ constexpr BitMaskSet kTransformClassVerticalMask(
+ kTransformTypeIdentityDct, kTransformTypeIdentityAdst,
+ kTransformTypeIdentityFlipadst);
+ if (kTransformClassVerticalMask.Contains(tx_type)) {
+ return kTransformClassVertical;
+ }
+ constexpr BitMaskSet kTransformClassHorizontalMask(
+ kTransformTypeDctIdentity, kTransformTypeAdstIdentity,
+ kTransformTypeFlipadstIdentity);
+ if (kTransformClassHorizontalMask.Contains(tx_type)) {
+ return kTransformClassHorizontal;
+ }
+ return kTransformClass2D;
+}
+
+inline int RowOrColumn4x4ToPixel(int row_or_column4x4, Plane plane,
+ int8_t subsampling) {
+ return MultiplyBy4(row_or_column4x4) >> (plane == kPlaneY ? 0 : subsampling);
+}
+
+constexpr PlaneType GetPlaneType(Plane plane) {
+ return static_cast<PlaneType>(plane != kPlaneY);
+}
+
+// 5.11.44.
+constexpr bool IsDirectionalMode(PredictionMode mode) {
+ return mode >= kPredictionModeVertical && mode <= kPredictionModeD67;
+}
+
+// 5.9.3.
+//
+// |a| and |b| are order hints, treated as unsigned order_hint_bits-bit
+// integers. |order_hint_shift_bits| equals (32 - order_hint_bits) % 32.
+// order_hint_bits is at most 8, so |order_hint_shift_bits| is zero or a
+// value between 24 and 31 (inclusive).
+//
+// If |order_hint_shift_bits| is zero, |a| and |b| are both zero, and the
+// result is zero. If |order_hint_shift_bits| is not zero, returns the
+// signed difference |a| - |b| using "modular arithmetic". More precisely, the
+// signed difference |a| - |b| is treated as a signed order_hint_bits-bit
+// integer and cast to an int. The returned difference is between
+// -(1 << (order_hint_bits - 1)) and (1 << (order_hint_bits - 1)) - 1
+// (inclusive).
+//
+// NOTE: |a| and |b| are the order_hint_bits least significant bits of the
+// actual values. This function returns the signed difference between the
+// actual values. The returned difference is correct as long as the actual
+// values are not more than (1 << (order_hint_bits - 1)) - 1 apart.
+//
+// Example: Suppose order_hint_bits is 4 and |order_hint_shift_bits|
+// is 28. Then |a| and |b| are in the range [0, 15], and the actual values for
+// |a| and |b| must not be more than 7 apart. (If the actual values for |a| and
+// |b| are exactly 8 apart, this function cannot tell whether the actual value
+// for |a| is before or after the actual value for |b|.)
+//
+// First, consider the order hints 2 and 6. For this simple case, we have
+// GetRelativeDistance(2, 6, 28) = 2 - 6 = -4, and
+// GetRelativeDistance(6, 2, 28) = 6 - 2 = 4.
+//
+// On the other hand, consider the order hints 2 and 14. The order hints are
+// 12 (> 7) apart, so we need to use the actual values instead. The actual
+// values may be 34 (= 2 mod 16) and 30 (= 14 mod 16), respectively. Therefore
+// we have
+// GetRelativeDistance(2, 14, 28) = 34 - 30 = 4, and
+// GetRelativeDistance(14, 2, 28) = 30 - 34 = -4.
+//
+// The following comments apply only to specific CPUs' SIMD implementations,
+// such as intrinsics code.
+// For the 2 shift operations in this function, if the SIMD packed data is
+// 16-bit wide, try to use |order_hint_shift_bits| - 16 as the number of bits
+// to shift; if the SIMD packed data is 8-bit wide, try to use
+// |order_hint_shift_bits| - 24 as the number of bits to shift.
+// |order_hint_shift_bits| - 16 and |order_hint_shift_bits| - 24 could be -16
+// or -24. In these cases |diff| is 0; the behavior of shifting left or right
+// by -16 or -24 bits is defined for x86 SIMD instructions and ARM NEON
+// instructions, and the result of shifting 0 is still 0. There is no
+// guarantee that this behavior and result apply to other CPUs' SIMD
+// instructions.
+inline int GetRelativeDistance(const unsigned int a, const unsigned int b,
+ const unsigned int order_hint_shift_bits) {
+ const int diff = static_cast<int>(a) - static_cast<int>(b);
+ assert(order_hint_shift_bits <= 31);
+ if (order_hint_shift_bits == 0) {
+ assert(a == 0);
+ assert(b == 0);
+ } else {
+ assert(order_hint_shift_bits >= 24); // i.e., order_hint_bits <= 8
+ assert(a < (1u << (32 - order_hint_shift_bits)));
+ assert(b < (1u << (32 - order_hint_shift_bits)));
+ assert(diff < (1 << (32 - order_hint_shift_bits)));
+ assert(diff >= -(1 << (32 - order_hint_shift_bits)));
+ }
+ // Sign extend the result of subtracting the values.
+ // Cast to unsigned int and then left shift to avoid undefined behavior with
+ // negative values. Cast to int to do the sign extension through right shift.
+ // This requires the right shift of a signed integer be an arithmetic shift,
+ // which is true for clang, gcc, and Visual C++.
+ // These two casts do not generate extra instructions.
+ // Don't use LeftShift(diff) since a valid diff may fail its assertions.
+ // For example, in GetRelativeDistance(2, 14, 28), diff equals -12, which is
+ // less than the minimum value LeftShift() allows, -8.
+ // The next 3 lines are equivalent to:
+ // const int order_hint_bits = Mod32(32 - order_hint_shift_bits);
+ // const int m = (1 << order_hint_bits) >> 1;
+ // return (diff & (m - 1)) - (diff & m);
+ return static_cast<int>(static_cast<unsigned int>(diff)
+ << order_hint_shift_bits) >>
+ order_hint_shift_bits;
+}
+
+// Applies |sign| (must be 0 or -1) to |value|, i.e.,
+// return (sign == 0) ? value : -value;
+// and does so without a branch.
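+// With sign == -1, (value ^ -1) - (-1) == ~value + 1, which is -value in
+// two's complement; with sign == 0, the expression returns value unchanged.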
+constexpr int ApplySign(int value, int sign) { return (value ^ sign) - sign; }
+
+// 7.9.3. (without the clamp for numerator and denominator).
+inline void GetMvProjection(const MotionVector& mv, int numerator,
+ int division_multiplier,
+ MotionVector* projection_mv) {
+ // Allow |numerator| to be 0 so that this function can be called
+ // unconditionally. When numerator is 0, |projection_mv| will be 0, and this
+ // is what we want.
+ assert(std::abs(numerator) <= kMaxFrameDistance);
+ for (int i = 0; i < 2; ++i) {
+ projection_mv->mv[i] =
+ Clip3(RightShiftWithRoundingSigned(
+ mv.mv[i] * numerator * division_multiplier, 14),
+ -kProjectionMvClamp, kProjectionMvClamp);
+ }
+}
+
+// 7.9.4.
+constexpr int Project(int value, int delta, int dst_sign) {
+ return value + ApplySign(delta / 64, dst_sign);
+}
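+// For example, Project(10, 128, -1) == 10 - 128 / 64 == 8, and
+// Project(10, 128, 0) == 12.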
+
+inline bool IsBlockSmallerThan8x8(BlockSize size) {
+ return size < kBlock8x8 && size != kBlock4x16;
+}
+
+// Returns true if either the width or the height of the block is equal to
+// four.
+inline bool IsBlockDimension4(BlockSize size) {
+ return size < kBlock8x8 || size == kBlock16x4;
+}
+
+// Converts bitdepth 8, 10, and 12 to array index 0, 1, and 2, respectively.
+constexpr int BitdepthToArrayIndex(int bitdepth) { return (bitdepth - 8) >> 1; }
+
+// Maps a square transform to an index in the range [0, 4]. kTransformSize4x4
+// maps to 0, kTransformSize8x8 maps to 1, and so on.
+inline int TransformSizeToSquareTransformIndex(TransformSize tx_size) {
+ assert(kTransformWidth[tx_size] == kTransformHeight[tx_size]);
+
+ // The values of the square transform sizes happen to be in the right
+ // ranges, so we can just divide them by 4 to get the indexes.
+ static_assert(
+ std::is_unsigned<std::underlying_type<TransformSize>::type>::value, "");
+ static_assert(kTransformSize4x4 < 4, "");
+ static_assert(4 <= kTransformSize8x8 && kTransformSize8x8 < 8, "");
+ static_assert(8 <= kTransformSize16x16 && kTransformSize16x16 < 12, "");
+ static_assert(12 <= kTransformSize32x32 && kTransformSize32x32 < 16, "");
+ static_assert(16 <= kTransformSize64x64 && kTransformSize64x64 < 20, "");
+ return DivideBy4(tx_size);
+}
+
+// Gets the corresponding Y/U/V position, to set and get filter masks
+// in deblock filtering.
+// Returns |luma_position| for the Y plane, whose subsampling must be 0.
+// For the U/V planes, returns the odd position when there is subsampling.
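+// For example, GetDeblockPosition(4, /*subsampling=*/1) == 5, while
+// GetDeblockPosition(4, /*subsampling=*/0) == 4.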
+constexpr int GetDeblockPosition(const int luma_position,
+ const int subsampling) {
+ return luma_position | subsampling;
+}
+
+// Returns the size of the residual buffer required to hold the residual values
+// for a block or frame of size |rows| by |columns| (taking into account
+// |subsampling_x|, |subsampling_y| and |residual_size|). |residual_size| is the
+// number of bytes required to represent one residual value.
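+// For example, with rows == columns == 64, no subsampling, and
+// residual_size == 2, the multiplier is (2 + 4) / 2 == 3 and the result is
+// 2 * (64 * 64 * 3 + 32 * kResidualPaddingVertical) bytes.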
+inline size_t GetResidualBufferSize(const int rows, const int columns,
+ const int subsampling_x,
+ const int subsampling_y,
+ const size_t residual_size) {
+ // The subsampling multipliers are:
+ // Both x and y are subsampled: 3 / 2.
+ // Only x or y is subsampled: 2 / 1 (which is equivalent to 4 / 2).
+ // Both x and y are not subsampled: 3 / 1 (which is equivalent to 6 / 2).
+ // So we compute the final subsampling multiplier as follows:
+ // multiplier = (2 + (4 >> subsampling_x >> subsampling_y)) / 2.
+ // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary checks
+ // when parsing quantized coefficients.
+ const int subsampling_multiplier_num =
+ 2 + (4 >> subsampling_x >> subsampling_y);
+ const int number_elements =
+ (rows * columns * subsampling_multiplier_num) >> 1;
+ const int tx_padding = 32 * kResidualPaddingVertical;
+ return residual_size * (number_elements + tx_padding);
+}
+
+// This function is equivalent to:
+// std::min({kTransformWidthLog2[tx_size] - 2,
+// kTransformWidthLog2[left_tx_size] - 2,
+// 2});
+constexpr LoopFilterTransformSizeId GetTransformSizeIdWidth(
+ TransformSize tx_size, TransformSize left_tx_size) {
+ return static_cast<LoopFilterTransformSizeId>(
+ static_cast<int>(tx_size > kTransformSize4x16 &&
+ left_tx_size > kTransformSize4x16) +
+ static_cast<int>(tx_size > kTransformSize8x32 &&
+ left_tx_size > kTransformSize8x32));
+}
+
+// This is used for 7.11.3.4 Block Inter Prediction Process, to select convolve
+// filters.
+inline int GetFilterIndex(const int filter_index, const int length) {
+ if (length <= 4) {
+ if (filter_index == kInterpolationFilterEightTap ||
+ filter_index == kInterpolationFilterEightTapSharp) {
+ return 4;
+ }
+ if (filter_index == kInterpolationFilterEightTapSmooth) {
+ return 5;
+ }
+ }
+ return filter_index;
+}
+
+// This produces results identical to RightShiftWithRounding(), since
+// |subsampling| can only be 0 or 1.
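+// For example, SubsampledValue(5, 1) == 3 and SubsampledValue(5, 0) == 5.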
+constexpr int SubsampledValue(int value, int subsampling) {
+ return (value + subsampling) >> subsampling;
+}
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_COMMON_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/common.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
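+// Returns the number of significant bits in |n|. |n| must be non-negative;
+// with a negative |n|, the arithmetic right shift would never reach zero.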
+int BitLength(int64_t n) {
+ int count = 0;
+ while (n != 0) {
+ ++count;
+ n >>= 1;
+ }
+ return count;
+}
+
+TEST(CommonUtilsTest, Align) {
+ for (int i = 0; i <= 8; ++i) {
+ const int alignment = 1 << i;
+ SCOPED_TRACE("alignment: " + std::to_string(alignment));
+ EXPECT_EQ(Align(0, alignment), 0);
+ EXPECT_EQ(Align(1, alignment), alignment);
+ EXPECT_EQ(Align(alignment + 1, alignment), 2 * alignment);
+ if (i > 1) {
+ EXPECT_EQ(Align(alignment - 1, alignment), alignment);
+ EXPECT_EQ(Align(2 * alignment - 1, alignment), 2 * alignment);
+ }
+ }
+}
+
+TEST(CommonUtilsTest, AlignAddr) {
+ auto buf = MakeAlignedUniquePtr<uint8_t>(/*alignment=*/1024, 512);
+ ASSERT_NE(buf, nullptr);
+ auto* const bufptr = buf.get();
+ ASSERT_EQ(reinterpret_cast<uintptr_t>(bufptr) % 1024, 0);
+
+ for (int i = 0; i <= 8; ++i) {
+ const int alignment = 1 << i;
+ ASSERT_LE(alignment, 1024);
+ SCOPED_TRACE("alignment: " + std::to_string(alignment));
+ EXPECT_EQ(AlignAddr(nullptr, alignment), nullptr);
+ EXPECT_EQ(AlignAddr(bufptr, alignment), bufptr);
+ EXPECT_EQ(AlignAddr(bufptr + 1, alignment), bufptr + alignment);
+ EXPECT_EQ(AlignAddr(bufptr + alignment + 1, alignment),
+ bufptr + 2 * alignment);
+ if (i > 1) {
+ EXPECT_EQ(AlignAddr(bufptr + alignment - 1, alignment),
+ bufptr + alignment);
+ EXPECT_EQ(AlignAddr(bufptr + 2 * alignment - 1, alignment),
+ bufptr + 2 * alignment);
+ }
+ }
+}
+
+TEST(CommonUtilsTest, Clip3) {
+ // Value <= lower boundary.
+ EXPECT_EQ(Clip3(10, 20, 30), 20);
+ EXPECT_EQ(Clip3(20, 20, 30), 20);
+ // Value >= higher boundary.
+ EXPECT_EQ(Clip3(40, 20, 30), 30);
+ EXPECT_EQ(Clip3(30, 20, 30), 30);
+ // Value within boundary.
+ EXPECT_EQ(Clip3(25, 20, 30), 25);
+ // Clipping based on bitdepth (clamp between 0 and 2^bitdepth - 1). Make sure
+ // that the resulting values are always in the pixel range for the
+ // corresponding bitdepth.
+ static constexpr int bitdepths[] = {8, 10, 12};
+ static constexpr int pixels[] = {100, 500, 5000, -100, -500, -5000};
+ for (const auto& bitdepth : bitdepths) {
+ for (const auto& pixel : pixels) {
+ const int clipped_pixel = Clip3(pixel, 0, (1 << bitdepth) - 1);
+ EXPECT_GE(clipped_pixel, 0)
+ << "Clip3 mismatch for bitdepth: " << bitdepth << " pixel: " << pixel;
+ EXPECT_LE(clipped_pixel, (1 << bitdepth) - 1)
+ << "Clip3 mismatch for bitdepth: " << bitdepth << " pixel: " << pixel;
+ }
+ }
+}
+
+template <typename Pixel>
+void TestExtendLine(int width, const int left, int right, Pixel left_value,
+ Pixel right_value) {
+ constexpr int size = 1000;
+ ASSERT_LE(width + left + right, size);
+ Pixel line[size];
+ Pixel* line_start = line + left;
+ line_start[0] = left_value;
+ line_start[width - 1] = right_value;
+ ExtendLine<Pixel>(line_start, width, left, right);
+ for (int x = 0; x < left; x++) {
+ EXPECT_EQ(left_value, line[x]) << "Left side mismatch at x: " << x;
+ }
+ for (int x = 0; x < right; x++) {
+ EXPECT_EQ(right_value, line[left + width + x])
+ << "Right side mismatch at x: " << x;
+ }
+}
+
+TEST(CommonUtilsTest, ExtendLine) {
+ TestExtendLine<uint8_t>(300, 0, 0, 31, 13);
+ TestExtendLine<uint8_t>(100, 10, 20, 31, 13);
+ TestExtendLine<uint8_t>(257, 31, 77, 59, 255);
+ TestExtendLine<uint16_t>(600, 0, 0, 1234, 4321);
+ TestExtendLine<uint16_t>(200, 55, 88, 12345, 54321);
+ TestExtendLine<uint16_t>(2, 99, 333, 257, 513);
+}
+
+template <typename T>
+void TestMemSetBlock(int rows, int columns, ptrdiff_t stride, T value) {
+ constexpr int size = 1000;
+ T block[size];
+ static_assert(sizeof(T) == 1, "");
+ ASSERT_LE(rows * stride, size);
+ ASSERT_LE(columns, stride);
+ MemSetBlock<T>(rows, columns, value, block, stride);
+ for (int y = 0; y < rows; y++) {
+ for (int x = 0; x < columns; x++) {
+ EXPECT_EQ(value, block[y * stride + x])
+ << "Mismatch at y: " << y << " x: " << x;
+ }
+ }
+}
+
+TEST(CommonUtilsTest, MemSetBlock) {
+ TestMemSetBlock<bool>(15, 28, 29, true);
+ TestMemSetBlock<bool>(17, 1, 24, false);
+ TestMemSetBlock<bool>(7, 2, 13, true);
+ TestMemSetBlock<int8_t>(35, 17, 19, 123);
+ TestMemSetBlock<uint8_t>(19, 16, 16, 234);
+}
+
+template <typename T>
+void TestSetBlock(int rows, int columns, ptrdiff_t stride, T value) {
+ constexpr int size = 1000;
+ T block[size];
+ ASSERT_LE(rows * stride, size);
+ ASSERT_LE(columns, stride);
+ SetBlock<T>(rows, columns, value, block, stride);
+ for (int y = 0; y < rows; y++) {
+ for (int x = 0; x < columns; x++) {
+ EXPECT_EQ(value, block[y * stride + x])
+ << "Mismatch at y: " << y << " x: " << x;
+ }
+ }
+}
+
+TEST(CommonUtilsTest, SetBlock) {
+ // Test 1-byte block set.
+ TestSetBlock<bool>(15, 28, 29, true);
+ TestSetBlock<bool>(17, 1, 24, false);
+ TestSetBlock<bool>(7, 2, 13, true);
+ TestSetBlock<int8_t>(35, 17, 19, 123);
+ TestSetBlock<uint8_t>(19, 16, 16, 234);
+ // Test 2-byte block set.
+ TestSetBlock<int16_t>(23, 27, 28, 1234);
+ TestSetBlock<uint16_t>(13, 39, 44, 4321);
+ // Test 4-byte block set.
+ TestSetBlock<int>(14, 7, 7, 12345);
+ TestSetBlock<int>(33, 4, 15, 54321);
+ // Test pointer block set.
+ int data;
+ TestSetBlock<int*>(23, 8, 25, &data);
+}
+
+TEST(CommonUtilsTest, CountTrailingZeros) {
+ EXPECT_EQ(CountTrailingZeros(0x1), 0);
+ EXPECT_EQ(CountTrailingZeros(0x3), 0);
+ EXPECT_EQ(CountTrailingZeros(0x7), 0);
+ EXPECT_EQ(CountTrailingZeros(0xF), 0);
+ EXPECT_EQ(CountTrailingZeros(0x2), 1);
+ EXPECT_EQ(CountTrailingZeros(0x6), 1);
+ EXPECT_EQ(CountTrailingZeros(0xE), 1);
+ EXPECT_EQ(CountTrailingZeros(0x4), 2);
+ EXPECT_EQ(CountTrailingZeros(0xC), 2);
+ EXPECT_EQ(CountTrailingZeros(0x8), 3);
+ EXPECT_EQ(CountTrailingZeros(0x10), 4);
+ EXPECT_EQ(CountTrailingZeros(0x30), 4);
+ EXPECT_EQ(CountTrailingZeros(0x70), 4);
+ EXPECT_EQ(CountTrailingZeros(0xF0), 4);
+ EXPECT_EQ(CountTrailingZeros(0x20), 5);
+ EXPECT_EQ(CountTrailingZeros(0x60), 5);
+ EXPECT_EQ(CountTrailingZeros(0xE0), 5);
+ EXPECT_EQ(CountTrailingZeros(0x40), 6);
+ EXPECT_EQ(CountTrailingZeros(0xC0), 6);
+ EXPECT_EQ(CountTrailingZeros(0x80), 7);
+ EXPECT_EQ(CountTrailingZeros(0x31), 0);
+ EXPECT_EQ(CountTrailingZeros(0x32), 1);
+ EXPECT_EQ(CountTrailingZeros(0x34), 2);
+ EXPECT_EQ(CountTrailingZeros(0x38), 3);
+ EXPECT_EQ(CountTrailingZeros(0x310), 4);
+ EXPECT_EQ(CountTrailingZeros(0x320), 5);
+ EXPECT_EQ(CountTrailingZeros(0x340), 6);
+ EXPECT_EQ(CountTrailingZeros(0x380), 7);
+}
+
+TEST(CommonUtilsTest, FloorLog2) {
+ // Powers of 2.
+ EXPECT_EQ(FloorLog2(1), 0);
+ EXPECT_EQ(FloorLog2(2), 1);
+ EXPECT_EQ(FloorLog2(8), 3);
+ EXPECT_EQ(FloorLog2(64), 6);
+ // Powers of 2 +/- 1.
+ EXPECT_EQ(FloorLog2(9), 3);
+ EXPECT_EQ(FloorLog2(15), 3);
+ EXPECT_EQ(FloorLog2(63), 5);
+ // Large values, up to 32 bits.
+ EXPECT_EQ(FloorLog2(0x7fffffff), 30);
+ EXPECT_EQ(FloorLog2(0x80000000), 31);
+ // Values wider than 32 bits.
+ EXPECT_EQ(FloorLog2(uint64_t{0x7fffffffffffffff}), 62);
+ EXPECT_EQ(FloorLog2(uint64_t{0x8000000000000000}), 63);
+ EXPECT_EQ(FloorLog2(uint64_t{0xffffffffffffffff}), 63);
+}
+
+TEST(CommonUtilsTest, CeilLog2) {
+ // Even though log2(0) is -inf, here we explicitly define it to be 0.
+ EXPECT_EQ(CeilLog2(0), 0);
+ // Powers of 2.
+ EXPECT_EQ(CeilLog2(1), 0);
+ EXPECT_EQ(CeilLog2(2), 1);
+ EXPECT_EQ(CeilLog2(8), 3);
+ EXPECT_EQ(CeilLog2(64), 6);
+ // Powers of 2 +/- 1.
+ EXPECT_EQ(CeilLog2(9), 4);
+ EXPECT_EQ(CeilLog2(15), 4);
+ EXPECT_EQ(CeilLog2(63), 6);
+ // Large value.
+ EXPECT_EQ(CeilLog2(0x7fffffff), 31);
+}
+
+TEST(CommonUtilsTest, RightShiftWithCeiling) {
+ // Shift 1 bit.
+ EXPECT_EQ(RightShiftWithCeiling(1, 1), 1);
+ EXPECT_EQ(RightShiftWithCeiling(2, 1), 1);
+ EXPECT_EQ(RightShiftWithCeiling(3, 1), 2);
+ EXPECT_EQ(RightShiftWithCeiling(4, 1), 2);
+ EXPECT_EQ(RightShiftWithCeiling(5, 1), 3);
+ // Shift 2 bits.
+ EXPECT_EQ(RightShiftWithCeiling(1, 2), 1);
+ EXPECT_EQ(RightShiftWithCeiling(2, 2), 1);
+ EXPECT_EQ(RightShiftWithCeiling(3, 2), 1);
+ EXPECT_EQ(RightShiftWithCeiling(4, 2), 1);
+ EXPECT_EQ(RightShiftWithCeiling(5, 2), 2);
+ // Shift 20 bits.
+ EXPECT_EQ(RightShiftWithCeiling(1, 20), 1);
+ EXPECT_EQ(RightShiftWithCeiling((1 << 20) - 1, 20), 1);
+ EXPECT_EQ(RightShiftWithCeiling(1 << 20, 20), 1);
+ EXPECT_EQ(RightShiftWithCeiling((1 << 20) + 1, 20), 2);
+ EXPECT_EQ(RightShiftWithCeiling((1 << 21) - 1, 20), 2);
+}
+
+template <typename Input, typename Output>
+void VerifyRightShiftWithRounding(const Input* const values,
+ const int* const bits,
+ const Output* const rounded_values,
+ size_t count) {
+ for (size_t i = 0; i < count; ++i) {
+ const Output rounded_value = RightShiftWithRounding(values[i], bits[i]);
+ EXPECT_EQ(rounded_value, rounded_values[i]) << "Mismatch at index " << i;
+ // Shifting with rounding reduces the bit length by at least |bits[i]| - 1.
+ EXPECT_LE(BitLength(rounded_value), BitLength(values[i]) - (bits[i] - 1))
+ << "Mismatch at index " << i;
+ }
+}
+
+TEST(CommonUtilsTest, RightShiftWithRoundingInt32) {
+ static constexpr int32_t values[] = {5, 203, 204, 255, 40000, 50000};
+ static constexpr int bits[] = {0, 3, 3, 3, 12, 12};
+ static constexpr int32_t rounded_values[] = {5, 25, 26, 32, 10, 12};
+ static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), "");
+ static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), "");
+ VerifyRightShiftWithRounding<int32_t, int32_t>(values, bits, rounded_values,
+ ABSL_ARRAYSIZE(values));
+}
+
+TEST(CommonUtilsTest, RightShiftWithRoundingUint32) {
+ static constexpr uint32_t values[] = {5, 203, 204, 255,
+ 40000, 50000, 0x7fffffff};
+ static constexpr int bits[] = {0, 3, 3, 3, 12, 12, 20};
+ static constexpr uint32_t rounded_values[] = {5, 25, 26, 32, 10, 12, 2048};
+ static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), "");
+ static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), "");
+ VerifyRightShiftWithRounding<uint32_t, uint32_t>(values, bits, rounded_values,
+ ABSL_ARRAYSIZE(values));
+}
+
+TEST(CommonUtilsTest, RightShiftWithRoundingInt64) {
+ static constexpr int64_t values[] = {5, 203, 204, 255,
+ 40000, 50000, 0x7fffffff, 0x8fffffff};
+ static constexpr int bits[] = {0, 3, 3, 3, 12, 12, 20, 20};
+ static constexpr int32_t rounded_values[] = {5, 25, 26, 32,
+ 10, 12, 2048, 2304};
+ static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), "");
+ static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), "");
+ VerifyRightShiftWithRounding<int64_t, int32_t>(values, bits, rounded_values,
+ ABSL_ARRAYSIZE(values));
+}
+
+template <typename Input>
+void VerifyRightShiftWithRoundingSigned(const Input* const values,
+ const int* const bits,
+ const int32_t* const rounded_values,
+ int count) {
+ for (int i = 0; i < count; ++i) {
+ int32_t rounded_value = RightShiftWithRoundingSigned(values[i], bits[i]);
+ EXPECT_EQ(rounded_value, rounded_values[i]) << "Mismatch at index " << i;
+ rounded_value = RightShiftWithRoundingSigned(-values[i], bits[i]);
+ EXPECT_EQ(rounded_value, -rounded_values[i]) << "Mismatch at index " << i;
+ }
+}
+
+TEST(CommonUtilsTest, RightShiftWithRoundingSignedInt32) {
+ static constexpr int32_t values[] = {203, 204, 255, 40000, 50000};
+ static constexpr int bits[] = {3, 3, 3, 12, 12};
+ static constexpr int32_t rounded_values[] = {25, 26, 32, 10, 12};
+ static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), "");
+ static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), "");
+ VerifyRightShiftWithRoundingSigned<int32_t>(values, bits, rounded_values,
+ ABSL_ARRAYSIZE(values));
+}
+
+TEST(CommonUtilsTest, RightShiftWithRoundingSignedInt64) {
+ static constexpr int64_t values[] = {203, 204, 255, 40000,
+ 50000, 0x7fffffff, 0x8fffffff};
+ static constexpr int bits[] = {3, 3, 3, 12, 12, 20, 20};
+ static constexpr int32_t rounded_values[] = {25, 26, 32, 10, 12, 2048, 2304};
+ static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), "");
+ static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), "");
+ VerifyRightShiftWithRoundingSigned<int64_t>(values, bits, rounded_values,
+ ABSL_ARRAYSIZE(values));
+}
+
+TEST(CommonUtilsTest, GetResidualBufferSize) {
+ // No subsampling.
+ EXPECT_EQ(GetResidualBufferSize(64, 64, 0, 0, 2),
+ /* 2*(64*64*3/1 + 32*4) = */ 24832);
+ // Only X is subsampled.
+ EXPECT_EQ(GetResidualBufferSize(64, 64, 1, 0, 2),
+ /* 2*(64*64*2/1 + 32*4) = */ 16640);
+ // Only Y is subsampled.
+ EXPECT_EQ(GetResidualBufferSize(64, 64, 0, 1, 2),
+ /* 2*(64*64*2/1 + 32*4) = */ 16640);
+ // Both X and Y are subsampled.
+ EXPECT_EQ(GetResidualBufferSize(64, 64, 1, 1, 2),
+ /* 2*(64*64*3/2 + 32*4) = */ 12544);
+}
+
+//------------------------------------------------------------------------------
+// Tests for bitstream util functions
+
+TEST(BitstreamUtilTest, IsIntraFrame) {
+ EXPECT_TRUE(IsIntraFrame(kFrameKey));
+ EXPECT_TRUE(IsIntraFrame(kFrameIntraOnly));
+ EXPECT_FALSE(IsIntraFrame(kFrameInter));
+ EXPECT_FALSE(IsIntraFrame(kFrameSwitch));
+}
+
+TEST(BitstreamUtilTest, GetTransformClass) {
+ static constexpr TransformClass expected_classes[kNumTransformTypes] = {
+ kTransformClass2D, kTransformClass2D,
+ kTransformClass2D, kTransformClass2D,
+ kTransformClass2D, kTransformClass2D,
+ kTransformClass2D, kTransformClass2D,
+ kTransformClass2D, kTransformClass2D,
+ kTransformClassVertical, kTransformClassHorizontal,
+ kTransformClassVertical, kTransformClassHorizontal,
+ kTransformClassVertical, kTransformClassHorizontal,
+ };
+ for (int i = 0; i < kNumTransformTypes; ++i) {
+ EXPECT_EQ(GetTransformClass(static_cast<TransformType>(i)),
+ expected_classes[i])
+ << "Mismatch at index " << i;
+ }
+}
+
+TEST(BitstreamUtilTest, RowOrColumn4x4ToPixel) {
+ EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneY, 0), 40);
+ EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneY, 1),
+ 40); // Subsampling should have no effect on Y plane.
+ EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneU, 0), 40);
+ EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneU, 1), 20);
+ EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneV, 0), 40);
+ EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneV, 1), 20);
+}
+
+TEST(BitstreamUtilTest, GetPlaneType) {
+ EXPECT_EQ(GetPlaneType(kPlaneY), kPlaneTypeY);
+ EXPECT_EQ(GetPlaneType(kPlaneU), kPlaneTypeUV);
+ EXPECT_EQ(GetPlaneType(kPlaneV), kPlaneTypeUV);
+}
+
+TEST(BitstreamUtils, IsDirectionalMode) {
+ static constexpr bool is_directional_modes[kNumPredictionModes] = {
+ false, true, true, true, true, true, true, true, true,
+ false, false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+ };
+ for (int i = 0; i < kNumPredictionModes; ++i) {
+ EXPECT_EQ(IsDirectionalMode(static_cast<PredictionMode>(i)),
+ is_directional_modes[i])
+ << "Mismatch at index " << i;
+ }
+}
+
+TEST(BitstreamUtils, GetRelativeDistance) {
+ // Both order_hint_bits and order_hint_shift_bits are zero. (a and b must be
+ // zero.)
+ EXPECT_EQ(GetRelativeDistance(0, 0, 0), 0);
+ EXPECT_EQ(GetRelativeDistance(10, 20, 27), -10);
+
+ EXPECT_EQ(GetRelativeDistance(2, 1, 30), 1);
+ EXPECT_EQ(GetRelativeDistance(2, 1, 29), 1);
+
+ EXPECT_EQ(GetRelativeDistance(1, 2, 30), -1);
+ EXPECT_EQ(GetRelativeDistance(1, 2, 29), -1);
+
+ // With an order_hint_bits of 4 and an order_hint_shift_bits of 28, 16 is the
+ // same as 0, 17 is the same as 1, etc. The most positive distance is 7, and
+ // the most negative distance is -8.
+
+ EXPECT_EQ(GetRelativeDistance(2, 6, 28), -4);
+ EXPECT_EQ(GetRelativeDistance(6, 2, 28), 4);
+ // 18 - 14 = 4.
+ EXPECT_EQ(GetRelativeDistance(2, 14, 28), 4);
+ // 14 - 18 = -4.
+ EXPECT_EQ(GetRelativeDistance(14, 2, 28), -4);
+ // If a and b are exactly 8 apart, GetRelativeDistance() cannot tell whether
+ // a is before or after b. GetRelativeDistance(a, b) and
+ // GetRelativeDistance(b, a) are both -8.
+ // 1 - 9 = -8.
+ EXPECT_EQ(GetRelativeDistance(1, 9, 28), -8);
+ // 9 - 17 = -8.
+ EXPECT_EQ(GetRelativeDistance(9, 1, 28), -8);
+
+ // With an order_hint_bits of 5 and an order_hint_shift_bits of 27, 32 is the
+ // same as 0, 33 is the same as 1, etc. The most positive distance is 15, and
+ // the most negative distance is -16.
+
+ // 31 - 32 = -1.
+ EXPECT_EQ(GetRelativeDistance(31, 0, 27), -1);
+ // 32 - 31 = 1.
+ EXPECT_EQ(GetRelativeDistance(0, 31, 27), 1);
+ // 30 - 33 = -3.
+ EXPECT_EQ(GetRelativeDistance(30, 1, 27), -3);
+ // 33 - 30 = 3.
+ EXPECT_EQ(GetRelativeDistance(1, 30, 27), 3);
+ // 25 - 36 = -11.
+ EXPECT_EQ(GetRelativeDistance(25, 4, 27), -11);
+ // 36 - 25 = 11.
+ EXPECT_EQ(GetRelativeDistance(4, 25, 27), 11);
+ // 15 - 0 = 15.
+ EXPECT_EQ(GetRelativeDistance(15, 0, 27), 15);
+ // If a and b are exactly 16 apart, GetRelativeDistance() cannot tell whether
+ // a is before or after b. GetRelativeDistance(a, b) and
+ // GetRelativeDistance(b, a) are both -16.
+ // 16 - 32 = -16.
+ EXPECT_EQ(GetRelativeDistance(16, 0, 27), -16);
+ // 0 - 16 = -16.
+ EXPECT_EQ(GetRelativeDistance(0, 16, 27), -16);
+}
+
+TEST(BitstreamUtils, ApplySign) {
+ // ApplyPositive(0) = 0
+ EXPECT_EQ(ApplySign(0, 0), 0);
+ // ApplyNegative(0) = 0
+ EXPECT_EQ(ApplySign(0, -1), 0);
+
+ // ApplyPositive(1) = 1
+ EXPECT_EQ(ApplySign(1, 0), 1);
+ // ApplyNegative(1) = -1
+ EXPECT_EQ(ApplySign(1, -1), -1);
+
+ // ApplyPositive(-1) = -1
+ EXPECT_EQ(ApplySign(-1, 0), -1);
+ // ApplyNegative(-1) = 1
+ EXPECT_EQ(ApplySign(-1, -1), 1);
+
+ // ApplyPositive(1234) = 1234
+ EXPECT_EQ(ApplySign(1234, 0), 1234);
+ // ApplyNegative(1234) = -1234
+ EXPECT_EQ(ApplySign(1234, -1), -1234);
+
+ // ApplyPositive(-1234) = -1234
+ EXPECT_EQ(ApplySign(-1234, 0), -1234);
+ // ApplyNegative(-1234) = 1234
+ EXPECT_EQ(ApplySign(-1234, -1), 1234);
+}
+
+// 7.9.3. (without the clamp for numerator and denominator).
+int SpecGetMvProjectionKernel(int mv, int numerator, int denominator) {
+ int value = mv * numerator * kProjectionMvDivisionLookup[denominator];
+ if (value >= 0) {
+ value += 1 << 13;
+ value >>= 14;
+ } else {
+ value = -value;
+ value += 1 << 13;
+ value >>= 14;
+ value = -value;
+ }
+ if (value < (-(1 << 14) + 1)) value = -(1 << 14) + 1;
+ if (value > (1 << 14) - 1) value = (1 << 14) - 1;
+ return value;
+}
+
+void SpecGetMvProjectionNoClamp(const MotionVector& mv, int numerator,
+ int denominator, MotionVector* projection_mv) {
+ for (int i = 0; i < 2; ++i) {
+ projection_mv->mv[i] =
+ SpecGetMvProjectionKernel(mv.mv[i], numerator, denominator);
+ }
+}
+
+TEST(BitstreamUtils, GetMvProjection) {
+ const int16_t mvs[5][2] = {
+ {0, 0}, {11, 73}, {-84, 272}, {733, -827}, {-472, -697}};
+ for (auto& mv_value : mvs) {
+ for (int numerator = -kMaxFrameDistance; numerator <= kMaxFrameDistance;
+ ++numerator) {
+ for (int denominator = 0; denominator <= kMaxFrameDistance;
+ ++denominator) {
+ MotionVector mv, projection_mv, spec_projection_mv;
+ mv.mv[0] = mv_value[0];
+ mv.mv[1] = mv_value[1];
+ GetMvProjection(mv, numerator, kProjectionMvDivisionLookup[denominator],
+ &projection_mv);
+ SpecGetMvProjectionNoClamp(mv, numerator, denominator,
+ &spec_projection_mv);
+ EXPECT_EQ(projection_mv.mv32, spec_projection_mv.mv32);
+ }
+ }
+ }
+}
+
+// 7.9.4.
+int SpecProject(int value, int delta, int dst_sign) {
+ constexpr int kMiSizeLog2 = 2;
+ const int sign = (dst_sign == 0) ? 1 : dst_sign;
+ int offset;
+ if (delta >= 0) {
+ offset = delta >> (3 + 1 + kMiSizeLog2);
+ } else {
+ offset = -((-delta) >> (3 + 1 + kMiSizeLog2));
+ }
+ return value + sign * offset;
+}
+
+TEST(BitstreamUtils, Project) {
+ for (int value = -10; value <= 10; ++value) {
+ for (int delta = -256; delta <= 256; ++delta) {
+ for (int dst_sign = -1; dst_sign <= 0; ++dst_sign) {
+ EXPECT_EQ(Project(value, delta, dst_sign),
+ SpecProject(value, delta, dst_sign));
+ }
+ }
+ }
+}
+
+TEST(BitstreamUtils, IsBlockSmallerThan8x8) {
+ static constexpr bool is_block_smaller_than8x8[kMaxBlockSizes] = {
+ true, true, false, true, false, false, false, false,
+ false, false, false, false, false, false, false, false,
+ false, false, false, false, false, false,
+ };
+ for (int i = 0; i < kMaxBlockSizes; ++i) {
+ EXPECT_EQ(IsBlockSmallerThan8x8(static_cast<BlockSize>(i)),
+ is_block_smaller_than8x8[i])
+ << "Mismatch at index " << i;
+ }
+}
+
+TEST(BitstreamUtils, TransformSizeToSquareTransformIndex) {
+ EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize4x4), 0);
+ EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize8x8), 1);
+ EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize16x16), 2);
+ EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize32x32), 3);
+ EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize64x64), 4);
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
+#define LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
+
+// A collection of compiler attribute checks and defines to control for
+// compatibility across toolchains.
+
+//------------------------------------------------------------------------------
+// Language version, attribute and feature helpers.
+
+// Detect C++17 support. Visual Studio sets __cplusplus to 199711L by default
+// unless compiled with /Zc:__cplusplus, so use the value controlled by /std
+// instead.
+// https://docs.microsoft.com/en-us/cpp/build/reference/zc-cplusplus
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#define LIBGAV1_CXX17 1
+#else
+#define LIBGAV1_CXX17 0
+#endif
+
+#if defined(__has_attribute)
+#define LIBGAV1_HAS_ATTRIBUTE __has_attribute
+#else
+#define LIBGAV1_HAS_ATTRIBUTE(x) 0
+#endif
+
+#if defined(__has_feature)
+#define LIBGAV1_HAS_FEATURE __has_feature
+#else
+#define LIBGAV1_HAS_FEATURE(x) 0
+#endif
+
+//------------------------------------------------------------------------------
+// Sanitizer attributes.
+
+#if LIBGAV1_HAS_FEATURE(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#define LIBGAV1_ASAN 1
+#else
+#define LIBGAV1_ASAN 0
+#endif
+
+#if LIBGAV1_HAS_FEATURE(memory_sanitizer)
+#define LIBGAV1_MSAN 1
+#else
+#define LIBGAV1_MSAN 0
+#endif
+
+#if LIBGAV1_HAS_FEATURE(thread_sanitizer) || defined(__SANITIZE_THREAD__)
+#define LIBGAV1_TSAN 1
+#else
+#define LIBGAV1_TSAN 0
+#endif
+
+//------------------------------------------------------------------------------
+// AddressSanitizer support.
+
+// Define the macros for AddressSanitizer manual memory poisoning. See
+// https://github.com/google/sanitizers/wiki/AddressSanitizerManualPoisoning.
+#if LIBGAV1_ASAN
+#include <sanitizer/asan_interface.h>
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) \
+ (static_cast<void>(addr), static_cast<void>(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
+ (static_cast<void>(addr), static_cast<void>(size))
+#endif
+
+//------------------------------------------------------------------------------
+// Function attributes.
+// GCC: https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html
+// Clang: https://clang.llvm.org/docs/AttributeReference.html
+
+#if defined(__GNUC__)
+#define LIBGAV1_ALWAYS_INLINE __attribute__((always_inline)) inline
+#elif defined(_MSC_VER)
+#define LIBGAV1_ALWAYS_INLINE __forceinline
+#else
+#define LIBGAV1_ALWAYS_INLINE inline
+#endif
+
+// LIBGAV1_MUST_USE_RESULT
+//
+// Tells the compiler to warn about unused results.
+//
+// When annotating a function, it must appear as the first part of the
+// declaration or definition. The compiler will warn if the return value from
+// such a function is unused:
+//
+// LIBGAV1_MUST_USE_RESULT Sprocket* AllocateSprocket();
+// AllocateSprocket(); // Triggers a warning.
+//
+// When annotating a class, it is equivalent to annotating every function which
+// returns an instance.
+//
+// class LIBGAV1_MUST_USE_RESULT Sprocket {};
+// Sprocket(); // Triggers a warning.
+//
+// Sprocket MakeSprocket();
+// MakeSprocket(); // Triggers a warning.
+//
+// Note that references and pointers are not instances:
+//
+// Sprocket* SprocketPointer();
+// SprocketPointer(); // Does *not* trigger a warning.
+//
+// LIBGAV1_MUST_USE_RESULT allows using cast-to-void to suppress the unused
+// result warning. For that reason, warn_unused_result is used only for clang,
+// not for gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66425
+#if LIBGAV1_HAS_ATTRIBUTE(nodiscard)
+#define LIBGAV1_MUST_USE_RESULT [[nodiscard]]
+#elif defined(__clang__) && LIBGAV1_HAS_ATTRIBUTE(warn_unused_result)
+#define LIBGAV1_MUST_USE_RESULT __attribute__((warn_unused_result))
+#else
+#define LIBGAV1_MUST_USE_RESULT
+#endif
+
+// LIBGAV1_PRINTF_ATTRIBUTE
+//
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html>.
+//
+// Note: As the GCC manual states, "[s]ince non-static C++ methods
+// have an implicit 'this' argument, the arguments of such methods
+// should be counted from two, not one."
+#if LIBGAV1_HAS_ATTRIBUTE(format) || (defined(__GNUC__) && !defined(__clang__))
+#define LIBGAV1_PRINTF_ATTRIBUTE(string_index, first_to_check) \
+ __attribute__((__format__(__printf__, string_index, first_to_check)))
+#else
+#define LIBGAV1_PRINTF_ATTRIBUTE(string_index, first_to_check)
+#endif
+
+//------------------------------------------------------------------------------
+// Thread annotations.
+
+// LIBGAV1_GUARDED_BY()
+//
+// Documents if a shared field or global variable needs to be protected by a
+// mutex. LIBGAV1_GUARDED_BY() allows the user to specify a particular mutex
+// that should be held when accessing the annotated variable.
+//
+// Although this annotation cannot be applied to local variables, a local
+// variable and its associated mutex can often be combined into a small class
+// or struct, thereby allowing the annotation.
+//
+// Example:
+//
+// class Foo {
+// Mutex mu_;
+// int p1_ LIBGAV1_GUARDED_BY(mu_);
+// ...
+// };
+// TODO(b/133245043): this can be reenabled after a local MutexLock
+// implementation is added with proper thread annotations.
+#if 0 // LIBGAV1_HAS_ATTRIBUTE(guarded_by)
+#define LIBGAV1_GUARDED_BY(x) __attribute__((guarded_by(x)))
+#else
+#define LIBGAV1_GUARDED_BY(x)
+#endif
+
+//------------------------------------------------------------------------------
+
+#undef LIBGAV1_HAS_ATTRIBUTE
+#undef LIBGAV1_HAS_FEATURE
+
+#endif // LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+const uint8_t k4x4WidthLog2[kMaxBlockSizes] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
+ 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5};
+
+const uint8_t k4x4HeightLog2[kMaxBlockSizes] = {
+ 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 1, 2, 3, 4, 2, 3, 4, 5, 4, 5};
+
+const uint8_t kNum4x4BlocksWide[kMaxBlockSizes] = {
+ 1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32};
+
+const uint8_t kNum4x4BlocksHigh[kMaxBlockSizes] = {
+ 1, 2, 4, 1, 2, 4, 8, 1, 2, 4, 8, 16, 2, 4, 8, 16, 4, 8, 16, 32, 16, 32};
+
+const uint8_t kBlockWidthPixels[kMaxBlockSizes] = {
+ 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16,
+ 16, 32, 32, 32, 32, 64, 64, 64, 64, 128, 128};
+
+const uint8_t kBlockHeightPixels[kMaxBlockSizes] = {
+ 4, 8, 16, 4, 8, 16, 32, 4, 8, 16, 32,
+ 64, 8, 16, 32, 64, 16, 32, 64, 128, 64, 128};
+
+// 9.3 -- Partition_Subsize[]
+const BlockSize kSubSize[kMaxPartitionTypes][kMaxBlockSizes] = {
+ // kPartitionNone
+ {kBlock4x4, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x64, kBlockInvalid,
+ kBlockInvalid, kBlock128x128},
+ // kPartitionHorizontal
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid,
+ kBlockInvalid, kBlock128x64},
+ // kPartitionVertical
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid,
+ kBlockInvalid, kBlock64x128},
+ // kPartitionSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x32, kBlockInvalid,
+ kBlockInvalid, kBlock64x64},
+ // kPartitionHorizontalWithTopSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid,
+ kBlockInvalid, kBlock128x64},
+ // kPartitionHorizontalWithBottomSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid,
+ kBlockInvalid, kBlock128x64},
+ // kPartitionVerticalWithLeftSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid,
+ kBlockInvalid, kBlock64x128},
+ // kPartitionVerticalWithRightSplit
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid,
+ kBlockInvalid, kBlock64x128},
+ // kPartitionHorizontal4
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x4,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x8,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x16, kBlockInvalid,
+ kBlockInvalid, kBlockInvalid},
+ // kPartitionVertical4
+ {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x16,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x32,
+ kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x64, kBlockInvalid,
+ kBlockInvalid, kBlockInvalid}};
+
+// 5.11.38 (implemented as a simple lookup. The first dimension is block size;
+// the second and third are subsampling_x and subsampling_y).
+const BlockSize kPlaneResidualSize[kMaxBlockSizes][2][2] = {
+ {{kBlock4x4, kBlock4x4}, {kBlock4x4, kBlock4x4}},
+ {{kBlock4x8, kBlock4x4}, {kBlockInvalid, kBlock4x4}},
+ {{kBlock4x16, kBlock4x8}, {kBlockInvalid, kBlock4x8}},
+ {{kBlock8x4, kBlockInvalid}, {kBlock4x4, kBlock4x4}},
+ {{kBlock8x8, kBlock8x4}, {kBlock4x8, kBlock4x4}},
+ {{kBlock8x16, kBlock8x8}, {kBlockInvalid, kBlock4x8}},
+ {{kBlock8x32, kBlock8x16}, {kBlockInvalid, kBlock4x16}},
+ {{kBlock16x4, kBlockInvalid}, {kBlock8x4, kBlock8x4}},
+ {{kBlock16x8, kBlockInvalid}, {kBlock8x8, kBlock8x4}},
+ {{kBlock16x16, kBlock16x8}, {kBlock8x16, kBlock8x8}},
+ {{kBlock16x32, kBlock16x16}, {kBlockInvalid, kBlock8x16}},
+ {{kBlock16x64, kBlock16x32}, {kBlockInvalid, kBlock8x32}},
+ {{kBlock32x8, kBlockInvalid}, {kBlock16x8, kBlock16x4}},
+ {{kBlock32x16, kBlockInvalid}, {kBlock16x16, kBlock16x8}},
+ {{kBlock32x32, kBlock32x16}, {kBlock16x32, kBlock16x16}},
+ {{kBlock32x64, kBlock32x32}, {kBlockInvalid, kBlock16x32}},
+ {{kBlock64x16, kBlockInvalid}, {kBlock32x16, kBlock32x8}},
+ {{kBlock64x32, kBlockInvalid}, {kBlock32x32, kBlock32x16}},
+ {{kBlock64x64, kBlock64x32}, {kBlock32x64, kBlock32x32}},
+ {{kBlock64x128, kBlock64x64}, {kBlockInvalid, kBlock32x64}},
+ {{kBlock128x64, kBlockInvalid}, {kBlock64x64, kBlock64x32}},
+ {{kBlock128x128, kBlock128x64}, {kBlock64x128, kBlock64x64}}};
+
+const int16_t kProjectionMvDivisionLookup[kMaxFrameDistance + 1] = {
+ 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
+ 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780,
+ 744, 712, 682, 655, 630, 606, 585, 564, 546, 528};
+
+const uint8_t kTransformWidth[kNumTransformSizes] = {
+ 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 32, 32, 32, 32, 64, 64, 64};
+
+const uint8_t kTransformHeight[kNumTransformSizes] = {
+ 4, 8, 16, 4, 8, 16, 32, 4, 8, 16, 32, 64, 8, 16, 32, 64, 16, 32, 64};
+
+const uint8_t kTransformWidth4x4[kNumTransformSizes] = {
+ 1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16};
+
+const uint8_t kTransformHeight4x4[kNumTransformSizes] = {
+ 1, 2, 4, 1, 2, 4, 8, 1, 2, 4, 8, 16, 2, 4, 8, 16, 4, 8, 16};
+
+const uint8_t kTransformWidthLog2[kNumTransformSizes] = {
+ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6};
+
+const uint8_t kTransformHeightLog2[kNumTransformSizes] = {
+ 2, 3, 4, 2, 3, 4, 5, 2, 3, 4, 5, 6, 3, 4, 5, 6, 4, 5, 6};
+
+// 9.3 -- Split_Tx_Size[]
+const TransformSize kSplitTransformSize[kNumTransformSizes] = {
+ kTransformSize4x4, kTransformSize4x4, kTransformSize4x8,
+ kTransformSize4x4, kTransformSize4x4, kTransformSize8x8,
+ kTransformSize8x16, kTransformSize8x4, kTransformSize8x8,
+ kTransformSize8x8, kTransformSize16x16, kTransformSize16x32,
+ kTransformSize16x8, kTransformSize16x16, kTransformSize16x16,
+ kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32};
+
+// Square transform of size min(w,h).
+const TransformSize kTransformSizeSquareMin[kNumTransformSizes] = {
+ kTransformSize4x4, kTransformSize4x4, kTransformSize4x4,
+ kTransformSize4x4, kTransformSize8x8, kTransformSize8x8,
+ kTransformSize8x8, kTransformSize4x4, kTransformSize8x8,
+ kTransformSize16x16, kTransformSize16x16, kTransformSize16x16,
+ kTransformSize8x8, kTransformSize16x16, kTransformSize32x32,
+ kTransformSize32x32, kTransformSize16x16, kTransformSize32x32,
+ kTransformSize64x64};
+
+// Square transform of size max(w,h).
+const TransformSize kTransformSizeSquareMax[kNumTransformSizes] = {
+ kTransformSize4x4, kTransformSize8x8, kTransformSize16x16,
+ kTransformSize8x8, kTransformSize8x8, kTransformSize16x16,
+ kTransformSize32x32, kTransformSize16x16, kTransformSize16x16,
+ kTransformSize16x16, kTransformSize32x32, kTransformSize64x64,
+ kTransformSize32x32, kTransformSize32x32, kTransformSize32x32,
+ kTransformSize64x64, kTransformSize64x64, kTransformSize64x64,
+ kTransformSize64x64};
+
+const uint8_t kNumTransformTypesInSet[kNumTransformSets] = {1, 7, 5, 16, 12, 2};
+
+const uint8_t kSgrProjParams[1 << kSgrProjParamsBits][4] = {
+ {2, 12, 1, 4}, {2, 15, 1, 6}, {2, 18, 1, 8}, {2, 21, 1, 9},
+ {2, 24, 1, 10}, {2, 29, 1, 11}, {2, 36, 1, 12}, {2, 45, 1, 13},
+ {2, 56, 1, 14}, {2, 68, 1, 15}, {0, 0, 1, 5}, {0, 0, 1, 8},
+ {0, 0, 1, 11}, {0, 0, 1, 14}, {2, 30, 0, 0}, {2, 75, 0, 0}};
+
+const int8_t kSgrProjMultiplierMin[2] = {-96, -32};
+
+const int8_t kSgrProjMultiplierMax[2] = {31, 95};
+
+const int8_t kWienerTapsMin[3] = {-5, -23, -17};
+
+const int8_t kWienerTapsMax[3] = {10, 8, 46};
+
+// This was modified from Upscale_Filter as defined in AV1 Section 7.16, in
+// order to support 16-bit packed NEON operations.
+// The sign of each tap is: - + - + + - + -
+alignas(16) const uint8_t
+ kUpscaleFilterUnsigned[kSuperResFilterShifts][kSuperResFilterTaps] = {
+ {0, 0, 0, 128, 0, 0, 0, 0}, {0, 0, 1, 128, 2, 1, 0, 0},
+ {0, 1, 3, 127, 4, 2, 1, 0}, {0, 1, 4, 127, 6, 3, 1, 0},
+ {0, 2, 6, 126, 8, 3, 1, 0}, {0, 2, 7, 125, 11, 4, 1, 0},
+ {1, 2, 8, 125, 13, 5, 2, 0}, {1, 3, 9, 124, 15, 6, 2, 0},
+ {1, 3, 10, 123, 18, 6, 2, 1}, {1, 3, 11, 122, 20, 7, 3, 1},
+ {1, 4, 12, 121, 22, 8, 3, 1}, {1, 4, 13, 120, 25, 9, 3, 1},
+ {1, 4, 14, 118, 28, 9, 3, 1}, {1, 4, 15, 117, 30, 10, 4, 1},
+ {1, 5, 16, 116, 32, 11, 4, 1}, {1, 5, 16, 114, 35, 12, 4, 1},
+ {1, 5, 17, 112, 38, 12, 4, 1}, {1, 5, 18, 111, 40, 13, 5, 1},
+ {1, 5, 18, 109, 43, 14, 5, 1}, {1, 6, 19, 107, 45, 14, 5, 1},
+ {1, 6, 19, 105, 48, 15, 5, 1}, {1, 6, 19, 103, 51, 16, 5, 1},
+ {1, 6, 20, 101, 53, 16, 6, 1}, {1, 6, 20, 99, 56, 17, 6, 1},
+ {1, 6, 20, 97, 58, 17, 6, 1}, {1, 6, 20, 95, 61, 18, 6, 1},
+ {2, 7, 20, 93, 64, 18, 6, 2}, {2, 7, 20, 91, 66, 19, 6, 1},
+ {2, 7, 20, 88, 69, 19, 6, 1}, {2, 7, 20, 86, 71, 19, 6, 1},
+ {2, 7, 20, 84, 74, 20, 7, 2}, {2, 7, 20, 81, 76, 20, 7, 1},
+ {2, 7, 20, 79, 79, 20, 7, 2}, {1, 7, 20, 76, 81, 20, 7, 2},
+ {2, 7, 20, 74, 84, 20, 7, 2}, {1, 6, 19, 71, 86, 20, 7, 2},
+ {1, 6, 19, 69, 88, 20, 7, 2}, {1, 6, 19, 66, 91, 20, 7, 2},
+ {2, 6, 18, 64, 93, 20, 7, 2}, {1, 6, 18, 61, 95, 20, 6, 1},
+ {1, 6, 17, 58, 97, 20, 6, 1}, {1, 6, 17, 56, 99, 20, 6, 1},
+ {1, 6, 16, 53, 101, 20, 6, 1}, {1, 5, 16, 51, 103, 19, 6, 1},
+ {1, 5, 15, 48, 105, 19, 6, 1}, {1, 5, 14, 45, 107, 19, 6, 1},
+ {1, 5, 14, 43, 109, 18, 5, 1}, {1, 5, 13, 40, 111, 18, 5, 1},
+ {1, 4, 12, 38, 112, 17, 5, 1}, {1, 4, 12, 35, 114, 16, 5, 1},
+ {1, 4, 11, 32, 116, 16, 5, 1}, {1, 4, 10, 30, 117, 15, 4, 1},
+ {1, 3, 9, 28, 118, 14, 4, 1}, {1, 3, 9, 25, 120, 13, 4, 1},
+ {1, 3, 8, 22, 121, 12, 4, 1}, {1, 3, 7, 20, 122, 11, 3, 1},
+ {1, 2, 6, 18, 123, 10, 3, 1}, {0, 2, 6, 15, 124, 9, 3, 1},
+ {0, 2, 5, 13, 125, 8, 2, 1}, {0, 1, 4, 11, 125, 7, 2, 0},
+ {0, 1, 3, 8, 126, 6, 2, 0}, {0, 1, 3, 6, 127, 4, 1, 0},
+ {0, 1, 2, 4, 127, 3, 1, 0}, {0, 0, 1, 2, 128, 1, 0, 0},
+};
+
+alignas(8) const int8_t
+ kWarpedFilters8[3 * kWarpedPixelPrecisionShifts + 1][8] = {
+ // [-1, 0).
+ {0, 0, 127, 1, 0, 0, 0, 0},
+ {0, -1, 127, 2, 0, 0, 0, 0},
+ {1, -3, 127, 4, -1, 0, 0, 0},
+ {1, -4, 126, 6, -2, 1, 0, 0},
+ {1, -5, 126, 8, -3, 1, 0, 0},
+ {1, -6, 125, 11, -4, 1, 0, 0},
+ {1, -7, 124, 13, -4, 1, 0, 0},
+ {2, -8, 123, 15, -5, 1, 0, 0},
+ {2, -9, 122, 18, -6, 1, 0, 0},
+ {2, -10, 121, 20, -6, 1, 0, 0},
+ {2, -11, 120, 22, -7, 2, 0, 0},
+ {2, -12, 119, 25, -8, 2, 0, 0},
+ {3, -13, 117, 27, -8, 2, 0, 0},
+ {3, -13, 116, 29, -9, 2, 0, 0},
+ {3, -14, 114, 32, -10, 3, 0, 0},
+ {3, -15, 113, 35, -10, 2, 0, 0},
+ {3, -15, 111, 37, -11, 3, 0, 0},
+ {3, -16, 109, 40, -11, 3, 0, 0},
+ {3, -16, 108, 42, -12, 3, 0, 0},
+ {4, -17, 106, 45, -13, 3, 0, 0},
+ {4, -17, 104, 47, -13, 3, 0, 0},
+ {4, -17, 102, 50, -14, 3, 0, 0},
+ {4, -17, 100, 52, -14, 3, 0, 0},
+ {4, -18, 98, 55, -15, 4, 0, 0},
+ {4, -18, 96, 58, -15, 3, 0, 0},
+ {4, -18, 94, 60, -16, 4, 0, 0},
+ {4, -18, 91, 63, -16, 4, 0, 0},
+ {4, -18, 89, 65, -16, 4, 0, 0},
+ {4, -18, 87, 68, -17, 4, 0, 0},
+ {4, -18, 85, 70, -17, 4, 0, 0},
+ {4, -18, 82, 73, -17, 4, 0, 0},
+ {4, -18, 80, 75, -17, 4, 0, 0},
+ {4, -18, 78, 78, -18, 4, 0, 0},
+ {4, -17, 75, 80, -18, 4, 0, 0},
+ {4, -17, 73, 82, -18, 4, 0, 0},
+ {4, -17, 70, 85, -18, 4, 0, 0},
+ {4, -17, 68, 87, -18, 4, 0, 0},
+ {4, -16, 65, 89, -18, 4, 0, 0},
+ {4, -16, 63, 91, -18, 4, 0, 0},
+ {4, -16, 60, 94, -18, 4, 0, 0},
+ {3, -15, 58, 96, -18, 4, 0, 0},
+ {4, -15, 55, 98, -18, 4, 0, 0},
+ {3, -14, 52, 100, -17, 4, 0, 0},
+ {3, -14, 50, 102, -17, 4, 0, 0},
+ {3, -13, 47, 104, -17, 4, 0, 0},
+ {3, -13, 45, 106, -17, 4, 0, 0},
+ {3, -12, 42, 108, -16, 3, 0, 0},
+ {3, -11, 40, 109, -16, 3, 0, 0},
+ {3, -11, 37, 111, -15, 3, 0, 0},
+ {2, -10, 35, 113, -15, 3, 0, 0},
+ {3, -10, 32, 114, -14, 3, 0, 0},
+ {2, -9, 29, 116, -13, 3, 0, 0},
+ {2, -8, 27, 117, -13, 3, 0, 0},
+ {2, -8, 25, 119, -12, 2, 0, 0},
+ {2, -7, 22, 120, -11, 2, 0, 0},
+ {1, -6, 20, 121, -10, 2, 0, 0},
+ {1, -6, 18, 122, -9, 2, 0, 0},
+ {1, -5, 15, 123, -8, 2, 0, 0},
+ {1, -4, 13, 124, -7, 1, 0, 0},
+ {1, -4, 11, 125, -6, 1, 0, 0},
+ {1, -3, 8, 126, -5, 1, 0, 0},
+ {1, -2, 6, 126, -4, 1, 0, 0},
+ {0, -1, 4, 127, -3, 1, 0, 0},
+ {0, 0, 2, 127, -1, 0, 0, 0},
+ // [0, 1).
+ {0, 0, 0, 127, 1, 0, 0, 0},
+ {0, 0, -1, 127, 2, 0, 0, 0},
+ {0, 1, -3, 127, 4, -2, 1, 0},
+ {0, 1, -5, 127, 6, -2, 1, 0},
+ {0, 2, -6, 126, 8, -3, 1, 0},
+ {-1, 2, -7, 126, 11, -4, 2, -1},
+ {-1, 3, -8, 125, 13, -5, 2, -1},
+ {-1, 3, -10, 124, 16, -6, 3, -1},
+ {-1, 4, -11, 123, 18, -7, 3, -1},
+ {-1, 4, -12, 122, 20, -7, 3, -1},
+ {-1, 4, -13, 121, 23, -8, 3, -1},
+ {-2, 5, -14, 120, 25, -9, 4, -1},
+ {-1, 5, -15, 119, 27, -10, 4, -1},
+ {-1, 5, -16, 118, 30, -11, 4, -1},
+ {-2, 6, -17, 116, 33, -12, 5, -1},
+ {-2, 6, -17, 114, 35, -12, 5, -1},
+ {-2, 6, -18, 113, 38, -13, 5, -1},
+ {-2, 7, -19, 111, 41, -14, 6, -2},
+ {-2, 7, -19, 110, 43, -15, 6, -2},
+ {-2, 7, -20, 108, 46, -15, 6, -2},
+ {-2, 7, -20, 106, 49, -16, 6, -2},
+ {-2, 7, -21, 104, 51, -16, 7, -2},
+ {-2, 7, -21, 102, 54, -17, 7, -2},
+ {-2, 8, -21, 100, 56, -18, 7, -2},
+ {-2, 8, -22, 98, 59, -18, 7, -2},
+ {-2, 8, -22, 96, 62, -19, 7, -2},
+ {-2, 8, -22, 94, 64, -19, 7, -2},
+ {-2, 8, -22, 91, 67, -20, 8, -2},
+ {-2, 8, -22, 89, 69, -20, 8, -2},
+ {-2, 8, -22, 87, 72, -21, 8, -2},
+ {-2, 8, -21, 84, 74, -21, 8, -2},
+ {-2, 8, -22, 82, 77, -21, 8, -2},
+ {-2, 8, -21, 79, 79, -21, 8, -2},
+ {-2, 8, -21, 77, 82, -22, 8, -2},
+ {-2, 8, -21, 74, 84, -21, 8, -2},
+ {-2, 8, -21, 72, 87, -22, 8, -2},
+ {-2, 8, -20, 69, 89, -22, 8, -2},
+ {-2, 8, -20, 67, 91, -22, 8, -2},
+ {-2, 7, -19, 64, 94, -22, 8, -2},
+ {-2, 7, -19, 62, 96, -22, 8, -2},
+ {-2, 7, -18, 59, 98, -22, 8, -2},
+ {-2, 7, -18, 56, 100, -21, 8, -2},
+ {-2, 7, -17, 54, 102, -21, 7, -2},
+ {-2, 7, -16, 51, 104, -21, 7, -2},
+ {-2, 6, -16, 49, 106, -20, 7, -2},
+ {-2, 6, -15, 46, 108, -20, 7, -2},
+ {-2, 6, -15, 43, 110, -19, 7, -2},
+ {-2, 6, -14, 41, 111, -19, 7, -2},
+ {-1, 5, -13, 38, 113, -18, 6, -2},
+ {-1, 5, -12, 35, 114, -17, 6, -2},
+ {-1, 5, -12, 33, 116, -17, 6, -2},
+ {-1, 4, -11, 30, 118, -16, 5, -1},
+ {-1, 4, -10, 27, 119, -15, 5, -1},
+ {-1, 4, -9, 25, 120, -14, 5, -2},
+ {-1, 3, -8, 23, 121, -13, 4, -1},
+ {-1, 3, -7, 20, 122, -12, 4, -1},
+ {-1, 3, -7, 18, 123, -11, 4, -1},
+ {-1, 3, -6, 16, 124, -10, 3, -1},
+ {-1, 2, -5, 13, 125, -8, 3, -1},
+ {-1, 2, -4, 11, 126, -7, 2, -1},
+ {0, 1, -3, 8, 126, -6, 2, 0},
+ {0, 1, -2, 6, 127, -5, 1, 0},
+ {0, 1, -2, 4, 127, -3, 1, 0},
+ {0, 0, 0, 2, 127, -1, 0, 0},
+ // [1, 2).
+ {0, 0, 0, 1, 127, 0, 0, 0},
+ {0, 0, 0, -1, 127, 2, 0, 0},
+ {0, 0, 1, -3, 127, 4, -1, 0},
+ {0, 0, 1, -4, 126, 6, -2, 1},
+ {0, 0, 1, -5, 126, 8, -3, 1},
+ {0, 0, 1, -6, 125, 11, -4, 1},
+ {0, 0, 1, -7, 124, 13, -4, 1},
+ {0, 0, 2, -8, 123, 15, -5, 1},
+ {0, 0, 2, -9, 122, 18, -6, 1},
+ {0, 0, 2, -10, 121, 20, -6, 1},
+ {0, 0, 2, -11, 120, 22, -7, 2},
+ {0, 0, 2, -12, 119, 25, -8, 2},
+ {0, 0, 3, -13, 117, 27, -8, 2},
+ {0, 0, 3, -13, 116, 29, -9, 2},
+ {0, 0, 3, -14, 114, 32, -10, 3},
+ {0, 0, 3, -15, 113, 35, -10, 2},
+ {0, 0, 3, -15, 111, 37, -11, 3},
+ {0, 0, 3, -16, 109, 40, -11, 3},
+ {0, 0, 3, -16, 108, 42, -12, 3},
+ {0, 0, 4, -17, 106, 45, -13, 3},
+ {0, 0, 4, -17, 104, 47, -13, 3},
+ {0, 0, 4, -17, 102, 50, -14, 3},
+ {0, 0, 4, -17, 100, 52, -14, 3},
+ {0, 0, 4, -18, 98, 55, -15, 4},
+ {0, 0, 4, -18, 96, 58, -15, 3},
+ {0, 0, 4, -18, 94, 60, -16, 4},
+ {0, 0, 4, -18, 91, 63, -16, 4},
+ {0, 0, 4, -18, 89, 65, -16, 4},
+ {0, 0, 4, -18, 87, 68, -17, 4},
+ {0, 0, 4, -18, 85, 70, -17, 4},
+ {0, 0, 4, -18, 82, 73, -17, 4},
+ {0, 0, 4, -18, 80, 75, -17, 4},
+ {0, 0, 4, -18, 78, 78, -18, 4},
+ {0, 0, 4, -17, 75, 80, -18, 4},
+ {0, 0, 4, -17, 73, 82, -18, 4},
+ {0, 0, 4, -17, 70, 85, -18, 4},
+ {0, 0, 4, -17, 68, 87, -18, 4},
+ {0, 0, 4, -16, 65, 89, -18, 4},
+ {0, 0, 4, -16, 63, 91, -18, 4},
+ {0, 0, 4, -16, 60, 94, -18, 4},
+ {0, 0, 3, -15, 58, 96, -18, 4},
+ {0, 0, 4, -15, 55, 98, -18, 4},
+ {0, 0, 3, -14, 52, 100, -17, 4},
+ {0, 0, 3, -14, 50, 102, -17, 4},
+ {0, 0, 3, -13, 47, 104, -17, 4},
+ {0, 0, 3, -13, 45, 106, -17, 4},
+ {0, 0, 3, -12, 42, 108, -16, 3},
+ {0, 0, 3, -11, 40, 109, -16, 3},
+ {0, 0, 3, -11, 37, 111, -15, 3},
+ {0, 0, 2, -10, 35, 113, -15, 3},
+ {0, 0, 3, -10, 32, 114, -14, 3},
+ {0, 0, 2, -9, 29, 116, -13, 3},
+ {0, 0, 2, -8, 27, 117, -13, 3},
+ {0, 0, 2, -8, 25, 119, -12, 2},
+ {0, 0, 2, -7, 22, 120, -11, 2},
+ {0, 0, 1, -6, 20, 121, -10, 2},
+ {0, 0, 1, -6, 18, 122, -9, 2},
+ {0, 0, 1, -5, 15, 123, -8, 2},
+ {0, 0, 1, -4, 13, 124, -7, 1},
+ {0, 0, 1, -4, 11, 125, -6, 1},
+ {0, 0, 1, -3, 8, 126, -5, 1},
+ {0, 0, 1, -2, 6, 126, -4, 1},
+ {0, 0, 0, -1, 4, 127, -3, 1},
+ {0, 0, 0, 0, 2, 127, -1, 0},
+ // dummy, replicate row index 191.
+ {0, 0, 0, 0, 2, 127, -1, 0}};
+
+alignas(16) const int16_t
+ kWarpedFilters[3 * kWarpedPixelPrecisionShifts + 1][8] = {
+ // [-1, 0).
+ {0, 0, 127, 1, 0, 0, 0, 0},
+ {0, -1, 127, 2, 0, 0, 0, 0},
+ {1, -3, 127, 4, -1, 0, 0, 0},
+ {1, -4, 126, 6, -2, 1, 0, 0},
+ {1, -5, 126, 8, -3, 1, 0, 0},
+ {1, -6, 125, 11, -4, 1, 0, 0},
+ {1, -7, 124, 13, -4, 1, 0, 0},
+ {2, -8, 123, 15, -5, 1, 0, 0},
+ {2, -9, 122, 18, -6, 1, 0, 0},
+ {2, -10, 121, 20, -6, 1, 0, 0},
+ {2, -11, 120, 22, -7, 2, 0, 0},
+ {2, -12, 119, 25, -8, 2, 0, 0},
+ {3, -13, 117, 27, -8, 2, 0, 0},
+ {3, -13, 116, 29, -9, 2, 0, 0},
+ {3, -14, 114, 32, -10, 3, 0, 0},
+ {3, -15, 113, 35, -10, 2, 0, 0},
+ {3, -15, 111, 37, -11, 3, 0, 0},
+ {3, -16, 109, 40, -11, 3, 0, 0},
+ {3, -16, 108, 42, -12, 3, 0, 0},
+ {4, -17, 106, 45, -13, 3, 0, 0},
+ {4, -17, 104, 47, -13, 3, 0, 0},
+ {4, -17, 102, 50, -14, 3, 0, 0},
+ {4, -17, 100, 52, -14, 3, 0, 0},
+ {4, -18, 98, 55, -15, 4, 0, 0},
+ {4, -18, 96, 58, -15, 3, 0, 0},
+ {4, -18, 94, 60, -16, 4, 0, 0},
+ {4, -18, 91, 63, -16, 4, 0, 0},
+ {4, -18, 89, 65, -16, 4, 0, 0},
+ {4, -18, 87, 68, -17, 4, 0, 0},
+ {4, -18, 85, 70, -17, 4, 0, 0},
+ {4, -18, 82, 73, -17, 4, 0, 0},
+ {4, -18, 80, 75, -17, 4, 0, 0},
+ {4, -18, 78, 78, -18, 4, 0, 0},
+ {4, -17, 75, 80, -18, 4, 0, 0},
+ {4, -17, 73, 82, -18, 4, 0, 0},
+ {4, -17, 70, 85, -18, 4, 0, 0},
+ {4, -17, 68, 87, -18, 4, 0, 0},
+ {4, -16, 65, 89, -18, 4, 0, 0},
+ {4, -16, 63, 91, -18, 4, 0, 0},
+ {4, -16, 60, 94, -18, 4, 0, 0},
+ {3, -15, 58, 96, -18, 4, 0, 0},
+ {4, -15, 55, 98, -18, 4, 0, 0},
+ {3, -14, 52, 100, -17, 4, 0, 0},
+ {3, -14, 50, 102, -17, 4, 0, 0},
+ {3, -13, 47, 104, -17, 4, 0, 0},
+ {3, -13, 45, 106, -17, 4, 0, 0},
+ {3, -12, 42, 108, -16, 3, 0, 0},
+ {3, -11, 40, 109, -16, 3, 0, 0},
+ {3, -11, 37, 111, -15, 3, 0, 0},
+ {2, -10, 35, 113, -15, 3, 0, 0},
+ {3, -10, 32, 114, -14, 3, 0, 0},
+ {2, -9, 29, 116, -13, 3, 0, 0},
+ {2, -8, 27, 117, -13, 3, 0, 0},
+ {2, -8, 25, 119, -12, 2, 0, 0},
+ {2, -7, 22, 120, -11, 2, 0, 0},
+ {1, -6, 20, 121, -10, 2, 0, 0},
+ {1, -6, 18, 122, -9, 2, 0, 0},
+ {1, -5, 15, 123, -8, 2, 0, 0},
+ {1, -4, 13, 124, -7, 1, 0, 0},
+ {1, -4, 11, 125, -6, 1, 0, 0},
+ {1, -3, 8, 126, -5, 1, 0, 0},
+ {1, -2, 6, 126, -4, 1, 0, 0},
+ {0, -1, 4, 127, -3, 1, 0, 0},
+ {0, 0, 2, 127, -1, 0, 0, 0},
+ // [0, 1).
+ {0, 0, 0, 127, 1, 0, 0, 0},
+ {0, 0, -1, 127, 2, 0, 0, 0},
+ {0, 1, -3, 127, 4, -2, 1, 0},
+ {0, 1, -5, 127, 6, -2, 1, 0},
+ {0, 2, -6, 126, 8, -3, 1, 0},
+ {-1, 2, -7, 126, 11, -4, 2, -1},
+ {-1, 3, -8, 125, 13, -5, 2, -1},
+ {-1, 3, -10, 124, 16, -6, 3, -1},
+ {-1, 4, -11, 123, 18, -7, 3, -1},
+ {-1, 4, -12, 122, 20, -7, 3, -1},
+ {-1, 4, -13, 121, 23, -8, 3, -1},
+ {-2, 5, -14, 120, 25, -9, 4, -1},
+ {-1, 5, -15, 119, 27, -10, 4, -1},
+ {-1, 5, -16, 118, 30, -11, 4, -1},
+ {-2, 6, -17, 116, 33, -12, 5, -1},
+ {-2, 6, -17, 114, 35, -12, 5, -1},
+ {-2, 6, -18, 113, 38, -13, 5, -1},
+ {-2, 7, -19, 111, 41, -14, 6, -2},
+ {-2, 7, -19, 110, 43, -15, 6, -2},
+ {-2, 7, -20, 108, 46, -15, 6, -2},
+ {-2, 7, -20, 106, 49, -16, 6, -2},
+ {-2, 7, -21, 104, 51, -16, 7, -2},
+ {-2, 7, -21, 102, 54, -17, 7, -2},
+ {-2, 8, -21, 100, 56, -18, 7, -2},
+ {-2, 8, -22, 98, 59, -18, 7, -2},
+ {-2, 8, -22, 96, 62, -19, 7, -2},
+ {-2, 8, -22, 94, 64, -19, 7, -2},
+ {-2, 8, -22, 91, 67, -20, 8, -2},
+ {-2, 8, -22, 89, 69, -20, 8, -2},
+ {-2, 8, -22, 87, 72, -21, 8, -2},
+ {-2, 8, -21, 84, 74, -21, 8, -2},
+ {-2, 8, -22, 82, 77, -21, 8, -2},
+ {-2, 8, -21, 79, 79, -21, 8, -2},
+ {-2, 8, -21, 77, 82, -22, 8, -2},
+ {-2, 8, -21, 74, 84, -21, 8, -2},
+ {-2, 8, -21, 72, 87, -22, 8, -2},
+ {-2, 8, -20, 69, 89, -22, 8, -2},
+ {-2, 8, -20, 67, 91, -22, 8, -2},
+ {-2, 7, -19, 64, 94, -22, 8, -2},
+ {-2, 7, -19, 62, 96, -22, 8, -2},
+ {-2, 7, -18, 59, 98, -22, 8, -2},
+ {-2, 7, -18, 56, 100, -21, 8, -2},
+ {-2, 7, -17, 54, 102, -21, 7, -2},
+ {-2, 7, -16, 51, 104, -21, 7, -2},
+ {-2, 6, -16, 49, 106, -20, 7, -2},
+ {-2, 6, -15, 46, 108, -20, 7, -2},
+ {-2, 6, -15, 43, 110, -19, 7, -2},
+ {-2, 6, -14, 41, 111, -19, 7, -2},
+ {-1, 5, -13, 38, 113, -18, 6, -2},
+ {-1, 5, -12, 35, 114, -17, 6, -2},
+ {-1, 5, -12, 33, 116, -17, 6, -2},
+ {-1, 4, -11, 30, 118, -16, 5, -1},
+ {-1, 4, -10, 27, 119, -15, 5, -1},
+ {-1, 4, -9, 25, 120, -14, 5, -2},
+ {-1, 3, -8, 23, 121, -13, 4, -1},
+ {-1, 3, -7, 20, 122, -12, 4, -1},
+ {-1, 3, -7, 18, 123, -11, 4, -1},
+ {-1, 3, -6, 16, 124, -10, 3, -1},
+ {-1, 2, -5, 13, 125, -8, 3, -1},
+ {-1, 2, -4, 11, 126, -7, 2, -1},
+ {0, 1, -3, 8, 126, -6, 2, 0},
+ {0, 1, -2, 6, 127, -5, 1, 0},
+ {0, 1, -2, 4, 127, -3, 1, 0},
+ {0, 0, 0, 2, 127, -1, 0, 0},
+ // [1, 2).
+ {0, 0, 0, 1, 127, 0, 0, 0},
+ {0, 0, 0, -1, 127, 2, 0, 0},
+ {0, 0, 1, -3, 127, 4, -1, 0},
+ {0, 0, 1, -4, 126, 6, -2, 1},
+ {0, 0, 1, -5, 126, 8, -3, 1},
+ {0, 0, 1, -6, 125, 11, -4, 1},
+ {0, 0, 1, -7, 124, 13, -4, 1},
+ {0, 0, 2, -8, 123, 15, -5, 1},
+ {0, 0, 2, -9, 122, 18, -6, 1},
+ {0, 0, 2, -10, 121, 20, -6, 1},
+ {0, 0, 2, -11, 120, 22, -7, 2},
+ {0, 0, 2, -12, 119, 25, -8, 2},
+ {0, 0, 3, -13, 117, 27, -8, 2},
+ {0, 0, 3, -13, 116, 29, -9, 2},
+ {0, 0, 3, -14, 114, 32, -10, 3},
+ {0, 0, 3, -15, 113, 35, -10, 2},
+ {0, 0, 3, -15, 111, 37, -11, 3},
+ {0, 0, 3, -16, 109, 40, -11, 3},
+ {0, 0, 3, -16, 108, 42, -12, 3},
+ {0, 0, 4, -17, 106, 45, -13, 3},
+ {0, 0, 4, -17, 104, 47, -13, 3},
+ {0, 0, 4, -17, 102, 50, -14, 3},
+ {0, 0, 4, -17, 100, 52, -14, 3},
+ {0, 0, 4, -18, 98, 55, -15, 4},
+ {0, 0, 4, -18, 96, 58, -15, 3},
+ {0, 0, 4, -18, 94, 60, -16, 4},
+ {0, 0, 4, -18, 91, 63, -16, 4},
+ {0, 0, 4, -18, 89, 65, -16, 4},
+ {0, 0, 4, -18, 87, 68, -17, 4},
+ {0, 0, 4, -18, 85, 70, -17, 4},
+ {0, 0, 4, -18, 82, 73, -17, 4},
+ {0, 0, 4, -18, 80, 75, -17, 4},
+ {0, 0, 4, -18, 78, 78, -18, 4},
+ {0, 0, 4, -17, 75, 80, -18, 4},
+ {0, 0, 4, -17, 73, 82, -18, 4},
+ {0, 0, 4, -17, 70, 85, -18, 4},
+ {0, 0, 4, -17, 68, 87, -18, 4},
+ {0, 0, 4, -16, 65, 89, -18, 4},
+ {0, 0, 4, -16, 63, 91, -18, 4},
+ {0, 0, 4, -16, 60, 94, -18, 4},
+ {0, 0, 3, -15, 58, 96, -18, 4},
+ {0, 0, 4, -15, 55, 98, -18, 4},
+ {0, 0, 3, -14, 52, 100, -17, 4},
+ {0, 0, 3, -14, 50, 102, -17, 4},
+ {0, 0, 3, -13, 47, 104, -17, 4},
+ {0, 0, 3, -13, 45, 106, -17, 4},
+ {0, 0, 3, -12, 42, 108, -16, 3},
+ {0, 0, 3, -11, 40, 109, -16, 3},
+ {0, 0, 3, -11, 37, 111, -15, 3},
+ {0, 0, 2, -10, 35, 113, -15, 3},
+ {0, 0, 3, -10, 32, 114, -14, 3},
+ {0, 0, 2, -9, 29, 116, -13, 3},
+ {0, 0, 2, -8, 27, 117, -13, 3},
+ {0, 0, 2, -8, 25, 119, -12, 2},
+ {0, 0, 2, -7, 22, 120, -11, 2},
+ {0, 0, 1, -6, 20, 121, -10, 2},
+ {0, 0, 1, -6, 18, 122, -9, 2},
+ {0, 0, 1, -5, 15, 123, -8, 2},
+ {0, 0, 1, -4, 13, 124, -7, 1},
+ {0, 0, 1, -4, 11, 125, -6, 1},
+ {0, 0, 1, -3, 8, 126, -5, 1},
+ {0, 0, 1, -2, 6, 126, -4, 1},
+ {0, 0, 0, -1, 4, 127, -3, 1},
+ {0, 0, 0, 0, 2, 127, -1, 0},
+ // dummy, replicate row index 191.
+ {0, 0, 0, 0, 2, 127, -1, 0}};
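+
+// A consistency check (illustrative) on the dimensions of the two warped
+// filter tables above: three whole-pixel intervals of (1 << 6) phases each,
+// plus the replicated dummy row. Every 8-tap row sums to 128, i.e. the
+// filters have a 7-bit DC gain.
+static_assert(3 * kWarpedPixelPrecisionShifts + 1 == 193,
+              "Unexpected warped filter table row count.");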
+
+// Every value in |kSubPixelFilters| is even. Divide by 2 to simplify
+// calculations by reducing the range by 1 bit.
+alignas(8) const int8_t kHalfSubPixelFilters[6][16][8] = {
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 1, -3, 63, 4, -1, 0, 0},
+ {0, 1, -5, 61, 9, -2, 0, 0},
+ {0, 1, -6, 58, 14, -4, 1, 0},
+ {0, 1, -7, 55, 19, -5, 1, 0},
+ {0, 1, -7, 51, 24, -6, 1, 0},
+ {0, 1, -8, 47, 29, -6, 1, 0},
+ {0, 1, -7, 42, 33, -6, 1, 0},
+ {0, 1, -7, 38, 38, -7, 1, 0},
+ {0, 1, -6, 33, 42, -7, 1, 0},
+ {0, 1, -6, 29, 47, -8, 1, 0},
+ {0, 1, -6, 24, 51, -7, 1, 0},
+ {0, 1, -5, 19, 55, -7, 1, 0},
+ {0, 1, -4, 14, 58, -6, 1, 0},
+ {0, 0, -2, 9, 61, -5, 1, 0},
+ {0, 0, -1, 4, 63, -3, 1, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 1, 14, 31, 17, 1, 0, 0},
+ {0, 0, 13, 31, 18, 2, 0, 0},
+ {0, 0, 11, 31, 20, 2, 0, 0},
+ {0, 0, 10, 30, 21, 3, 0, 0},
+ {0, 0, 9, 29, 22, 4, 0, 0},
+ {0, 0, 8, 28, 23, 5, 0, 0},
+ {0, -1, 8, 27, 24, 6, 0, 0},
+ {0, -1, 7, 26, 26, 7, -1, 0},
+ {0, 0, 6, 24, 27, 8, -1, 0},
+ {0, 0, 5, 23, 28, 8, 0, 0},
+ {0, 0, 4, 22, 29, 9, 0, 0},
+ {0, 0, 3, 21, 30, 10, 0, 0},
+ {0, 0, 2, 20, 31, 11, 0, 0},
+ {0, 0, 2, 18, 31, 13, 0, 0},
+ {0, 0, 1, 17, 31, 14, 1, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {-1, 1, -3, 63, 4, -1, 1, 0},
+ {-1, 3, -6, 62, 8, -3, 2, -1},
+ {-1, 4, -9, 60, 13, -5, 3, -1},
+ {-2, 5, -11, 58, 19, -7, 3, -1},
+ {-2, 5, -11, 54, 24, -9, 4, -1},
+ {-2, 5, -12, 50, 30, -10, 4, -1},
+ {-2, 5, -12, 45, 35, -11, 5, -1},
+ {-2, 6, -12, 40, 40, -12, 6, -2},
+ {-1, 5, -11, 35, 45, -12, 5, -2},
+ {-1, 4, -10, 30, 50, -12, 5, -2},
+ {-1, 4, -9, 24, 54, -11, 5, -2},
+ {-1, 3, -7, 19, 58, -11, 5, -2},
+ {-1, 3, -5, 13, 60, -9, 4, -1},
+ {-1, 2, -3, 8, 62, -6, 3, -1},
+ {0, 1, -1, 4, 63, -3, 1, -1}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 0, 60, 4, 0, 0, 0},
+ {0, 0, 0, 56, 8, 0, 0, 0},
+ {0, 0, 0, 52, 12, 0, 0, 0},
+ {0, 0, 0, 48, 16, 0, 0, 0},
+ {0, 0, 0, 44, 20, 0, 0, 0},
+ {0, 0, 0, 40, 24, 0, 0, 0},
+ {0, 0, 0, 36, 28, 0, 0, 0},
+ {0, 0, 0, 32, 32, 0, 0, 0},
+ {0, 0, 0, 28, 36, 0, 0, 0},
+ {0, 0, 0, 24, 40, 0, 0, 0},
+ {0, 0, 0, 20, 44, 0, 0, 0},
+ {0, 0, 0, 16, 48, 0, 0, 0},
+ {0, 0, 0, 12, 52, 0, 0, 0},
+ {0, 0, 0, 8, 56, 0, 0, 0},
+ {0, 0, 0, 4, 60, 0, 0, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, -2, 63, 4, -1, 0, 0},
+ {0, 0, -4, 61, 9, -2, 0, 0},
+ {0, 0, -5, 58, 14, -3, 0, 0},
+ {0, 0, -6, 55, 19, -4, 0, 0},
+ {0, 0, -6, 51, 24, -5, 0, 0},
+ {0, 0, -7, 47, 29, -5, 0, 0},
+ {0, 0, -6, 42, 33, -5, 0, 0},
+ {0, 0, -6, 38, 38, -6, 0, 0},
+ {0, 0, -5, 33, 42, -6, 0, 0},
+ {0, 0, -5, 29, 47, -7, 0, 0},
+ {0, 0, -5, 24, 51, -6, 0, 0},
+ {0, 0, -4, 19, 55, -6, 0, 0},
+ {0, 0, -3, 14, 58, -5, 0, 0},
+ {0, 0, -2, 9, 61, -4, 0, 0},
+ {0, 0, -1, 4, 63, -2, 0, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 15, 31, 17, 1, 0, 0},
+ {0, 0, 13, 31, 18, 2, 0, 0},
+ {0, 0, 11, 31, 20, 2, 0, 0},
+ {0, 0, 10, 30, 21, 3, 0, 0},
+ {0, 0, 9, 29, 22, 4, 0, 0},
+ {0, 0, 8, 28, 23, 5, 0, 0},
+ {0, 0, 7, 27, 24, 6, 0, 0},
+ {0, 0, 6, 26, 26, 6, 0, 0},
+ {0, 0, 6, 24, 27, 7, 0, 0},
+ {0, 0, 5, 23, 28, 8, 0, 0},
+ {0, 0, 4, 22, 29, 9, 0, 0},
+ {0, 0, 3, 21, 30, 10, 0, 0},
+ {0, 0, 2, 20, 31, 11, 0, 0},
+ {0, 0, 2, 18, 31, 13, 0, 0},
+ {0, 0, 1, 17, 31, 15, 0, 0}}};
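+
+// A minimal usage sketch (illustrative only; the real convolution
+// implementations live in src/dsp/): applying one 8-tap half filter at a
+// horizontal sub pixel position. Because every tap is half of its
+// |kSubPixelFilters| counterpart, the DC gain is 64 rather than 128, so the
+// rounding shift is kFilterBits - 1. Clamping of the result is omitted.
+inline int ApplyHalfFilterSketch(const uint8_t* const src, int filter_index,
+                                 int subpixel) {
+  int sum = 0;
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    // |src| points at the pixel being filtered; the taps span [-3, 4].
+    sum += kHalfSubPixelFilters[filter_index][subpixel][k] * src[k - 3];
+  }
+  return (sum + 32) >> (kFilterBits - 1);  // 32 == 1 << (kFilterBits - 2).
+}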
+
+// Absolute values of |kHalfSubPixelFilters|. Used in situations where we know
+// the pattern of the signs and account for it in other ways.
+const uint8_t kAbsHalfSubPixelFilters[6][16][8] = {
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 1, 3, 63, 4, 1, 0, 0},
+ {0, 1, 5, 61, 9, 2, 0, 0},
+ {0, 1, 6, 58, 14, 4, 1, 0},
+ {0, 1, 7, 55, 19, 5, 1, 0},
+ {0, 1, 7, 51, 24, 6, 1, 0},
+ {0, 1, 8, 47, 29, 6, 1, 0},
+ {0, 1, 7, 42, 33, 6, 1, 0},
+ {0, 1, 7, 38, 38, 7, 1, 0},
+ {0, 1, 6, 33, 42, 7, 1, 0},
+ {0, 1, 6, 29, 47, 8, 1, 0},
+ {0, 1, 6, 24, 51, 7, 1, 0},
+ {0, 1, 5, 19, 55, 7, 1, 0},
+ {0, 1, 4, 14, 58, 6, 1, 0},
+ {0, 0, 2, 9, 61, 5, 1, 0},
+ {0, 0, 1, 4, 63, 3, 1, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 1, 14, 31, 17, 1, 0, 0},
+ {0, 0, 13, 31, 18, 2, 0, 0},
+ {0, 0, 11, 31, 20, 2, 0, 0},
+ {0, 0, 10, 30, 21, 3, 0, 0},
+ {0, 0, 9, 29, 22, 4, 0, 0},
+ {0, 0, 8, 28, 23, 5, 0, 0},
+ {0, 1, 8, 27, 24, 6, 0, 0},
+ {0, 1, 7, 26, 26, 7, 1, 0},
+ {0, 0, 6, 24, 27, 8, 1, 0},
+ {0, 0, 5, 23, 28, 8, 0, 0},
+ {0, 0, 4, 22, 29, 9, 0, 0},
+ {0, 0, 3, 21, 30, 10, 0, 0},
+ {0, 0, 2, 20, 31, 11, 0, 0},
+ {0, 0, 2, 18, 31, 13, 0, 0},
+ {0, 0, 1, 17, 31, 14, 1, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {1, 1, 3, 63, 4, 1, 1, 0},
+ {1, 3, 6, 62, 8, 3, 2, 1},
+ {1, 4, 9, 60, 13, 5, 3, 1},
+ {2, 5, 11, 58, 19, 7, 3, 1},
+ {2, 5, 11, 54, 24, 9, 4, 1},
+ {2, 5, 12, 50, 30, 10, 4, 1},
+ {2, 5, 12, 45, 35, 11, 5, 1},
+ {2, 6, 12, 40, 40, 12, 6, 2},
+ {1, 5, 11, 35, 45, 12, 5, 2},
+ {1, 4, 10, 30, 50, 12, 5, 2},
+ {1, 4, 9, 24, 54, 11, 5, 2},
+ {1, 3, 7, 19, 58, 11, 5, 2},
+ {1, 3, 5, 13, 60, 9, 4, 1},
+ {1, 2, 3, 8, 62, 6, 3, 1},
+ {0, 1, 1, 4, 63, 3, 1, 1}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 0, 60, 4, 0, 0, 0},
+ {0, 0, 0, 56, 8, 0, 0, 0},
+ {0, 0, 0, 52, 12, 0, 0, 0},
+ {0, 0, 0, 48, 16, 0, 0, 0},
+ {0, 0, 0, 44, 20, 0, 0, 0},
+ {0, 0, 0, 40, 24, 0, 0, 0},
+ {0, 0, 0, 36, 28, 0, 0, 0},
+ {0, 0, 0, 32, 32, 0, 0, 0},
+ {0, 0, 0, 28, 36, 0, 0, 0},
+ {0, 0, 0, 24, 40, 0, 0, 0},
+ {0, 0, 0, 20, 44, 0, 0, 0},
+ {0, 0, 0, 16, 48, 0, 0, 0},
+ {0, 0, 0, 12, 52, 0, 0, 0},
+ {0, 0, 0, 8, 56, 0, 0, 0},
+ {0, 0, 0, 4, 60, 0, 0, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 2, 63, 4, 1, 0, 0},
+ {0, 0, 4, 61, 9, 2, 0, 0},
+ {0, 0, 5, 58, 14, 3, 0, 0},
+ {0, 0, 6, 55, 19, 4, 0, 0},
+ {0, 0, 6, 51, 24, 5, 0, 0},
+ {0, 0, 7, 47, 29, 5, 0, 0},
+ {0, 0, 6, 42, 33, 5, 0, 0},
+ {0, 0, 6, 38, 38, 6, 0, 0},
+ {0, 0, 5, 33, 42, 6, 0, 0},
+ {0, 0, 5, 29, 47, 7, 0, 0},
+ {0, 0, 5, 24, 51, 6, 0, 0},
+ {0, 0, 4, 19, 55, 6, 0, 0},
+ {0, 0, 3, 14, 58, 5, 0, 0},
+ {0, 0, 2, 9, 61, 4, 0, 0},
+ {0, 0, 1, 4, 63, 2, 0, 0}},
+ {{0, 0, 0, 64, 0, 0, 0, 0},
+ {0, 0, 15, 31, 17, 1, 0, 0},
+ {0, 0, 13, 31, 18, 2, 0, 0},
+ {0, 0, 11, 31, 20, 2, 0, 0},
+ {0, 0, 10, 30, 21, 3, 0, 0},
+ {0, 0, 9, 29, 22, 4, 0, 0},
+ {0, 0, 8, 28, 23, 5, 0, 0},
+ {0, 0, 7, 27, 24, 6, 0, 0},
+ {0, 0, 6, 26, 26, 6, 0, 0},
+ {0, 0, 6, 24, 27, 7, 0, 0},
+ {0, 0, 5, 23, 28, 8, 0, 0},
+ {0, 0, 4, 22, 29, 9, 0, 0},
+ {0, 0, 3, 21, 30, 10, 0, 0},
+ {0, 0, 2, 20, 31, 11, 0, 0},
+ {0, 0, 2, 18, 31, 13, 0, 0},
+ {0, 0, 1, 17, 31, 15, 0, 0}}};
+
+// 9.3 -- Dr_Intra_Derivative[]
+// This is a more compact version of the table from the spec. angle / 2 - 1 is
+// used as the lookup. Note angle / 3 - 1 would work too, but the calculation
+// becomes more costly.
+const int16_t kDirectionalIntraPredictorDerivative[44] = {
+ // Approx angle
+ 1023, 0, // 3, ...
+ 547, // 6, ...
+ 372, 0, 0, // 9, ...
+ 273, // 14, ...
+ 215, 0, // 17, ...
+ 178, // 20, ...
+ 151, 0, // 23, ... (113 & 203 are base angles)
+ 132, // 26, ...
+ 116, 0, // 29, ...
+ 102, 0, // 32, ...
+ 90, // 36, ...
+ 80, 0, // 39, ...
+ 71, // 42, ...
+ 64, 0, // 45, ... (45 & 135 are base angles)
+ 57, // 48, ...
+ 51, 0, // 51, ...
+ 45, 0, // 54, ...
+ 40, // 58, ...
+ 35, 0, // 61, ...
+ 31, // 64, ...
+ 27, 0, // 67, ... (67 & 157 are base angles)
+ 23, // 70, ...
+ 19, 0, // 73, ...
+ 15, 0, // 76, ...
+ 11, 0, // 81, ...
+ 7, // 84, ...
+ 3, // 87, ...
+};
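+
+// A usage sketch (illustrative only): the derivative for a signaled
+// prediction angle is fetched as below; the zero entries are padding for
+// angle values that are never used as lookups.
+inline int16_t IntraDerivativeSketch(int angle) {
+  return kDirectionalIntraPredictorDerivative[angle / 2 - 1];
+}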
+
+const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes] = {
+ {0, 1}, {2, 2}, {3, 3}};
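+
+// (Illustrative reading of the table above: the Y plane uses separate level
+// indices for vertical and horizontal filtering, while each chroma plane
+// uses a single index for both directions.)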
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_CONSTANTS_H_
+#define LIBGAV1_SRC_UTILS_CONSTANTS_H_
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/utils/bit_mask_set.h"
+
+namespace libgav1 {
+
+// Returns the number of elements between begin (inclusive) and end (inclusive).
+constexpr int EnumRangeLength(int begin, int end) { return end - begin + 1; }
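+
+// For example (illustrative): both endpoints are counted, so a run of four
+// consecutive enumerators has length 4.
+static_assert(EnumRangeLength(3, 6) == 4, "");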
+
+enum {
+// Maximum number of threads that the library will ever create.
+#if defined(LIBGAV1_MAX_THREADS) && LIBGAV1_MAX_THREADS > 0
+ kMaxThreads = LIBGAV1_MAX_THREADS
+#else
+ kMaxThreads = 128
+#endif
+}; // anonymous enum
+
+enum {
+ // Documentation variables.
+ kBitdepth8 = 8,
+ kBitdepth10 = 10,
+ kBitdepth12 = 12,
+ kInvalidMvValue = -32768,
+ kCdfMaxProbability = 32768,
+ kBlockWidthCount = 5,
+ kMaxSegments = 8,
+ kMinQuantizer = 0,
+ kMinLossyQuantizer = 1,
+ kMaxQuantizer = 255,
+ // Quantizer matrix is used only when level < 15.
+ kNumQuantizerLevelsForQuantizerMatrix = 15,
+ kFrameLfCount = 4,
+ kMaxLoopFilterValue = 63,
+ kNum4x4In64x64 = 256,
+ kMaxAngleDelta = 3,
+ kDirectionalIntraModes = 8,
+ kMaxSuperBlockSizeLog2 = 7,
+ kMinSuperBlockSizeLog2 = 6,
+ kGlobalMotionReadControl = 3,
+ kSuperResScaleNumerator = 8,
+ kBooleanSymbolCount = 2,
+ kRestorationTypeSymbolCount = 3,
+ kSgrProjParamsBits = 4,
+ kSgrProjPrecisionBits = 7,
+ // Precision of a division table (mtable)
+ kSgrProjScaleBits = 20,
+ kSgrProjReciprocalBits = 12,
+ // Core self-guided restoration precision bits.
+ kSgrProjSgrBits = 8,
+ // Precision bits of generated values higher than source before projection.
+ kSgrProjRestoreBits = 4,
+ // Padding on left and right side of a restoration block.
+ // 3 is enough, but padding to 4 is more efficient, and makes the temporary
+ // source buffer 8-pixel aligned.
+ kRestorationHorizontalBorder = 4,
+ // Padding on top and bottom side of a restoration block.
+ kRestorationVerticalBorder = 2,
+ kCdefBorder = 2, // Padding on each side of a cdef block.
+ kConvolveBorderLeftTop = 3, // Left/top padding of a convolve block.
+ // Right/bottom padding of a convolve block. This needs to be 4 at minimum,
+ // but was increased to simplify the SIMD loads in
+ // ConvolveCompoundScale2D_NEON() and ConvolveScale2D_NEON().
+ kConvolveBorderRight = 8,
+ kConvolveScaleBorderRight = 15,
+ kConvolveBorderBottom = 4,
+ kSubPixelTaps = 8,
+ kWienerFilterBits = 7,
+ kWienerFilterTaps = 7,
+ kMaxPaletteSize = 8,
+ kMinPaletteSize = 2,
+ kMaxPaletteSquare = 64,
+ kBorderPixels = 64,
+ // The final blending process for film grain needs room to overwrite and read
+ // with SIMD instructions. The maximum overwrite is 7 pixels, but the border
+ // is required to be a multiple of 32 by YuvBuffer::Realloc, so that
+ // subsampled chroma borders are 16-aligned.
+ kBorderPixelsFilmGrain = 32,
+ // These constants are the minimum left, right, top, and bottom border sizes
+ // in pixels as an extension of the frame boundary. The minimum border sizes
+ // are derived from the following requirements:
+ // - Warp_C() may read up to 13 pixels before or after a row.
+ // - Warp_NEON() may read up to 13 pixels before a row. It may read up to 14
+ // pixels after a row, but the value of the last read pixel is not used.
+ // - Warp_C() and Warp_NEON() may read up to 13 pixels above the top row and
+ // 13 pixels below the bottom row.
+ kMinLeftBorderPixels = 13,
+ kMinRightBorderPixels = 13,
+ kMinTopBorderPixels = 13,
+ kMinBottomBorderPixels = 13,
+ kWarpedModelPrecisionBits = 16,
+ kMaxRefMvStackSize = 8,
+ kMaxLeastSquaresSamples = 8,
+ kMaxTemporalMvCandidates = 19,
+  // The SIMD implementations of motion vector projection functions always
+ // process 2 or 4 elements together, so we pad the corresponding buffers to
+ // size 20.
+ kMaxTemporalMvCandidatesWithPadding = 20,
+ kMaxSuperBlockSizeInPixels = 128,
+ kMaxScaledSuperBlockSizeInPixels = 128 * 2,
+ kMaxSuperBlockSizeSquareInPixels = 128 * 128,
+ kNum4x4InLoopFilterUnit = 16,
+ kNum4x4InLoopRestorationUnit = 16,
+ kProjectionMvClamp = (1 << 14) - 1, // == 16383
+ kProjectionMvMaxHorizontalOffset = 8,
+ kCdefUnitSize = 64,
+ kCdefUnitSizeWithBorders = kCdefUnitSize + 2 * kCdefBorder,
+ kRestorationUnitOffset = 8,
+ // Loop restoration's processing unit size is fixed as 64x64.
+ kRestorationUnitHeight = 64,
+ kRestorationUnitWidth = 256,
+ kRestorationUnitHeightWithBorders =
+ kRestorationUnitHeight + 2 * kRestorationVerticalBorder,
+ kRestorationUnitWidthWithBorders =
+ kRestorationUnitWidth + 2 * kRestorationHorizontalBorder,
+ kSuperResFilterBits = 6,
+ kSuperResFilterShifts = 1 << kSuperResFilterBits,
+ kSuperResFilterTaps = 8,
+ kSuperResScaleBits = 14,
+ kSuperResExtraBits = kSuperResScaleBits - kSuperResFilterBits,
+ kSuperResScaleMask = (1 << 14) - 1,
+ kSuperResHorizontalBorder = 4,
+ kSuperResVerticalBorder = 1,
+ // The SIMD implementations of superres calculate up to 15 extra upscaled
+ // pixels which will over-read up to 15 downscaled pixels in the end of each
+ // row. Set the padding to 16 for alignment purposes.
+ kSuperResHorizontalPadding = 16,
+  // TODO(chengchen): consider merging these constants:
+  // kFilterBits, kWienerFilterBits, and kSgrProjPrecisionBits, which are all
+  // 7. They are designed to match AV1 convolution, which increases
+  // coefficient values by up to 7 bits. We could consider combining them and
+  // using kFilterBits only.
+ kFilterBits = 7,
+  // A sub pixel in AV1 represents a pixel location that is not at an integer
+  // position. Sub pixel positions are expressed in 1/16 (1 << kSubPixelBits)
+  // units of an integer pixel. Sub pixel values are interpolated from
+  // adjacent integer pixel values; the interpolation is a filtering process.
+ kSubPixelBits = 4,
+ kSubPixelMask = (1 << kSubPixelBits) - 1,
+ // Precision bits when computing inter prediction locations.
+ kScaleSubPixelBits = 10,
+ kWarpParamRoundingBits = 6,
+ // Number of fractional bits of lookup in divisor lookup table.
+ kDivisorLookupBits = 8,
+ // Number of fractional bits of entries in divisor lookup table.
+ kDivisorLookupPrecisionBits = 14,
+ // Number of phases used in warped filtering.
+ kWarpedPixelPrecisionShifts = 1 << 6,
+ kResidualPaddingVertical = 4,
+ kWedgeMaskMasterSize = 64,
+ kMaxFrameDistance = 31,
+ kReferenceFrameScalePrecision = 14,
+ kNumWienerCoefficients = 3,
+ kLoopFilterMaxModeDeltas = 2,
+ kMaxCdefStrengths = 8,
+ kCdefLargeValue = 0x4000, // Used to indicate where CDEF is not available.
+ kMaxTileColumns = 64,
+ kMaxTileRows = 64,
+ kMaxOperatingPoints = 32,
+ // There can be a maximum of 4 spatial layers and 8 temporal layers.
+ kMaxLayers = 32,
+  // The cache line size should ideally be queried at run time. 64 is a common
+  // cache line size of x86 CPUs. Web searches showed the cache line size of
+  // ARM CPUs is 32 or 64 bytes. So aligning to a 64-byte boundary will work
+  // for all CPUs that we care about, even though it is excessive for some ARM
+  // CPUs.
+ //
+ // On Linux, the cache line size can be looked up with the command:
+ // getconf LEVEL1_DCACHE_LINESIZE
+ kCacheLineSize = 64,
+ // InterRound0, Section 7.11.3.2.
+ kInterRoundBitsHorizontal = 3, // 8 & 10-bit.
+ kInterRoundBitsHorizontal12bpp = 5,
+ kInterRoundBitsCompoundVertical = 7, // 8, 10 & 12-bit compound prediction.
+ kInterRoundBitsVertical = 11, // 8 & 10-bit, single prediction.
+ kInterRoundBitsVertical12bpp = 9,
+ // Offset applied to 10bpp and 12bpp predictors to allow storing them in
+ // uint16_t. Removed before blending.
+ kCompoundOffset = (1 << 14) + (1 << 13),
+}; // anonymous enum
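+
+// A few illustrative compile-time checks on the derived constants above (not
+// exhaustive):
+static_assert(kCompoundOffset == 24576, "");
+static_assert(kSuperResScaleMask == (1 << kSuperResScaleBits) - 1, "");
+static_assert(kCdefUnitSizeWithBorders == 68, "");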
+
+enum FrameType : uint8_t {
+ kFrameKey,
+ kFrameInter,
+ kFrameIntraOnly,
+ kFrameSwitch
+};
+
+enum Plane : uint8_t { kPlaneY, kPlaneU, kPlaneV };
+enum : uint8_t { kMaxPlanesMonochrome = kPlaneY + 1, kMaxPlanes = kPlaneV + 1 };
+
+// The plane types, called luma and chroma in the spec.
+enum PlaneType : uint8_t { kPlaneTypeY, kPlaneTypeUV, kNumPlaneTypes };
+
+enum ReferenceFrameType : int8_t {
+ kReferenceFrameNone = -1,
+ kReferenceFrameIntra,
+ kReferenceFrameLast,
+ kReferenceFrameLast2,
+ kReferenceFrameLast3,
+ kReferenceFrameGolden,
+ kReferenceFrameBackward,
+ kReferenceFrameAlternate2,
+ kReferenceFrameAlternate,
+ kNumReferenceFrameTypes,
+ kNumInterReferenceFrameTypes =
+ EnumRangeLength(kReferenceFrameLast, kReferenceFrameAlternate),
+ kNumForwardReferenceTypes =
+ EnumRangeLength(kReferenceFrameLast, kReferenceFrameGolden),
+ kNumBackwardReferenceTypes =
+ EnumRangeLength(kReferenceFrameBackward, kReferenceFrameAlternate)
+};
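+
+// Illustrative consequences of the ranges above: 4 forward references (Last
+// through Golden), 3 backward references (Backward through Alternate), and 7
+// inter references in total.
+static_assert(kNumForwardReferenceTypes == 4, "");
+static_assert(kNumBackwardReferenceTypes == 3, "");
+static_assert(kNumInterReferenceFrameTypes == 7, "");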
+
+enum {
+ // Unidirectional compound reference pairs that are signaled explicitly:
+ // {kReferenceFrameLast, kReferenceFrameLast2},
+ // {kReferenceFrameLast, kReferenceFrameLast3},
+ // {kReferenceFrameLast, kReferenceFrameGolden},
+ // {kReferenceFrameBackward, kReferenceFrameAlternate}
+ kExplicitUnidirectionalCompoundReferences = 4,
+ // Other unidirectional compound reference pairs:
+ // {kReferenceFrameLast2, kReferenceFrameLast3},
+ // {kReferenceFrameLast2, kReferenceFrameGolden},
+ // {kReferenceFrameLast3, kReferenceFrameGolden},
+ // {kReferenceFrameBackward, kReferenceFrameAlternate2},
+ // {kReferenceFrameAlternate2, kReferenceFrameAlternate}
+ kUnidirectionalCompoundReferences =
+ kExplicitUnidirectionalCompoundReferences + 5,
+}; // anonymous enum
+
+enum BlockSize : uint8_t {
+ kBlock4x4,
+ kBlock4x8,
+ kBlock4x16,
+ kBlock8x4,
+ kBlock8x8,
+ kBlock8x16,
+ kBlock8x32,
+ kBlock16x4,
+ kBlock16x8,
+ kBlock16x16,
+ kBlock16x32,
+ kBlock16x64,
+ kBlock32x8,
+ kBlock32x16,
+ kBlock32x32,
+ kBlock32x64,
+ kBlock64x16,
+ kBlock64x32,
+ kBlock64x64,
+ kBlock64x128,
+ kBlock128x64,
+ kBlock128x128,
+ kMaxBlockSizes,
+ kBlockInvalid
+};
+
+// Partition types. R: Recursive
+//
+// None Horizontal Vertical Split
+// +-------+ +-------+ +---+---+ +---+---+
+// | | | | | | | | R | R |
+// | | +-------+ | | | +---+---+
+// | | | | | | | | R | R |
+// +-------+ +-------+ +---+---+ +---+---+
+//
+// Horizontal Horizontal Vertical Vertical
+// with top with bottom with left with right
+// split split split split
+// +---+---+ +-------+ +---+---+ +---+---+
+// | | | | | | | | | | |
+// +---+---+ +---+---+ +---+ | | +---+
+// | | | | | | | | | | |
+// +-------+ +---+---+ +---+---+ +---+---+
+//
+// Horizontal4 Vertical4
+// +-----+ +-+-+-+
+// +-----+ | | | |
+// +-----+ | | | |
+// +-----+ +-+-+-+
+enum Partition : uint8_t {
+ kPartitionNone,
+ kPartitionHorizontal,
+ kPartitionVertical,
+ kPartitionSplit,
+ kPartitionHorizontalWithTopSplit,
+ kPartitionHorizontalWithBottomSplit,
+ kPartitionVerticalWithLeftSplit,
+ kPartitionVerticalWithRightSplit,
+ kPartitionHorizontal4,
+ kPartitionVertical4
+};
+enum : uint8_t { kMaxPartitionTypes = kPartitionVertical4 + 1 };
+
+enum PredictionMode : uint8_t {
+ // Intra prediction modes.
+ kPredictionModeDc,
+ kPredictionModeVertical,
+ kPredictionModeHorizontal,
+ kPredictionModeD45,
+ kPredictionModeD135,
+ kPredictionModeD113,
+ kPredictionModeD157,
+ kPredictionModeD203,
+ kPredictionModeD67,
+ kPredictionModeSmooth,
+ kPredictionModeSmoothVertical,
+ kPredictionModeSmoothHorizontal,
+ kPredictionModePaeth,
+ kPredictionModeChromaFromLuma,
+ // Single inter prediction modes.
+ kPredictionModeNearestMv,
+ kPredictionModeNearMv,
+ kPredictionModeGlobalMv,
+ kPredictionModeNewMv,
+ // Compound inter prediction modes.
+ kPredictionModeNearestNearestMv,
+ kPredictionModeNearNearMv,
+ kPredictionModeNearestNewMv,
+ kPredictionModeNewNearestMv,
+ kPredictionModeNearNewMv,
+ kPredictionModeNewNearMv,
+ kPredictionModeGlobalGlobalMv,
+ kPredictionModeNewNewMv,
+ kNumPredictionModes,
+ kNumCompoundInterPredictionModes =
+ EnumRangeLength(kPredictionModeNearestNearestMv, kPredictionModeNewNewMv),
+ kIntraPredictionModesY =
+ EnumRangeLength(kPredictionModeDc, kPredictionModePaeth),
+ kIntraPredictionModesUV =
+ EnumRangeLength(kPredictionModeDc, kPredictionModeChromaFromLuma),
+ kPredictionModeInvalid = 255
+};
+
+enum InterIntraMode : uint8_t {
+ kInterIntraModeDc,
+ kInterIntraModeVertical,
+ kInterIntraModeHorizontal,
+ kInterIntraModeSmooth,
+ kNumInterIntraModes
+};
+
+enum MotionMode : uint8_t {
+ kMotionModeSimple,
+ kMotionModeObmc, // Overlapped block motion compensation.
+ kMotionModeLocalWarp,
+ kNumMotionModes
+};
+
+enum TxMode : uint8_t {
+ kTxModeOnly4x4,
+ kTxModeLargest,
+ kTxModeSelect,
+ kNumTxModes
+};
+
+// These enums are named as kType1Type2 where Type1 is the transform type for
+// the rows and Type2 is the transform type for the columns.
+enum TransformType : uint8_t {
+ kTransformTypeDctDct,
+ kTransformTypeAdstDct,
+ kTransformTypeDctAdst,
+ kTransformTypeAdstAdst,
+ kTransformTypeFlipadstDct,
+ kTransformTypeDctFlipadst,
+ kTransformTypeFlipadstFlipadst,
+ kTransformTypeAdstFlipadst,
+ kTransformTypeFlipadstAdst,
+ kTransformTypeIdentityIdentity,
+ kTransformTypeIdentityDct,
+ kTransformTypeDctIdentity,
+ kTransformTypeIdentityAdst,
+ kTransformTypeAdstIdentity,
+ kTransformTypeIdentityFlipadst,
+ kTransformTypeFlipadstIdentity,
+ kNumTransformTypes
+};
+
+constexpr BitMaskSet kTransformFlipColumnsMask(kTransformTypeFlipadstDct,
+ kTransformTypeFlipadstAdst,
+ kTransformTypeFlipadstIdentity,
+ kTransformTypeFlipadstFlipadst);
+constexpr BitMaskSet kTransformFlipRowsMask(kTransformTypeDctFlipadst,
+ kTransformTypeAdstFlipadst,
+ kTransformTypeIdentityFlipadst,
+ kTransformTypeFlipadstFlipadst);
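+
+// A usage sketch (illustrative; assumes the Contains() accessor declared in
+// bit_mask_set.h): deciding whether the columns of a block must be flipped
+// before the inverse transform. kTransformFlipRowsMask is queried the same
+// way for row flips.
+inline bool TransformFlipsColumnsSketch(TransformType tx_type) {
+  return kTransformFlipColumnsMask.Contains(tx_type);
+}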
+
+enum TransformSize : uint8_t {
+ kTransformSize4x4,
+ kTransformSize4x8,
+ kTransformSize4x16,
+ kTransformSize8x4,
+ kTransformSize8x8,
+ kTransformSize8x16,
+ kTransformSize8x32,
+ kTransformSize16x4,
+ kTransformSize16x8,
+ kTransformSize16x16,
+ kTransformSize16x32,
+ kTransformSize16x64,
+ kTransformSize32x8,
+ kTransformSize32x16,
+ kTransformSize32x32,
+ kTransformSize32x64,
+ kTransformSize64x16,
+ kTransformSize64x32,
+ kTransformSize64x64,
+ kNumTransformSizes
+};
+
+enum TransformSet : uint8_t {
+ // DCT Only (1).
+ kTransformSetDctOnly,
+ // 2D-DCT and 2D-ADST without flip (4) + Identity (1) + 1D Horizontal/Vertical
+ // DCT (2) = Total (7).
+ kTransformSetIntra1,
+ // 2D-DCT and 2D-ADST without flip (4) + Identity (1) = Total (5).
+ kTransformSetIntra2,
+ // All transforms = Total (16).
+ kTransformSetInter1,
+ // 2D-DCT and 2D-ADST with flip (9) + Identity (1) + 1D Horizontal/Vertical
+ // DCT (2) = Total (12).
+ kTransformSetInter2,
+ // DCT (1) + Identity (1) = Total (2).
+ kTransformSetInter3,
+ kNumTransformSets
+};
+
+enum TransformClass : uint8_t {
+ kTransformClass2D,
+ kTransformClassHorizontal,
+ kTransformClassVertical,
+ kNumTransformClasses
+};
+
+enum FilterIntraPredictor : uint8_t {
+ kFilterIntraPredictorDc,
+ kFilterIntraPredictorVertical,
+ kFilterIntraPredictorHorizontal,
+ kFilterIntraPredictorD157,
+ kFilterIntraPredictorPaeth,
+ kNumFilterIntraPredictors
+};
+
+enum ObmcDirection : uint8_t {
+ kObmcDirectionVertical,
+ kObmcDirectionHorizontal,
+ kNumObmcDirections
+};
+
+// In AV1 the name of the filter refers to the direction of filter application.
+// Horizontal refers to the column edge and vertical the row edge.
+enum LoopFilterType : uint8_t {
+ kLoopFilterTypeVertical,
+ kLoopFilterTypeHorizontal,
+ kNumLoopFilterTypes
+};
+
+enum LoopFilterTransformSizeId : uint8_t {
+ kLoopFilterTransformSizeId4x4,
+ kLoopFilterTransformSizeId8x8,
+ kLoopFilterTransformSizeId16x16,
+ kNumLoopFilterTransformSizeIds
+};
+
+enum LoopRestorationType : uint8_t {
+ kLoopRestorationTypeNone,
+ kLoopRestorationTypeSwitchable,
+ kLoopRestorationTypeWiener,
+  kLoopRestorationTypeSgrProj,  // Self-guided projection filter.
+ kNumLoopRestorationTypes
+};
+
+enum CompoundReferenceType : uint8_t {
+ kCompoundReferenceUnidirectional,
+ kCompoundReferenceBidirectional,
+ kNumCompoundReferenceTypes
+};
+
+enum CompoundPredictionType : uint8_t {
+ kCompoundPredictionTypeWedge,
+ kCompoundPredictionTypeDiffWeighted,
+ kCompoundPredictionTypeAverage,
+ kCompoundPredictionTypeIntra,
+ kCompoundPredictionTypeDistance,
+ kNumCompoundPredictionTypes,
+ // Number of compound prediction types that are explicitly signaled in the
+ // bitstream (in the compound_type syntax element).
+ kNumExplicitCompoundPredictionTypes = 2
+};
+
+enum InterpolationFilter : uint8_t {
+ kInterpolationFilterEightTap,
+ kInterpolationFilterEightTapSmooth,
+ kInterpolationFilterEightTapSharp,
+ kInterpolationFilterBilinear,
+ kInterpolationFilterSwitchable,
+ kNumInterpolationFilters,
+ // Number of interpolation filters that can be explicitly signaled in the
+ // compressed headers (when the uncompressed headers allow switchable
+ // interpolation filters) of the bitstream.
+ kNumExplicitInterpolationFilters = EnumRangeLength(
+ kInterpolationFilterEightTap, kInterpolationFilterEightTapSharp)
+};
+
+enum MvJointType : uint8_t {
+ kMvJointTypeZero,
+ kMvJointTypeHorizontalNonZeroVerticalZero,
+ kMvJointTypeHorizontalZeroVerticalNonZero,
+ kMvJointTypeNonZero,
+ kNumMvJointTypes
+};
+
+enum ObuType : int8_t {
+ kObuInvalid = -1,
+ kObuSequenceHeader = 1,
+ kObuTemporalDelimiter = 2,
+ kObuFrameHeader = 3,
+ kObuTileGroup = 4,
+ kObuMetadata = 5,
+ kObuFrame = 6,
+ kObuRedundantFrameHeader = 7,
+ kObuTileList = 8,
+ kObuPadding = 15,
+};
+
+constexpr BitMaskSet kPredictionModeSmoothMask(kPredictionModeSmooth,
+ kPredictionModeSmoothHorizontal,
+ kPredictionModeSmoothVertical);
+
+//------------------------------------------------------------------------------
+// ToString()
+//
+// These functions are meant to be used only in debug logging and within tests.
+// They are defined inline to avoid including the strings in the release
+// library when logging is disabled; unreferenced functions will not be added to
+// any object file in that case.
+
+inline const char* ToString(const BlockSize size) {
+ switch (size) {
+ case kBlock4x4:
+ return "kBlock4x4";
+ case kBlock4x8:
+ return "kBlock4x8";
+ case kBlock4x16:
+ return "kBlock4x16";
+ case kBlock8x4:
+ return "kBlock8x4";
+ case kBlock8x8:
+ return "kBlock8x8";
+ case kBlock8x16:
+ return "kBlock8x16";
+ case kBlock8x32:
+ return "kBlock8x32";
+ case kBlock16x4:
+ return "kBlock16x4";
+ case kBlock16x8:
+ return "kBlock16x8";
+ case kBlock16x16:
+ return "kBlock16x16";
+ case kBlock16x32:
+ return "kBlock16x32";
+ case kBlock16x64:
+ return "kBlock16x64";
+ case kBlock32x8:
+ return "kBlock32x8";
+ case kBlock32x16:
+ return "kBlock32x16";
+ case kBlock32x32:
+ return "kBlock32x32";
+ case kBlock32x64:
+ return "kBlock32x64";
+ case kBlock64x16:
+ return "kBlock64x16";
+ case kBlock64x32:
+ return "kBlock64x32";
+ case kBlock64x64:
+ return "kBlock64x64";
+ case kBlock64x128:
+ return "kBlock64x128";
+ case kBlock128x64:
+ return "kBlock128x64";
+ case kBlock128x128:
+ return "kBlock128x128";
+ case kMaxBlockSizes:
+ return "kMaxBlockSizes";
+ case kBlockInvalid:
+ return "kBlockInvalid";
+ }
+ abort();
+}
+
+inline const char* ToString(const InterIntraMode mode) {
+ switch (mode) {
+ case kInterIntraModeDc:
+ return "kInterIntraModeDc";
+ case kInterIntraModeVertical:
+ return "kInterIntraModeVertical";
+ case kInterIntraModeHorizontal:
+ return "kInterIntraModeHorizontal";
+ case kInterIntraModeSmooth:
+ return "kInterIntraModeSmooth";
+ case kNumInterIntraModes:
+ return "kNumInterIntraModes";
+ }
+ abort();
+}
+
+inline const char* ToString(const ObmcDirection direction) {
+ switch (direction) {
+ case kObmcDirectionVertical:
+ return "kObmcDirectionVertical";
+ case kObmcDirectionHorizontal:
+ return "kObmcDirectionHorizontal";
+ case kNumObmcDirections:
+ return "kNumObmcDirections";
+ }
+ abort();
+}
+
+inline const char* ToString(const LoopRestorationType type) {
+ switch (type) {
+ case kLoopRestorationTypeNone:
+ return "kLoopRestorationTypeNone";
+ case kLoopRestorationTypeSwitchable:
+ return "kLoopRestorationTypeSwitchable";
+ case kLoopRestorationTypeWiener:
+ return "kLoopRestorationTypeWiener";
+ case kLoopRestorationTypeSgrProj:
+ return "kLoopRestorationTypeSgrProj";
+ case kNumLoopRestorationTypes:
+ return "kNumLoopRestorationTypes";
+ }
+ abort();
+}
+
+inline const char* ToString(const TransformSize size) {
+ switch (size) {
+ case kTransformSize4x4:
+ return "kTransformSize4x4";
+ case kTransformSize4x8:
+ return "kTransformSize4x8";
+ case kTransformSize4x16:
+ return "kTransformSize4x16";
+ case kTransformSize8x4:
+ return "kTransformSize8x4";
+ case kTransformSize8x8:
+ return "kTransformSize8x8";
+ case kTransformSize8x16:
+ return "kTransformSize8x16";
+ case kTransformSize8x32:
+ return "kTransformSize8x32";
+ case kTransformSize16x4:
+ return "kTransformSize16x4";
+ case kTransformSize16x8:
+ return "kTransformSize16x8";
+ case kTransformSize16x16:
+ return "kTransformSize16x16";
+ case kTransformSize16x32:
+ return "kTransformSize16x32";
+ case kTransformSize16x64:
+ return "kTransformSize16x64";
+ case kTransformSize32x8:
+ return "kTransformSize32x8";
+ case kTransformSize32x16:
+ return "kTransformSize32x16";
+ case kTransformSize32x32:
+ return "kTransformSize32x32";
+ case kTransformSize32x64:
+ return "kTransformSize32x64";
+ case kTransformSize64x16:
+ return "kTransformSize64x16";
+ case kTransformSize64x32:
+ return "kTransformSize64x32";
+ case kTransformSize64x64:
+ return "kTransformSize64x64";
+ case kNumTransformSizes:
+ return "kNumTransformSizes";
+ }
+ abort();
+}
+
+inline const char* ToString(const TransformType type) {
+ switch (type) {
+ case kTransformTypeDctDct:
+ return "kTransformTypeDctDct";
+ case kTransformTypeAdstDct:
+ return "kTransformTypeAdstDct";
+ case kTransformTypeDctAdst:
+ return "kTransformTypeDctAdst";
+ case kTransformTypeAdstAdst:
+ return "kTransformTypeAdstAdst";
+ case kTransformTypeFlipadstDct:
+ return "kTransformTypeFlipadstDct";
+ case kTransformTypeDctFlipadst:
+ return "kTransformTypeDctFlipadst";
+ case kTransformTypeFlipadstFlipadst:
+ return "kTransformTypeFlipadstFlipadst";
+ case kTransformTypeAdstFlipadst:
+ return "kTransformTypeAdstFlipadst";
+ case kTransformTypeFlipadstAdst:
+ return "kTransformTypeFlipadstAdst";
+ case kTransformTypeIdentityIdentity:
+ return "kTransformTypeIdentityIdentity";
+ case kTransformTypeIdentityDct:
+ return "kTransformTypeIdentityDct";
+ case kTransformTypeDctIdentity:
+ return "kTransformTypeDctIdentity";
+ case kTransformTypeIdentityAdst:
+ return "kTransformTypeIdentityAdst";
+ case kTransformTypeAdstIdentity:
+ return "kTransformTypeAdstIdentity";
+ case kTransformTypeIdentityFlipadst:
+ return "kTransformTypeIdentityFlipadst";
+ case kTransformTypeFlipadstIdentity:
+ return "kTransformTypeFlipadstIdentity";
+ // case to quiet compiler
+ case kNumTransformTypes:
+ return "kNumTransformTypes";
+ }
+ abort();
+}
+
+//------------------------------------------------------------------------------
+
+extern const uint8_t k4x4WidthLog2[kMaxBlockSizes];
+
+extern const uint8_t k4x4HeightLog2[kMaxBlockSizes];
+
+extern const uint8_t kNum4x4BlocksWide[kMaxBlockSizes];
+
+extern const uint8_t kNum4x4BlocksHigh[kMaxBlockSizes];
+
+extern const uint8_t kBlockWidthPixels[kMaxBlockSizes];
+
+extern const uint8_t kBlockHeightPixels[kMaxBlockSizes];
+
+extern const BlockSize kSubSize[kMaxPartitionTypes][kMaxBlockSizes];
+
+extern const BlockSize kPlaneResidualSize[kMaxBlockSizes][2][2];
+
+extern const int16_t kProjectionMvDivisionLookup[kMaxFrameDistance + 1];
+
+extern const uint8_t kTransformWidth[kNumTransformSizes];
+
+extern const uint8_t kTransformHeight[kNumTransformSizes];
+
+extern const uint8_t kTransformWidth4x4[kNumTransformSizes];
+
+extern const uint8_t kTransformHeight4x4[kNumTransformSizes];
+
+extern const uint8_t kTransformWidthLog2[kNumTransformSizes];
+
+extern const uint8_t kTransformHeightLog2[kNumTransformSizes];
+
+extern const TransformSize kSplitTransformSize[kNumTransformSizes];
+
+// Square transform of size min(w,h).
+extern const TransformSize kTransformSizeSquareMin[kNumTransformSizes];
+
+// Square transform of size max(w,h).
+extern const TransformSize kTransformSizeSquareMax[kNumTransformSizes];
+
+extern const uint8_t kNumTransformTypesInSet[kNumTransformSets];
+
+extern const uint8_t kSgrProjParams[1 << kSgrProjParamsBits][4];
+
+extern const int8_t kSgrProjMultiplierMin[2];
+
+extern const int8_t kSgrProjMultiplierMax[2];
+
+extern const int8_t kWienerTapsMin[3];
+
+extern const int8_t kWienerTapsMax[3];
+
+extern const uint8_t kUpscaleFilterUnsigned[kSuperResFilterShifts]
+ [kSuperResFilterTaps];
+
+// An int8_t version of the kWarpedFilters array.
+// Note: The array could be removed with a performance penalty.
+extern const int8_t kWarpedFilters8[3 * kWarpedPixelPrecisionShifts + 1][8];
+
+extern const int16_t kWarpedFilters[3 * kWarpedPixelPrecisionShifts + 1][8];
+
+extern const int8_t kHalfSubPixelFilters[6][16][8];
+
+extern const uint8_t kAbsHalfSubPixelFilters[6][16][8];
+
+extern const int16_t kDirectionalIntraPredictorDerivative[44];
+
+extern const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes];
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_CONSTANTS_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/cpu.h"
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#include <cpuid.h>
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#include <immintrin.h> // _xgetbv
+#include <intrin.h>
+#endif
+
+namespace libgav1 {
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+ defined(_M_X64)
+namespace {
+
+#if defined(__GNUC__)
+void CpuId(int leaf, uint32_t info[4]) {
+ __cpuid_count(leaf, 0 /*ecx=subleaf*/, info[0], info[1], info[2], info[3]);
+}
+
+uint64_t Xgetbv() {
+ const uint32_t ecx = 0; // ecx specifies the extended control register
+ uint32_t eax;
+ uint32_t edx;
+ __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx));
+ return (static_cast<uint64_t>(edx) << 32) | eax;
+}
+#else // _MSC_VER
+void CpuId(int leaf, uint32_t info[4]) {
+ __cpuidex(reinterpret_cast<int*>(info), leaf, 0 /*ecx=subleaf*/);
+}
+
+uint64_t Xgetbv() { return _xgetbv(0); }
+#endif // __GNUC__
+
+} // namespace
+
+uint32_t GetCpuInfo() {
+ uint32_t info[4];
+
+ // Get the highest feature value cpuid supports
+ CpuId(0, info);
+ const int max_cpuid_value = info[0];
+ if (max_cpuid_value < 1) return 0;
+
+ CpuId(1, info);
+ uint32_t features = 0;
+ if ((info[3] & (1 << 26)) != 0) features |= kSSE2;
+ if ((info[2] & (1 << 9)) != 0) features |= kSSSE3;
+ if ((info[2] & (1 << 19)) != 0) features |= kSSE4_1;
+
+ // Bits 27 (OSXSAVE) & 28 (256-bit AVX)
+ if ((info[2] & (3 << 27)) == (3 << 27)) {
+    // XMM and YMM state enabled by the OS: XCR0 bits 1 (SSE) and 2 (AVX).
+ if ((Xgetbv() & 0x6) == 0x6) {
+ features |= kAVX;
+ if (max_cpuid_value >= 7) {
+ CpuId(7, info);
+ if ((info[1] & (1 << 5)) != 0) features |= kAVX2;
+ }
+ }
+ }
+
+ return features;
+}
+#else
+uint32_t GetCpuInfo() { return 0; }
+#endif // x86 || x86_64
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_CPU_H_
+#define LIBGAV1_SRC_UTILS_CPU_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+#if defined(__i386__) || defined(__x86_64__)
+#define LIBGAV1_X86
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#define LIBGAV1_X86
+#define LIBGAV1_X86_MSVC
+#endif
+
+#if defined(LIBGAV1_X86)
+
+#if !defined(LIBGAV1_ENABLE_SSE4_1)
+#define LIBGAV1_ENABLE_SSE4_1 1
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+#if !defined(LIBGAV1_ENABLE_AVX2)
+#define LIBGAV1_ENABLE_AVX2 1
+#endif // !defined(LIBGAV1_ENABLE_AVX2)
+#else // !LIBGAV1_ENABLE_SSE4_1
+// Disable AVX2 when SSE4.1 is disabled as it may rely on shared components.
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#endif // LIBGAV1_ENABLE_SSE4_1
+
+#else // !LIBGAV1_X86
+
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#undef LIBGAV1_ENABLE_SSE4_1
+#define LIBGAV1_ENABLE_SSE4_1 0
+
+#endif // LIBGAV1_X86
+
+// For x86 LIBGAV1_TARGETING_* indicate the source being built is targeting
+// (at least) that instruction set. This prevents disabling other instruction
+// sets if the current instruction set isn't a global target, e.g., building
+// *_avx2.cc w/-mavx2, but the remaining files without the flag.
+#if LIBGAV1_ENABLE_AVX2 && defined(__AVX2__)
+#define LIBGAV1_TARGETING_AVX2 1
+#else
+#define LIBGAV1_TARGETING_AVX2 0
+#endif
+
+// Note: LIBGAV1_X86_MSVC isn't completely correct for Visual Studio, but there
+// is no equivalent to __SSE4_1__. LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS will be
+// enabled in dsp.h to compensate for this.
+#if LIBGAV1_ENABLE_SSE4_1 && (defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC))
+#define LIBGAV1_TARGETING_SSE4_1 1
+#else
+#define LIBGAV1_TARGETING_SSE4_1 0
+#endif
+
+#undef LIBGAV1_X86
+
+#if !defined(LIBGAV1_ENABLE_NEON)
+// TODO(jzern): add support for _M_ARM64.
+#if defined(__ARM_NEON__) || defined(__aarch64__) || \
+ (defined(_MSC_VER) && defined(_M_ARM))
+#define LIBGAV1_ENABLE_NEON 1
+#else
+#define LIBGAV1_ENABLE_NEON 0
+#endif
+#endif // !defined(LIBGAV1_ENABLE_NEON)
+
+enum CpuFeatures : uint8_t {
+ kSSE2 = 1 << 0,
+#define LIBGAV1_CPU_SSE2 (1 << 0)
+ kSSSE3 = 1 << 1,
+#define LIBGAV1_CPU_SSSE3 (1 << 1)
+ kSSE4_1 = 1 << 2,
+#define LIBGAV1_CPU_SSE4_1 (1 << 2)
+ kAVX = 1 << 3,
+#define LIBGAV1_CPU_AVX (1 << 3)
+ kAVX2 = 1 << 4,
+#define LIBGAV1_CPU_AVX2 (1 << 4)
+ kNEON = 1 << 5,
+#define LIBGAV1_CPU_NEON (1 << 5)
+};
+
+// Returns a bit-wise OR of CpuFeatures supported by this platform.
+uint32_t GetCpuInfo();
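+
+// A usage sketch (illustrative): callers test individual CpuFeatures bits,
+// e.g.
+//   const uint32_t features = GetCpuInfo();
+//   if ((features & kAVX2) != 0) { /* select AVX2 code paths */ }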
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_CPU_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/cpu.h"
+
+#if defined(__linux__)
+#include <unistd.h>
+
+#include <cerrno>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#endif // defined(__linux__)
+
+#include "gtest/gtest.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+#if defined(__linux__)
+
+// Sample code for getting the number of performance CPU cores. The following
+// sources were consulted:
+// * https://www.kernel.org/doc/html/latest/admin-guide/cputopology.html
+// * cpu-hotplug.txt: CPU hotplug Support in Linux(tm) Kernel
+// https://lwn.net/Articles/537570/
+// * https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-system-cpu
+// * Android bionic source code of get_nprocs():
+// libc/bionic/sysinfo.cpp
+// * glibc 2.30 source code of get_nprocs():
+// sysdeps/unix/sysv/linux/getsysstats.c
+//
+// Tested on:
+// * Asus Nexus 7 2013: Qualcomm Snapdragon 600, 32-bit Android 6.0.1
+// (Marshmallow). Brings cores online and offline dynamically. (The tablet
+// has 4 cores. "0", "0-1", "0-2", and "0-3" have all been observed in the
+// /sys/devices/system/cpu/online file.) This causes the number of cores
+// currently online to potentially be lower than the number of cores that can
+// be brought online quickly.
+// * General Mobile 4G: Qualcomm Snapdragon 410, 32-bit Android 7.1.1 (Nougat).
+// * Motorola Moto G5 Plus: Qualcomm Snapdragon 625, 32-bit Android 8.1.0
+// (Oreo).
+// * Motorola Moto G7 Play: Qualcomm Snapdragon 632, 32-bit Android 9 (Pie).
+// All 8 cores have the same cpuinfo_max_freq (1804800), but there are two
+// values of cpuinfo_min_freq: cores 0-3 have 614400 and cores 4-7 have
+// 633600. We would need to check cpuinfo_min_freq to differentiate the two
+// kinds of cores (Qualcomm Kryo 250 Gold and Qualcomm Kryo 250 Silver).
+// * Pixel 2 XL: Qualcomm Snapdragon 835, 64-bit Android 9 (Pie).
+// * Pixel 3: Qualcomm Snapdragon 845, 64-bit Android 9 (Pie).
+// * Pixel 3a: Qualcomm Snapdragon 670, 64-bit Android 9 (Pie).
+// * Samsung Galaxy S6: Samsung Exynos 7 Octa (7420), 64-bit Android 7.0
+// (Nougat).
+// * Samsung Galaxy S8+ (SM-G955FD): Samsung Exynos 8895, 64-bit Android 8.0.0.
+//
+// Note: The sample code needs to use the 'long' type because it is the return
+// type of the Standard C Library function strtol(). The ClangTidy warnings are
+// suppressed with NOLINT(google-runtime-int) comments.
+
+// Returns the number of online processor cores.
+int GetNumberOfProcessorsOnline() {
+ // See https://developer.android.com/ndk/guides/cpu-features.
+ long num_cpus = sysconf(_SC_NPROCESSORS_ONLN); // NOLINT(google-runtime-int)
+ if (num_cpus < 0) {
+ LIBGAV1_DLOG(ERROR, "sysconf(_SC_NPROCESSORS_ONLN) failed: %s.",
+ strerror(errno));
+ return 0;
+ }
+ // It is safe to cast num_cpus to int. sysconf(_SC_NPROCESSORS_ONLN) returns
+ // the return value of get_nprocs(), which is an int.
+ return static_cast<int>(num_cpus);
+}
+
+// These CPUs support heterogeneous multiprocessing.
+#if defined(__arm__) || defined(__aarch64__)
+
+// A helper function used by GetNumberOfPerformanceCoresOnline().
+//
+// Returns the cpuinfo_max_freq value (in kHz) of the given CPU. Returns 0 on
+// failure.
+long GetCpuinfoMaxFreq(int cpu_index) { // NOLINT(google-runtime-int)
+ char buffer[128];
+ const int rv = snprintf(
+ buffer, sizeof(buffer),
+ "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpu_index);
+  if (rv < 0 || static_cast<size_t>(rv) >= sizeof(buffer)) {
+ LIBGAV1_DLOG(ERROR, "snprintf failed, or |buffer| is too small.");
+ return 0;
+ }
+ FILE* file = fopen(buffer, "r");
+ if (file == nullptr) {
+ LIBGAV1_DLOG(ERROR, "fopen(\"%s\", \"r\") failed: %s.", buffer,
+ strerror(errno));
+ return 0;
+ }
+ char* const str = fgets(buffer, sizeof(buffer), file);
+ fclose(file);
+ if (str == nullptr) {
+ LIBGAV1_DLOG(ERROR, "fgets failed.");
+ return 0;
+ }
+ const long freq = strtol(str, nullptr, 10); // NOLINT(google-runtime-int)
+ if (freq <= 0 || freq == LONG_MAX) {
+ LIBGAV1_DLOG(ERROR,
+ "No conversion can be performed, or the converted value is "
+ "invalid: %ld.",
+ freq);
+ return 0;
+ }
+ return freq;
+}
+
+// Returns the number of performance CPU cores that are online. The number of
+// efficiency CPU cores is subtracted from the total number of CPU cores. Uses
+// cpuinfo_max_freq to determine whether a CPU is a performance core or an
+// efficiency core.
+//
+// This function is not perfect. For example, the Snapdragon 632 SoC used in
+// Motorola Moto G7 has performance and efficiency cores with the same
+// cpuinfo_max_freq but different cpuinfo_min_freq. This function fails to
+// differentiate the two kinds of cores and reports all the cores as
+// performance cores.
+int GetNumberOfPerformanceCoresOnline() {
+ // Get the online CPU list. Some examples of the online CPU list are:
+ // "0-7"
+ // "0"
+ // "0-1,2,3,4-7"
+ char online[512];
+ FILE* file = fopen("/sys/devices/system/cpu/online", "r");
+ if (file == nullptr) {
+ LIBGAV1_DLOG(ERROR,
+ "fopen(\"/sys/devices/system/cpu/online\", \"r\") failed: %s.",
+ strerror(errno));
+ return 0;
+ }
+ char* const str = fgets(online, sizeof(online), file);
+ fclose(file);
+ file = nullptr;
+ if (str == nullptr) {
+ LIBGAV1_DLOG(ERROR, "fgets failed.");
+ return 0;
+ }
+ LIBGAV1_DLOG(INFO, "The online CPU list is %s", online);
+
+ // Count the number of the slowest CPUs. Some SoCs such as Snapdragon 855
+ // have performance cores with different max frequencies, so only the slowest
+ // CPUs are efficiency cores. If we count the number of the fastest CPUs, we
+ // will fail to count the second fastest performance cores.
+ long slowest_cpu_freq = LONG_MAX; // NOLINT(google-runtime-int)
+ int num_slowest_cpus = 0;
+ int num_cpus = 0;
+ const char* cp = online;
+ int range_begin = -1;
+ while (true) {
+ char* str_end;
+ const int cpu = static_cast<int>(strtol(cp, &str_end, 10));
+ if (str_end == cp) {
+ break;
+ }
+ cp = str_end;
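+    // A '-' introduces a range such as "0-3": record the first CPU index and
+    // defer counting until the end of the range has been parsed.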
+ if (*cp == '-') {
+ range_begin = cpu;
+ } else {
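+      // Either a single CPU index or the end of a range; a single CPU is
+      // treated as a one-element range below.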
+ if (range_begin == -1) {
+ range_begin = cpu;
+ }
+
+ num_cpus += cpu - range_begin + 1;
+ for (int i = range_begin; i <= cpu; ++i) {
+ const long freq = GetCpuinfoMaxFreq(i); // NOLINT(google-runtime-int)
+ if (freq <= 0) {
+ return 0;
+ }
+ LIBGAV1_DLOG(INFO, "cpu%d max frequency is %ld kHz.", i, freq);
+ if (freq < slowest_cpu_freq) {
+ slowest_cpu_freq = freq;
+ num_slowest_cpus = 0;
+ }
+ if (freq == slowest_cpu_freq) {
+ ++num_slowest_cpus;
+ }
+ }
+
+ range_begin = -1;
+ }
+ if (*cp == '\0') {
+ break;
+ }
+ ++cp;
+ }
+
+ LIBGAV1_DLOG(INFO, "There are %d CPU cores.", num_cpus);
+ LIBGAV1_DLOG(INFO,
+ "%d CPU cores are the slowest, with max frequency %ld kHz.",
+ num_slowest_cpus, slowest_cpu_freq);
+ // If there are faster CPU cores than the slowest CPU cores, exclude the
+ // slowest CPU cores.
+ if (num_slowest_cpus < num_cpus) {
+ num_cpus -= num_slowest_cpus;
+ }
+ return num_cpus;
+}
+
+#else
+
+// Assume symmetric multiprocessing.
+int GetNumberOfPerformanceCoresOnline() {
+ return GetNumberOfProcessorsOnline();
+}
+
+#endif
+
+#endif // defined(__linux__)
+
+/*
+ Run this test with logging enabled on an Android device:
+ 64-bit Android:
+ tests/run_android_test.sh --test cpu --enable_asserts
+ 32-bit Android:
+ tests/run_android_test.sh --test cpu --arch arm \
+ --enable_asserts
+*/
+TEST(CpuTest, GetNumberOfPerformanceCoresOnline) {
+#if defined(__linux__)
+ const int num_cpus = GetNumberOfProcessorsOnline();
+ ASSERT_NE(num_cpus, 0);
+ LIBGAV1_DLOG(INFO, "There are %d cores online.", num_cpus);
+ const int num_performance_cpus = GetNumberOfPerformanceCoresOnline();
+ ASSERT_NE(num_performance_cpus, 0);
+ LIBGAV1_DLOG(INFO, "There are %d performance cores online.",
+ num_performance_cpus);
+#endif // defined(__linux__)
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
+#define LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
+
+#include <cstddef>
+#include <memory>
+#include <new>
+
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+template <typename T>
+class DynamicBuffer {
+ public:
+ T* get() { return buffer_.get(); }
+ const T* get() const { return buffer_.get(); }
+
+ // Resizes the buffer so that it can hold at least |size| elements. Existing
+ // contents will be destroyed when resizing to a larger size.
+ //
+ // Returns true on success. If Resize() returns false, then subsequent calls
+ // to get() will return nullptr.
+ bool Resize(size_t size) {
+ if (size <= size_) return true;
+ buffer_.reset(new (std::nothrow) T[size]);
+ if (buffer_ == nullptr) {
+ size_ = 0;
+ return false;
+ }
+ size_ = size;
+ return true;
+ }
+
+ size_t size() const { return size_; }
+
+ private:
+ std::unique_ptr<T[]> buffer_;
+ size_t size_ = 0;
+};
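+
+// A minimal usage sketch (illustrative only; |Element| is a hypothetical
+// element type):
+// DynamicBuffer<Element> buffer;
+// if (!buffer.Resize(count)) return false; // Allocation failed.
+// Element* const elements = buffer.get(); // Valid for |count| elements.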
+
+template <typename T, int alignment>
+class AlignedDynamicBuffer {
+ public:
+ T* get() { return buffer_.get(); }
+
+ // Resizes the buffer so that it can hold at least |size| elements. Existing
+ // contents will be destroyed when resizing to a larger size.
+ //
+ // Returns true on success. If Resize() returns false, then subsequent calls
+ // to get() will return nullptr.
+ bool Resize(size_t size) {
+ if (size <= size_) return true;
+ buffer_ = MakeAlignedUniquePtr<T>(alignment, size);
+ if (buffer_ == nullptr) {
+ size_ = 0;
+ return false;
+ }
+ size_ = size;
+ return true;
+ }
+
+ private:
+ AlignedUniquePtr<T> buffer_;
+ size_t size_ = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/entropy_decoder.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if defined(__ARM_NEON__) || defined(__aarch64__) || \
+ (defined(_MSC_VER) && defined(_M_ARM))
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined(__SSE2__) || defined(LIBGAV1_X86_MSVC)
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#include <emmintrin.h>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr uint32_t kReadBitMask = ~uint32_t{255};
+constexpr int kCdfPrecision = 6;
+constexpr int kMinimumProbabilityPerSymbol = 4;
+
+// This function computes the "cur" variable as specified inside the do-while
+// loop in Section 8.2.6 of the spec. The return value decreases
+// monotonically as |index| increases (note that the |cdf| array is sorted in
+// decreasing order).
+uint32_t ScaleCdf(uint32_t values_in_range_shifted, const uint16_t* const cdf,
+ int index, int symbol_count) {
+ return ((values_in_range_shifted * (cdf[index] >> kCdfPrecision)) >> 1) +
+ (kMinimumProbabilityPerSymbol * (symbol_count - index));
+}
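+
+// For example (illustrative numbers): with values_in_range_shifted == 128, a
+// cdf entry of 16384, index == 0, and symbol_count == 4, ScaleCdf() returns
+// ((128 * (16384 >> 6)) >> 1) + 4 * 4 == 16384 + 16.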
+
+void UpdateCdf(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol_count,
+ const int symbol) {
+ const uint16_t count = cdf[symbol_count];
+ // rate is computed in the spec as:
+ // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+ // In this case cdf[N] is |count|.
+ // Min(FloorLog2(N), 2) is 1 for symbol_count == {2, 3} and 2 for all
+ // symbol_count > 3. So the equation becomes:
+ // 4 + (count > 15) + (count > 31) + (symbol_count > 3).
+ // Note that the largest value for count is 32 (it is not incremented beyond
+ // 32). So using that information:
+ // count >> 4 is 0 for count from 0 to 15.
+ // count >> 4 is 1 for count from 16 to 31.
+ // count >> 4 is 2 for count == 32.
+ // Now, the equation becomes:
+ // 4 + (count >> 4) + (symbol_count > 3).
+ // Since (count >> 4) can only be 0 or 1 or 2, the addition could be replaced
+ // with bitwise or:
+ // (4 | (count >> 4)) + (symbol_count > 3).
+ // but using addition will allow the compiler to eliminate an operation when
+ // symbol_count is known and this function is inlined.
+ const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count > 3);
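+ // For example, with symbol_count == 8 and count == 20, rate is
+ // 1 + 4 + 1 == 6, which matches the spec's
+ // 3 + (20 > 15) + (20 > 31) + Min(FloorLog2(8), 2) == 3 + 1 + 0 + 2 == 6.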
+ // Hints for further optimizations:
+ //
+ // 1. clang can vectorize this for loop with width 4, even though the loop
+ // contains an if-else statement. Therefore, it may be advantageous to use
+ // "i < symbol_count" as the loop condition when symbol_count is 8, 12, or 16
+ // (a multiple of 4 that's not too small).
+ //
+ // 2. The for loop can be rewritten in the following form, which would enable
+ // clang to vectorize the loop with width 8:
+ //
+ // const int rounding = (1 << rate) - 1;
+ // for (int i = 0; i < symbol_count - 1; ++i) {
+ // const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+ // cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+ // }
+ //
+ // The subtraction (a - cdf[i]) relies on the overflow semantics of unsigned
+ // integer arithmetic. The result of the unsigned subtraction is cast to a
+ // signed integer and right-shifted. This requires the right shift of a
+ // signed integer be an arithmetic shift, which is true for clang, gcc, and
+ // Visual C++.
+ assert(symbol_count - 1 > 0);
+ int i = 0;
+ do {
+ if (i < symbol) {
+ cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+ } else {
+ cdf[i] -= cdf[i] >> rate;
+ }
+ } while (++i < symbol_count - 1);
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+}
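+
+// One worked step of UpdateCdf (illustrative numbers): with rate == 5,
+// symbol == 2, and cdf[0] == 16384, element 0 takes the if branch and
+// becomes 16384 + ((32768 - 16384) >> 5) == 16896, i.e. it moves 1/32 of the
+// way toward kCdfMaxProbability.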
+
+// Define the UpdateCdfN functions. UpdateCdfN is a specialized implementation
+// of UpdateCdf based on the fact that symbol_count == N. UpdateCdfN uses the
+// SIMD instruction sets if available.
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+// The UpdateCdf() method contains the following for loop:
+//
+// for (int i = 0; i < symbol_count - 1; ++i) {
+// if (i < symbol) {
+// cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+// } else {
+// cdf[i] -= cdf[i] >> rate;
+// }
+// }
+//
+// It can be rewritten in the following two forms, which are amenable to SIMD
+// implementations:
+//
+// const int rounding = (1 << rate) - 1;
+// for (int i = 0; i < symbol_count - 1; ++i) {
+// const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+// cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+// }
+//
+// or:
+//
+// const int rounding = (1 << rate) - 1;
+// for (int i = 0; i < symbol_count - 1; ++i) {
+// const uint16_t a = (i < symbol) ? (kCdfMaxProbability - rounding) : 0;
+// cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
+// }
+//
+// The following ARM NEON implementations use a modified version of the first
+// form, using the comparison mask and unsigned rollover to avoid the need to
+// calculate rounding.
+//
+// The cdf array has symbol_count + 1 elements. The first symbol_count elements
+// are the CDF. The last element is a count that is initialized to 0 and may
+// grow up to 32. The for loop in UpdateCdf updates the CDF in the array. Since
+// cdf[symbol_count - 1] is always 0, the for loop does not update
+// cdf[symbol_count - 1]. However, it would be correct to have the for loop
+// update cdf[symbol_count - 1] anyway: since symbol_count - 1 >= symbol, the
+// for loop would take the else branch when i is symbol_count - 1:
+// cdf[i] -= cdf[i] >> rate;
+// Since cdf[symbol_count - 1] is 0, cdf[symbol_count - 1] would still be 0
+// after the update. The ARM NEON implementations take advantage of this in the
+// following two cases:
+// 1. When symbol_count is 8 or 16, the vectorized code updates the first
+// symbol_count elements in the array.
+// 2. When symbol_count is 7, the vectorized code updates all the 8 elements in
+// the cdf array. Since an invalid CDF value is written into cdf[7], the
+// count in cdf[7] needs to be fixed up after the vectorized code.
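+//
+// A scalar sketch of the masked per-lane update used below (illustrative
+// only; |i| indexes one lane):
+// const uint16_t mask = (i >= symbol) ? 0xffff : 0; // vcge.
+// const uint16_t a = mask | kCdfMaxProbability; // 65535 or 32768.
+// const int16_t diff = static_cast<int16_t>(a - cdf[i]);
+// const uint16_t cdf_offset = cdf[i] - mask;
+// cdf[i] = cdf_offset + static_cast<uint16_t>(diff >> rate);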
+
+void UpdateCdf5(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+ uint16x4_t cdf_vec = vld1_u16(cdf);
+ const uint16_t count = cdf[5];
+ const int rate = (count >> 4) + 5;
+ const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
+ const uint16x4_t index = vcreate_u16(0x0003000200010000);
+ const uint16x4_t symbol_vec = vdup_n_u16(symbol);
+ const uint16x4_t mask = vcge_u16(index, symbol_vec);
+ // i < symbol: 32768, i >= symbol: 65535.
+ const uint16x4_t a = vorr_u16(mask, cdf_max_probability);
+ // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+ const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(a, cdf_vec));
+ // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+ const uint16x4_t cdf_offset = vsub_u16(cdf_vec, mask);
+ const int16x4_t negative_rate = vdup_n_s16(-rate);
+ // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
+ const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
+ // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+ // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+ cdf_vec = vadd_u16(cdf_offset, delta);
+ vst1_u16(cdf, cdf_vec);
+ cdf[5] = count + static_cast<uint16_t>(count < 32);
+}
+
+// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
+template <int symbol_count>
+void UpdateCdf7To9(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+ static_assert(symbol_count >= 7 && symbol_count <= 9, "");
+ uint16x8_t cdf_vec = vld1q_u16(cdf);
+ const uint16_t count = cdf[symbol_count];
+ const int rate = (count >> 4) + 5;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+ const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004));
+ const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+ const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+ const uint16x8_t delta =
+ vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf, cdf_vec);
+ cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<7>(cdf, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<8>(cdf, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf11(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+ uint16x8_t cdf_vec = vld1q_u16(cdf + 2);
+ const uint16_t count = cdf[11];
+ cdf[11] = count + static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 5;
+ if (symbol > 1) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+ const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+ const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002),
+ vcreate_u16(0x0009000800070006));
+ const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+ const uint16x8_t delta =
+ vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf + 2, cdf_vec);
+ } else {
+ if (symbol != 0) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ } else {
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ }
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+ const uint16x8_t delta = vshlq_u16(cdf_vec, negative_rate);
+ cdf_vec = vsubq_u16(cdf_vec, delta);
+ vst1q_u16(cdf + 2, cdf_vec);
+ }
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf13(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+ uint16x8_t cdf_vec0 = vld1q_u16(cdf);
+ uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4);
+ const uint16_t count = cdf[13];
+ const int rate = (count >> 4) + 5;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+ const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+
+ uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004));
+ uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec0));
+ uint16x8_t cdf_offset = vsubq_u16(cdf_vec0, mask);
+ uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec0 = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf, cdf_vec0);
+
+ index = vcombine_u16(vcreate_u16(0x0007000600050004),
+ vcreate_u16(0x000b000a00090008));
+ mask = vcgeq_u16(index, symbol_vec);
+ a = vorrq_u16(mask, cdf_max_probability);
+ diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec1));
+ cdf_offset = vsubq_u16(cdf_vec1, mask);
+ delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec1 = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf + 4, cdf_vec1);
+
+ cdf[13] = count + static_cast<uint16_t>(count < 32);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf16(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+ uint16x8_t cdf_vec = vld1q_u16(cdf);
+ const uint16_t count = cdf[16];
+ const int rate = (count >> 4) + 5;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+ const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+ const int16x8_t negative_rate = vdupq_n_s16(-rate);
+
+ uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004));
+ uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+ uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf, cdf_vec);
+
+ cdf_vec = vld1q_u16(cdf + 8);
+ index = vcombine_u16(vcreate_u16(0x000b000a00090008),
+ vcreate_u16(0x000f000e000d000c));
+ mask = vcgeq_u16(index, symbol_vec);
+ a = vorrq_u16(mask, cdf_max_probability);
+ diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ cdf_offset = vsubq_u16(cdf_vec, mask);
+ delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+ cdf_vec = vaddq_u16(cdf_offset, delta);
+ vst1q_u16(cdf + 8, cdf_vec);
+
+ cdf[16] = count + static_cast<uint16_t>(count < 32);
+}
+
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+
+inline __m128i LoadLo8(const void* a) {
+ return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+ return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+ _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+ _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+void UpdateCdf5(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+ __m128i cdf_vec = LoadLo8(cdf);
+ const uint16_t count = cdf[5];
+ const int rate = (count >> 4) + 5;
+ const __m128i cdf_max_probability =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+ const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
+ const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
+ // i >= symbol.
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ // i < symbol: 32768, i >= symbol: 65535.
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+ // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+ // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+ // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+ cdf_vec = _mm_add_epi16(cdf_offset, delta);
+ StoreLo8(cdf, cdf_vec);
+ cdf[5] = count + static_cast<uint16_t>(count < 32);
+}
+
+// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
+template <int symbol_count>
+void UpdateCdf7To9(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+ static_assert(symbol_count >= 7 && symbol_count <= 9, "");
+ __m128i cdf_vec = LoadUnaligned16(cdf);
+ const uint16_t count = cdf[symbol_count];
+ const int rate = (count >> 4) + 5;
+ const __m128i cdf_max_probability =
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+ const __m128i index =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_add_epi16(cdf_offset, delta);
+ StoreUnaligned16(cdf, cdf_vec);
+ cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<7>(cdf, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<8>(cdf, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf11(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+ __m128i cdf_vec = LoadUnaligned16(cdf + 2);
+ const uint16_t count = cdf[11];
+ cdf[11] = count + static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 5;
+ if (symbol > 1) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ const __m128i cdf_max_probability =
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+ const __m128i index =
+ _mm_set_epi32(0x000a0009, 0x00080007, 0x00060005, 0x00040003);
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_add_epi16(cdf_offset, delta);
+ StoreUnaligned16(cdf + 2, cdf_vec);
+ } else {
+ if (symbol != 0) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ } else {
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ }
+ const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+ StoreUnaligned16(cdf + 2, cdf_vec);
+ }
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf13(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+ __m128i cdf_vec0 = LoadLo8(cdf);
+ __m128i cdf_vec1 = LoadUnaligned16(cdf + 4);
+ const uint16_t count = cdf[13];
+ const int rate = (count >> 4) + 5;
+ const __m128i cdf_max_probability =
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+
+ const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
+ StoreLo8(cdf, cdf_vec0);
+
+ const __m128i index1 =
+ _mm_set_epi32(0x000c000b, 0x000a0009, 0x00080007, 0x00060005);
+ const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+ const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+ const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+ const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
+ const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
+ cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
+ StoreUnaligned16(cdf + 4, cdf_vec1);
+
+ cdf[13] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf16(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+ __m128i cdf_vec0 = LoadUnaligned16(cdf);
+ const uint16_t count = cdf[16];
+ const int rate = (count >> 4) + 5;
+ const __m128i cdf_max_probability =
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+
+ const __m128i index =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
+ StoreUnaligned16(cdf, cdf_vec0);
+
+ __m128i cdf_vec1 = LoadUnaligned16(cdf + 8);
+ const __m128i index1 =
+ _mm_set_epi32(0x0010000f, 0x000e000d, 0x000c000b, 0x000a0009);
+ const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+ const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+ const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+ const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
+ const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
+ cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
+ StoreUnaligned16(cdf + 8, cdf_vec1);
+
+ cdf[16] = count + static_cast<uint16_t>(count < 32);
+}
+
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 5, symbol);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 7, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 8, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 9, symbol);
+}
+
+void UpdateCdf11(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 11, symbol);
+}
+
+void UpdateCdf13(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 13, symbol);
+}
+
+void UpdateCdf16(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 16, symbol);
+}
+
+#endif // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#endif // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+inline EntropyDecoder::WindowSize HostToBigEndian(
+ const EntropyDecoder::WindowSize x) {
+ static_assert(sizeof(x) == 4 || sizeof(x) == 8, "");
+#if defined(__GNUC__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ return (sizeof(x) == 8) ? __builtin_bswap64(x) : __builtin_bswap32(x);
+#else
+ return x;
+#endif
+#elif defined(_WIN32)
+ // Note Windows targets are assumed to be little endian.
+ return static_cast<EntropyDecoder::WindowSize>(
+ (sizeof(x) == 8) ? _byteswap_uint64(static_cast<unsigned __int64>(x))
+ : _byteswap_ulong(static_cast<unsigned long>(x)));
+#else
+#error Unknown compiler!
+#endif // defined(__GNUC__)
+}
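+
+// For example, on a little-endian host with a 64-bit WindowSize,
+// HostToBigEndian() returns 0x0807060504030201 for the input
+// 0x0102030405060708; on a big-endian host it returns the input unchanged.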
+
+} // namespace
+
+#if !LIBGAV1_CXX17
+constexpr int EntropyDecoder::kWindowSize; // static.
+#endif
+
+EntropyDecoder::EntropyDecoder(const uint8_t* data, size_t size,
+ bool allow_update_cdf)
+ : data_(data),
+ data_end_(data + size),
+ data_memcpy_end_((size >= sizeof(WindowSize))
+ ? data + size - sizeof(WindowSize) + 1
+ : data),
+ allow_update_cdf_(allow_update_cdf),
+ values_in_range_(kCdfMaxProbability) {
+ if (data_ < data_memcpy_end_) {
+ // This is a simplified version of PopulateBits() which loads 8 extra bits
+ // and skips the unnecessary shifts of value and window_diff_.
+ WindowSize value;
+ memcpy(&value, data_, sizeof(value));
+ data_ += sizeof(value);
+ window_diff_ = HostToBigEndian(value) ^ -1;
+ // Note the initial value of bits_ is larger than kMaxCachedBits as it's
+ // used to restore the most significant 0 bit that would be present after
+ // PopulateBits() when we extract the first symbol value.
+ // As shown in Section 8.2.2 Initialization process for symbol decoder,
+ // which uses a fixed offset to read the symbol values, the most
+ // significant bit is always 0:
+ // The variable numBits is set equal to Min( sz * 8, 15).
+ // The variable buf is read using the f(numBits) parsing process.
+ // The variable paddedBuf is set equal to ( buf << (15 - numBits) ).
+ // The variable SymbolValue is set to ((1 << 15) - 1) ^ paddedBuf.
+ bits_ = kWindowSize - 15;
+ return;
+ }
+ window_diff_ = 0;
+ bits_ = -15;
+ PopulateBits();
+}
+
+// This is similar to the ReadSymbol() implementation but it is optimized based
+// on the following facts:
+// * The probability is fixed at half. So some multiplications can be replaced
+// with bit operations.
+// * Symbol count is fixed at 2.
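+// For example, a half-probability cdf entry has the value 16384, so the
+// generic scaling (((values_in_range_ >> 8) * (16384 >> 6)) >> 1) + 4
+// simplifies to the ((values_in_range_ & kReadBitMask) >> 1) +
+// kMinimumProbabilityPerSymbol expression used below.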
+int EntropyDecoder::ReadBit() {
+ const uint32_t curr =
+ ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ int bit = 1;
+ if (symbol_value >= curr) {
+ values_in_range_ -= curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ bit = 0;
+ } else {
+ values_in_range_ = curr;
+ }
+ NormalizeRange();
+ return bit;
+}
+
+int64_t EntropyDecoder::ReadLiteral(int num_bits) {
+ assert(num_bits <= 32);
+ assert(num_bits > 0);
+ uint32_t literal = 0;
+ int bit = num_bits - 1;
+ do {
+ // ARM can combine a shift by a constant number of bits with certain other
+ // operations, such as OR.
+ // Here is an ARM disassembly example:
+ // orr w1, w0, w1, lsl #1
+ // which left-shifts register w1 by 1 bit and ORs the result with
+ // register w0.
+ // The next 2 lines are equivalent to:
+ // literal |= static_cast<uint32_t>(ReadBit()) << bit;
+ literal <<= 1;
+ literal |= static_cast<uint32_t>(ReadBit());
+ } while (--bit >= 0);
+ return literal;
+}
+
+int EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT const cdf,
+ int symbol_count) {
+ const int symbol = ReadSymbolImpl(cdf, symbol_count);
+ if (allow_update_cdf_) {
+ UpdateCdf(cdf, symbol_count, symbol);
+ }
+ return symbol;
+}
+
+bool EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT cdf) {
+ assert(cdf[1] == 0);
+ const bool symbol = ReadSymbolImpl(cdf[0]) != 0;
+ if (allow_update_cdf_) {
+ const uint16_t count = cdf[2];
+ // rate is computed in the spec as:
+ // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+ // In this case N is 2 and cdf[N] is |count|. So the equation becomes:
+ // 4 + (count > 15) + (count > 31)
+ // Note that the largest value for count is 32 (it is not incremented beyond
+ // 32). So using that information:
+ // count >> 4 is 0 for count from 0 to 15.
+ // count >> 4 is 1 for count from 16 to 31.
+ // count >> 4 is 2 for count == 32.
+ // Now, the equation becomes:
+ // 4 + (count >> 4).
+ // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
+ // with bitwise or. So the final equation is:
+ // 4 | (count >> 4).
+ const int rate = 4 | (count >> 4);
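+ // For example, with count == 32 this is 4 | 2 == 6, the maximum rate.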
+ if (symbol) {
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ } else {
+ cdf[0] -= cdf[0] >> rate;
+ }
+ cdf[2] += static_cast<uint16_t>(count < 32);
+ }
+ return symbol;
+}
+
+bool EntropyDecoder::ReadSymbolWithoutCdfUpdate(uint16_t cdf) {
+ return ReadSymbolImpl(cdf) != 0;
+}
+
+template <int symbol_count>
+int EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT const cdf) {
+ static_assert(symbol_count >= 3 && symbol_count <= 16, "");
+ if (symbol_count == 3 || symbol_count == 4) {
+ return ReadSymbol3Or4(cdf, symbol_count);
+ }
+ int symbol;
+ if (symbol_count == 8) {
+ symbol = ReadSymbolImpl8(cdf);
+ } else if (symbol_count <= 13) {
+ symbol = ReadSymbolImpl(cdf, symbol_count);
+ } else {
+ symbol = ReadSymbolImplBinarySearch(cdf, symbol_count);
+ }
+ if (allow_update_cdf_) {
+ if (symbol_count == 5) {
+ UpdateCdf5(cdf, symbol);
+ } else if (symbol_count == 7) {
+ UpdateCdf7(cdf, symbol);
+ } else if (symbol_count == 8) {
+ UpdateCdf8(cdf, symbol);
+ } else if (symbol_count == 9) {
+ UpdateCdf9(cdf, symbol);
+ } else if (symbol_count == 11) {
+ UpdateCdf11(cdf, symbol);
+ } else if (symbol_count == 13) {
+ UpdateCdf13(cdf, symbol);
+ } else if (symbol_count == 16) {
+ UpdateCdf16(cdf, symbol);
+ } else {
+ UpdateCdf(cdf, symbol_count, symbol);
+ }
+ }
+ return symbol;
+}
+
+int EntropyDecoder::ReadSymbolImpl(const uint16_t* LIBGAV1_RESTRICT const cdf,
+ int symbol_count) {
+ assert(cdf[symbol_count - 1] == 0);
+ --symbol_count;
+ uint32_t curr = values_in_range_;
+ int symbol = -1;
+ uint32_t prev;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count;
+ // Search through the |cdf| array to determine where the scaled cdf value and
+ // |symbol_value| cross over.
+ do {
+ prev = curr;
+ curr = (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) +
+ delta;
+ delta -= kMinimumProbabilityPerSymbol;
+ } while (symbol_value < curr);
+ values_in_range_ = prev - curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ NormalizeRange();
+ return symbol;
+}
+
+int EntropyDecoder::ReadSymbolImplBinarySearch(
+ const uint16_t* LIBGAV1_RESTRICT const cdf, int symbol_count) {
+ assert(cdf[symbol_count - 1] == 0);
+ assert(symbol_count > 1 && symbol_count <= 16);
+ --symbol_count;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ // Search through the |cdf| array to determine where the scaled cdf value and
+ // |symbol_value| cross over. Since the CDFs are sorted, we can use binary
+ // search to do this. Let |symbol| be the index of the first |cdf| array
+ // entry whose scaled cdf value is less than or equal to |symbol_value|. The
+ // binary search maintains the invariant:
+ // low <= symbol <= high + 1
+ // and terminates when low == high + 1.
+ int low = 0;
+ int high = symbol_count - 1;
+ // The binary search maintains the invariants that |prev| is the scaled cdf
+ // value for low - 1 and |curr| is the scaled cdf value for high + 1. (By
+ // convention, the scaled cdf value for -1 is values_in_range_.) When the
+ // binary search terminates, |prev| is the scaled cdf value for symbol - 1
+ // and |curr| is the scaled cdf value for |symbol|.
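+ // For example (an illustrative bound), with symbol_count == 16 (15 after
+ // the decrement above) the loop below starts with low == 0 and high == 14
+ // and terminates after at most four iterations, versus up to 15 comparisons
+ // for the linear search in ReadSymbolImpl().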
+ uint32_t prev = values_in_range_;
+ uint32_t curr = 0;
+ const uint32_t values_in_range_shifted = values_in_range_ >> 8;
+ do {
+ const int mid = DivideBy2(low + high);
+ const uint32_t scaled_cdf =
+ ScaleCdf(values_in_range_shifted, cdf, mid, symbol_count);
+ if (symbol_value < scaled_cdf) {
+ low = mid + 1;
+ prev = scaled_cdf;
+ } else {
+ high = mid - 1;
+ curr = scaled_cdf;
+ }
+ } while (low <= high);
+ assert(low == high + 1);
+ // At this point, |low| is the symbol that has been decoded.
+ values_in_range_ = prev - curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ NormalizeRange();
+ return low;
+}
+
+int EntropyDecoder::ReadSymbolImpl(uint16_t cdf) {
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ const uint32_t curr =
+ (((values_in_range_ >> 8) * (cdf >> kCdfPrecision)) >> 1) +
+ kMinimumProbabilityPerSymbol;
+ const int symbol = static_cast<int>(symbol_value < curr);
+ if (symbol == 1) {
+ values_in_range_ = curr;
+ } else {
+ values_in_range_ -= curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ }
+ NormalizeRange();
+ return symbol;
+}
+
+// Equivalent to ReadSymbol(cdf, [3,4]), with the ReadSymbolImpl and UpdateCdf
+// calls inlined.
+int EntropyDecoder::ReadSymbol3Or4(uint16_t* LIBGAV1_RESTRICT const cdf,
+ const int symbol_count) {
+ assert(cdf[symbol_count - 1] == 0);
+ uint32_t curr = values_in_range_;
+ uint32_t prev;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ uint32_t delta = kMinimumProbabilityPerSymbol * (symbol_count - 1);
+ const uint32_t values_in_range_shifted = values_in_range_ >> 8;
+
+ // Search through the |cdf| array to determine where the scaled cdf value and
+ // |symbol_value| cross over. If allow_update_cdf_ is true, update the |cdf|
+ // array.
+ //
+ // The original code is:
+ //
+ // int symbol = -1;
+ // do {
+ // prev = curr;
+ // curr =
+ // ((values_in_range_shifted * (cdf[++symbol] >> kCdfPrecision)) >> 1)
+ // + delta;
+ // delta -= kMinimumProbabilityPerSymbol;
+ // } while (symbol_value < curr);
+ // if (allow_update_cdf_) {
+ // UpdateCdf(cdf, [3,4], symbol);
+ // }
+ //
+ // The do-while loop is unrolled with three or four iterations, and the
+ // UpdateCdf call is inlined and merged into the iterations.
+ int symbol = 0;
+ // Iteration 0.
+ prev = curr;
+ curr =
+ ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+ if (symbol_value >= curr) {
+ // symbol == 0.
+ if (allow_update_cdf_) {
+ // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/0).
+ const uint16_t count = cdf[symbol_count];
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+ if (symbol_count == 4) {
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+ // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
+ // NEON code is slower. Consider using the C version if __arm__ is
+ // defined.
+ // 2. The ARM NEON code (compiled for arm64) is slightly slower on
+ // Samsung Galaxy S8+ (SM-G955FD).
+ uint16x4_t cdf_vec = vld1_u16(cdf);
+ const int16x4_t negative_rate = vdup_n_s16(-rate);
+ const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
+ cdf_vec = vsub_u16(cdf_vec, delta);
+ vst1_u16(cdf, cdf_vec);
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ __m128i cdf_vec = LoadLo8(cdf);
+ const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+ StoreLo8(cdf, cdf_vec);
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ cdf[2] -= cdf[2] >> rate;
+#endif
+ } else { // symbol_count == 3.
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ }
+ }
+ goto found;
+ }
+ ++symbol;
+ delta -= kMinimumProbabilityPerSymbol;
+ // Iteration 1.
+ prev = curr;
+ curr =
+ ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+ if (symbol_value >= curr) {
+ // symbol == 1.
+ if (allow_update_cdf_) {
+ // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/1).
+ const uint16_t count = cdf[symbol_count];
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ if (symbol_count == 4) cdf[2] -= cdf[2] >> rate;
+ }
+ goto found;
+ }
+ ++symbol;
+ if (symbol_count == 4) {
+ delta -= kMinimumProbabilityPerSymbol;
+ // Iteration 2.
+ prev = curr;
+ curr = ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) +
+ delta;
+ if (symbol_value >= curr) {
+ // symbol == 2.
+ if (allow_update_cdf_) {
+ // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
+ const uint16_t count = cdf[4];
+ cdf[4] += static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 5;
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ cdf[2] -= cdf[2] >> rate;
+ }
+ goto found;
+ }
+ ++symbol;
+ }
+ // |delta| is 0 for the last iteration.
+ // Iteration 2 (symbol_count == 3) or 3 (symbol_count == 4).
+ prev = curr;
+ // Since cdf[symbol_count - 1] is 0 and |delta| is 0, |curr| is also 0.
+ curr = 0;
+ // symbol == [2,3].
+ if (allow_update_cdf_) {
+ // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/[2,3]).
+ const uint16_t count = cdf[symbol_count];
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+ const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count == 4);
+ if (symbol_count == 4) {
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+ // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
+ // code is a tiny bit slower. Consider using the C version if __arm__ is
+ // defined.
+ uint16x4_t cdf_vec = vld1_u16(cdf);
+ const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
+ const int16x4_t diff =
+ vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
+ const int16x4_t negative_rate = vdup_n_s16(-rate);
+ const uint16x4_t delta =
+ vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
+ cdf_vec = vadd_u16(cdf_vec, delta);
+ vst1_u16(cdf, cdf_vec);
+ cdf[3] = 0;
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ __m128i cdf_vec = LoadLo8(cdf);
+ const __m128i cdf_max_probability =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+ const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_add_epi16(cdf_vec, delta);
+ StoreLo8(cdf, cdf_vec);
+ cdf[3] = 0;
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
+#endif
+ } else { // symbol_count == 3.
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ }
+ }
+found:
+ // End of unrolled do-while loop.
+
+ values_in_range_ = prev - curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ NormalizeRange();
+ return symbol;
+}
+
+int EntropyDecoder::ReadSymbolImpl8(
+ const uint16_t* LIBGAV1_RESTRICT const cdf) {
+ assert(cdf[7] == 0);
+ uint32_t curr = values_in_range_;
+ uint32_t prev;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ uint32_t delta = kMinimumProbabilityPerSymbol * 7;
+ // Search through the |cdf| array to determine where the scaled cdf value and
+ // |symbol_value| cross over.
+ //
+ // The original code is:
+ //
+ // int symbol = -1;
+ // do {
+ // prev = curr;
+ // curr =
+ // (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1)
+ // + delta;
+ // delta -= kMinimumProbabilityPerSymbol;
+ // } while (symbol_value < curr);
+ //
+ // The do-while loop is unrolled with eight iterations.
+ int symbol = 0;
+
+#define READ_SYMBOL_ITERATION \
+ prev = curr; \
+ curr = (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + \
+ delta; \
+ if (symbol_value >= curr) goto found; \
+ ++symbol; \
+ delta -= kMinimumProbabilityPerSymbol
+
+ READ_SYMBOL_ITERATION; // Iteration 0.
+ READ_SYMBOL_ITERATION; // Iteration 1.
+ READ_SYMBOL_ITERATION; // Iteration 2.
+ READ_SYMBOL_ITERATION; // Iteration 3.
+ READ_SYMBOL_ITERATION; // Iteration 4.
+ READ_SYMBOL_ITERATION; // Iteration 5.
+
+ // The last two iterations can be simplified, so they don't use the
+ // READ_SYMBOL_ITERATION macro.
+#undef READ_SYMBOL_ITERATION
+
+ // Iteration 6.
+ prev = curr;
+ curr =
+ (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+ if (symbol_value >= curr) goto found; // symbol == 6.
+ ++symbol;
+ // |delta| is 0 for the last iteration.
+ // Iteration 7.
+ prev = curr;
+ // Since cdf[7] is 0 and |delta| is 0, |curr| is also 0.
+ curr = 0;
+ // symbol == 7.
+found:
+ // End of unrolled do-while loop.
+
+ values_in_range_ = prev - curr;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+ NormalizeRange();
+ return symbol;
+}
+
+void EntropyDecoder::PopulateBits() {
+ constexpr int kMaxCachedBits = kWindowSize - 16;
+#if defined(__aarch64__)
+ // Fast path: read eight bytes and add the first six bytes to window_diff_.
+ // This fast path makes the following assumptions.
+ // 1. We assume that unaligned load of uint64_t is fast.
+ // 2. When there are enough bytes in data_, the for loop below reads 6 or 7
+ // bytes depending on the value of bits_. This fast path always reads 6
+ // bytes, which results in more calls to PopulateBits(). We assume that
+ // making more calls to a faster PopulateBits() is overall a win.
+ // NOTE: Although this fast path could also be used on x86_64, it hurts
+ // performance there (measured on a Lenovo ThinkStation P920 running Linux)
+ // for a reason that is still unknown, so the fast path is only used on
+ // arm64.
+ static_assert(kWindowSize == 64, "");
+ if (data_ < data_memcpy_end_) {
+ uint64_t value;
+ // arm64 supports unaligned loads, so this memcpy call is compiled to a
+ // single ldr instruction.
+ memcpy(&value, data_, sizeof(value));
+ data_ += kMaxCachedBits >> 3;
+ value = HostToBigEndian(value) ^ -1;
+ value >>= kWindowSize - kMaxCachedBits;
+ window_diff_ = value | (window_diff_ << kMaxCachedBits);
+ bits_ += kMaxCachedBits;
+ return;
+ }
+#endif
+
+ const uint8_t* data = data_;
+ int bits = bits_;
+ WindowSize window_diff = window_diff_;
+
+ int count = kWindowSize - 9 - (bits + 15);
+ // The fast path above, if compiled, would cause clang 8.0.7 to vectorize
+ // this loop. Since -15 <= bits_ <= -1, this loop has at most 6 or 7
+ // iterations when WindowSize is 64 bits. So it is not profitable to
+ // vectorize this loop. Note that clang 8.0.7 does not vectorize this loop if
+ // the fast path above is not compiled.
+#ifdef __clang__
+#pragma clang loop vectorize(disable) interleave(disable)
+#endif
+ for (; count >= 0 && data < data_end_; count -= 8) {
+ const uint8_t value = *data++ ^ -1;
+ window_diff = static_cast<WindowSize>(value) | (window_diff << 8);
+ bits += 8;
+ }
+ assert(bits <= kMaxCachedBits);
+ if (data == data_end_) {
+ // Shift in some 1s. This is equivalent to providing fake 0 data bits.
+ window_diff = ((window_diff + 1) << (kMaxCachedBits - bits)) - 1;
+ bits = kMaxCachedBits;
+ }
+
+ data_ = data;
+ bits_ = bits;
+ window_diff_ = window_diff;
+}
+
+void EntropyDecoder::NormalizeRange() {
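+ // Note (illustrative): values_in_range_ is less than 65536, so
+ // FloorLog2(values_in_range_) is in [0, 15] and (15 ^ x) == (15 - x), the
+ // left shift that restores values_in_range_ to [32768, 65536).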
+ const int bits_used = 15 ^ FloorLog2(values_in_range_);
+ bits_ -= bits_used;
+ values_in_range_ <<= bits_used;
+ if (bits_ < 0) PopulateBits();
+}
+
+// Explicit instantiations.
+template int EntropyDecoder::ReadSymbol<3>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<4>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<5>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<6>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<7>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<8>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<9>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<10>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<11>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<12>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<13>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<14>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<16>(uint16_t* cdf);
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
+#define LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "src/utils/bit_reader.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+class EntropyDecoder final : public BitReader {
+ public:
+ // WindowSize must be an unsigned integer type with at least 32 bits. Use the
+ // largest type with fast arithmetic. size_t should meet these requirements.
+ using WindowSize = size_t;
+
+ EntropyDecoder(const uint8_t* data, size_t size, bool allow_update_cdf);
+ ~EntropyDecoder() override = default;
+
+ // Move only.
+ EntropyDecoder(EntropyDecoder&& rhs) noexcept;
+ EntropyDecoder& operator=(EntropyDecoder&& rhs) noexcept;
+
+ int ReadBit() override;
+ int64_t ReadLiteral(int num_bits) override;
+ // ReadSymbol() calls for which the |symbol_count| is only known at runtime
+ // will use this variant.
+ int ReadSymbol(uint16_t* cdf, int symbol_count);
+ // ReadSymbol() calls for which the |symbol_count| is equal to 2 (boolean
+ // symbols) will use this variant.
+ bool ReadSymbol(uint16_t* cdf);
+ bool ReadSymbolWithoutCdfUpdate(uint16_t cdf);
+ // Use either linear search or binary search for decoding the symbol depending
+ // on |symbol_count|. ReadSymbol calls for which the |symbol_count| is known
+ // at compile time will use this variant.
+ template <int symbol_count>
+ int ReadSymbol(uint16_t* cdf);
+
+ private:
+ static constexpr int kWindowSize = static_cast<int>(sizeof(WindowSize)) * 8;
+ static_assert(kWindowSize >= 32, "");
+
+ // Reads a symbol using the |cdf| table which contains the probabilities of
+ // each symbol. On a high level, this function does the following:
+ // 1) Scale the |cdf| values.
+ // 2) Find the index in the |cdf| array where the scaled CDF value crosses
+ // the modified |window_diff_| threshold.
+ // 3) That index is the symbol that has been decoded.
+ // 4) Update |window_diff_| and |values_in_range_| based on the symbol that
+ // has been decoded.
+ inline int ReadSymbolImpl(const uint16_t* cdf, int symbol_count);
+ // Similar to ReadSymbolImpl but it uses binary search to perform step 2 in
+ // the comment above. As of now, this function is called when |symbol_count|
+ // is greater than or equal to 14.
+ inline int ReadSymbolImplBinarySearch(const uint16_t* cdf, int symbol_count);
+ // Specialized implementation of ReadSymbolImpl based on the fact that
+ // symbol_count == 2.
+ inline int ReadSymbolImpl(uint16_t cdf);
+ // ReadSymbolN is a specialization of ReadSymbol for symbol_count == N.
+ LIBGAV1_ALWAYS_INLINE int ReadSymbol3Or4(uint16_t* cdf, int symbol_count);
+ // ReadSymbolImplN is a specialization of ReadSymbolImpl for
+ // symbol_count == N.
+ LIBGAV1_ALWAYS_INLINE int ReadSymbolImpl8(const uint16_t* cdf);
+ inline void PopulateBits();
+ // Normalizes the range so that 32768 <= |values_in_range_| < 65536. Also
+ // calls PopulateBits() if necessary.
+ inline void NormalizeRange();
+
+ const uint8_t* data_;
+ const uint8_t* const data_end_;
+ // If |data_| < |data_memcpy_end_|, then we can read sizeof(WindowSize) bytes
+ // from |data_|. Note with sizeof(WindowSize) == 4 this is only used in the
+ // constructor, not PopulateBits().
+ const uint8_t* const data_memcpy_end_;
+ const bool allow_update_cdf_;
+ // Number of cached bits of data in the current value.
+ int bits_;
+ // Number of values in the current range. Declared as uint32_t for better
+ // performance but only the lower 16 bits are used.
+ uint32_t values_in_range_;
+ // The difference between the high end of the current range and the coded
+ // value minus 1. The 16 bits above |bits_| of this variable are used to
+ // decode the next symbol. It is filled in whenever |bits_| is less than 0.
+ // Note this implementation differs from the spec as it trades the need to
+ // shift in 1s in NormalizeRange() with an extra shift in PopulateBits(),
+ // which occurs less frequently.
+ WindowSize window_diff_;
+};
+
+extern template int EntropyDecoder::ReadSymbol<3>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<4>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<5>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<6>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<7>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<8>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<9>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<10>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<11>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<12>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<13>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<14>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<16>(uint16_t* cdf);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/entropy_decoder.h"
+
+#include <cstdint>
+#include <cstdio>
+
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+#include "src/utils/entropy_decoder_test_data.inc"
+
+class EntropyDecoderTest : public testing::Test {
+ protected:
+ // If compile_time is true, tests
+ // bool EntropyDecoder::ReadSymbol(uint16_t* cdf).
+ // Otherwise, tests
+ // int EntropyDecoder::ReadSymbol(uint16_t* cdf, int symbol_count)
+ // with symbol_count=2.
+ template <bool compile_time>
+ void TestReadSymbolBoolean(int num_runs);
+
+ // For N = 3..16 (except 15):
+ // template <bool compile_time>
+ // void TestReadSymbolN(int num_runs);
+ //
+ // If compile_time is true, tests
+ // int EntropyDecoder::ReadSymbol<N>(uint16_t* const cdf).
+ // Otherwise, tests
+ // int EntropyDecoder::ReadSymbol(uint16_t* cdf, int symbol_count)
+ // with symbol_count=N.
+ //
+ // NOTE: symbol_count=15 is not tested because AV1 does not use it.
+ template <bool compile_time>
+ void TestReadSymbol3(int num_runs);
+
+ template <bool compile_time>
+ void TestReadSymbol4(int num_runs);
+
+ template <bool compile_time>
+ void TestReadSymbol5(int num_runs);
+
+ template <bool compile_time>
+ void TestReadSymbol6(int num_runs);
+
+ template <bool compile_time>
+ void TestReadSymbol7(int num_runs);
+
+ template <bool compile_time>
+ void TestReadSymbol8(int num_runs);
+
+ template <bool compile_time>
+ void TestReadSymbol9(int num_runs);
+
+ template <bool compile_time>
+ void TestReadSymbol10(int num_runs);
+
+ template <bool compile_time>
+ void TestReadSymbol11(int num_runs);
+
+ template <bool compile_time>
+ void TestReadSymbol12(int num_runs);
+
+ template <bool compile_time>
+ void TestReadSymbol13(int num_runs);
+
+ template <bool compile_time>
+ void TestReadSymbol14(int num_runs);
+
+ template <bool compile_time>
+ void TestReadSymbol16(int num_runs);
+};
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbolBoolean(int num_runs) {
+ static constexpr int kSymbols[4][4] = {{0, 0, 1, 1}, //
+ {0, 1, 1, 0}, //
+ {1, 0, 1, 0}, //
+ {1, 0, 0, 1}};
+ absl::Duration elapsed_time;
+ bool symbols[1024 * 4 * 4];
+ for (int run = 0; run < num_runs; ++run) {
+ EntropyDecoder reader(kBytesTestReadSymbolBoolean,
+ kNumBytesTestReadSymbolBoolean,
+ /*allow_update_cdf=*/true);
+ uint16_t cdf[4][3] = {
+ {16384, 0, 0},
+ {32768 - 8386, 0, 0},
+ {32768 - 24312, 0, 0},
+ {16384, 0, 0},
+ };
+ const absl::Time start = absl::Now();
+ int index = 0;
+ for (int i = 0; i < 1024; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert)
+ if (compile_time) {
+ symbols[index++] = reader.ReadSymbol(cdf[k]);
+ } else {
+ symbols[index++] = reader.ReadSymbol(cdf[k], 2) != 0;
+ }
+ }
+ }
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ if (compile_time) {
+ printf("TestReadSymbolBooleanCompileTime(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ } else {
+ printf("TestReadSymbolBoolean(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+
+ int index = 0;
+ for (int i = 0; i < 1024; ++i) {
+ for (int j = 0; j < 4; ++j) { // NOLINT(modernize-loop-convert)
+ for (int k = 0; k < 4; ++k) {
+ ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+ }
+ }
+ }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol3(int num_runs) {
+ static constexpr int kSymbols[6][4] = {{0, 2, 1, 2}, //
+ {1, 1, 2, 1}, //
+ {2, 0, 0, 0}, //
+ {0, 2, 0, 2}, //
+ {1, 2, 1, 0}, //
+ {2, 1, 1, 0}};
+ absl::Duration elapsed_time;
+ int symbols[1024 * 6 * 4];
+ for (int run = 0; run < num_runs; ++run) {
+ EntropyDecoder reader(kBytesTestReadSymbol3, kNumBytesTestReadSymbol3,
+ /*allow_update_cdf=*/true);
+ uint16_t cdf[4][4] = {
+ // pdf: 1/3, 1/3, 1/3
+ {32768 - 10923, 32768 - 21845, 0, 0},
+ // pdf: 1/6, 2/6, 3/6
+ {32768 - 5461, 32768 - 16384, 0, 0},
+ // pdf: 2/6, 3/6, 1/6
+ {32768 - 10923, 32768 - 27307, 0, 0},
+ // pdf: 3/6, 1/6, 2/6
+ {32768 - 16384, 32768 - 21845, 0, 0},
+ };
+ const absl::Time start = absl::Now();
+ int index = 0;
+ for (int i = 0; i < 1024; ++i) {
+ for (int j = 0; j < 6; ++j) {
+ for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert)
+ if (compile_time) {
+ symbols[index++] = reader.ReadSymbol<3>(cdf[k]);
+ } else {
+ symbols[index++] = reader.ReadSymbol(cdf[k], 3);
+ }
+ }
+ }
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ if (compile_time) {
+ printf("TestReadSymbol3CompileTime(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ } else {
+ printf("TestReadSymbol3(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+
+ int index = 0;
+ for (int i = 0; i < 1024; ++i) {
+ for (int j = 0; j < 6; ++j) { // NOLINT(modernize-loop-convert)
+ for (int k = 0; k < 4; ++k) {
+ ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+ }
+ }
+ }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol4(int num_runs) {
+ static constexpr int kSymbols[8][4] = {{0, 0, 3, 3}, //
+ {0, 0, 2, 2}, //
+ {1, 1, 0, 0}, //
+ {1, 2, 1, 1}, //
+ {2, 2, 3, 2}, //
+ {2, 3, 2, 1}, //
+ {3, 3, 0, 0}, //
+ {3, 3, 1, 1}};
+ absl::Duration elapsed_time;
+ int symbols[1024 * 8 * 4];
+ for (int run = 0; run < num_runs; ++run) {
+ EntropyDecoder reader(kBytesTestReadSymbol4, kNumBytesTestReadSymbol4,
+ /*allow_update_cdf=*/true);
+ uint16_t cdf[4][5] = {
+      // pmf: 1/4, 1/4, 1/4, 1/4
+ {32768 - 8192, 32768 - 16384, 32768 - 24576, 0, 0},
+      // pmf: 2/8, 1/8, 2/8, 3/8
+ {32768 - 8192, 32768 - 12288, 32768 - 20480, 0, 0},
+      // pmf: 1/4, 1/4, 1/4, 1/4
+ {32768 - 8192, 32768 - 16384, 32768 - 24576, 0, 0},
+      // pmf: 2/8, 3/8, 2/8, 1/8
+ {32768 - 8192, 32768 - 20480, 32768 - 28672, 0, 0},
+ };
+ const absl::Time start = absl::Now();
+ int index = 0;
+ for (int i = 0; i < 1024; ++i) {
+ for (int j = 0; j < 8; ++j) {
+ for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert)
+ if (compile_time) {
+ symbols[index++] = reader.ReadSymbol<4>(cdf[k]);
+ } else {
+ symbols[index++] = reader.ReadSymbol(cdf[k], 4);
+ }
+ }
+ }
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ if (compile_time) {
+ printf("TestReadSymbol4CompileTime(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ } else {
+ printf("TestReadSymbol4(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+
+ int index = 0;
+ for (int i = 0; i < 1024; ++i) {
+ for (int j = 0; j < 8; ++j) { // NOLINT(modernize-loop-convert)
+ for (int k = 0; k < 4; ++k) {
+ ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+ }
+ }
+ }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol5(int num_runs) {
+ static constexpr int kSymbols[10][4] = {{0, 0, 4, 4}, //
+ {0, 1, 3, 3}, //
+ {1, 2, 2, 2}, //
+ {1, 3, 1, 1}, //
+ {2, 4, 0, 0}, //
+ {2, 0, 4, 3}, //
+ {3, 1, 3, 2}, //
+ {3, 2, 2, 1}, //
+ {4, 3, 1, 2}, //
+ {4, 0, 4, 2}};
+ absl::Duration elapsed_time;
+ int symbols[320 * 10 * 4];
+ for (int run = 0; run < num_runs; ++run) {
+ EntropyDecoder reader(kBytesTestReadSymbol5, kNumBytesTestReadSymbol5,
+ /*allow_update_cdf=*/true);
+ uint16_t cdf[4][6] = {
+      // pmf: 1/5, 1/5, 1/5, 1/5, 1/5
+ {32768 - 6554, 32768 - 13107, 32768 - 19661, 32768 - 26214, 0, 0},
+      // pmf: 3/10, 2/10, 2/10, 2/10, 1/10
+ {32768 - 9830, 32768 - 16384, 32768 - 22938, 32768 - 29491, 0, 0},
+      // pmf: 1/10, 2/10, 2/10, 2/10, 3/10
+ {32768 - 3277, 32768 - 9830, 32768 - 16384, 32768 - 22938, 0, 0},
+      // pmf: 1/10, 2/10, 4/10, 2/10, 1/10
+ {32768 - 3277, 32768 - 9830, 32768 - 22938, 32768 - 29491, 0, 0},
+ };
+ const absl::Time start = absl::Now();
+ int index = 0;
+ for (int i = 0; i < 320; ++i) {
+ for (int j = 0; j < 10; ++j) {
+ for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert)
+ if (compile_time) {
+ symbols[index++] = reader.ReadSymbol<5>(cdf[k]);
+ } else {
+ symbols[index++] = reader.ReadSymbol(cdf[k], 5);
+ }
+ }
+ }
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ if (compile_time) {
+ printf("TestReadSymbol5CompileTime(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ } else {
+ printf("TestReadSymbol5(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+
+ int index = 0;
+ for (int i = 0; i < 320; ++i) {
+ for (int j = 0; j < 10; ++j) { // NOLINT(modernize-loop-convert)
+ for (int k = 0; k < 4; ++k) {
+ ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+ }
+ }
+ }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol6(int num_runs) {
+ static constexpr int kSymbols[12][4] = {{0, 0, 5, 5}, //
+ {0, 1, 4, 4}, //
+ {1, 2, 3, 3}, //
+ {1, 3, 2, 2}, //
+ {2, 4, 1, 1}, //
+ {2, 5, 0, 0}, //
+ {3, 0, 5, 4}, //
+ {3, 1, 4, 3}, //
+ {4, 2, 3, 2}, //
+ {4, 3, 2, 1}, //
+ {5, 4, 1, 3}, //
+ {5, 0, 5, 2}};
+ absl::Duration elapsed_time;
+ int symbols[256 * 12 * 4];
+ for (int run = 0; run < num_runs; ++run) {
+ EntropyDecoder reader(kBytesTestReadSymbol6, kNumBytesTestReadSymbol6,
+ /*allow_update_cdf=*/true);
+ uint16_t cdf[4][7] = {
+ // pmf: 1/6, 1/6, 1/6, 1/6, 1/6, 1/6
+ {32768 - 5461, 32768 - 10923, 32768 - 16384, 32768 - 21845,
+ 32768 - 27307, 0, 0},
+ // pmf: 3/12, 2/12, 2/12, 2/12, 2/12, 1/12
+ {32768 - 8192, 32768 - 13653, 32768 - 19115, 32768 - 24576,
+ 32768 - 30037, 0, 0},
+ // pmf: 1/12, 2/12, 2/12, 2/12, 2/12, 3/12
+ {32768 - 2731, 32768 - 8192, 32768 - 13653, 32768 - 19115,
+ 32768 - 24576, 0, 0},
+ // pmf: 1/12, 2/12, 3/12, 3/12, 2/12, 1/12
+ {32768 - 2731, 32768 - 8192, 32768 - 16384, 32768 - 24576,
+ 32768 - 30037, 0, 0},
+ };
+ const absl::Time start = absl::Now();
+ int index = 0;
+ for (int i = 0; i < 256; ++i) {
+ for (int j = 0; j < 12; ++j) {
+ for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert)
+ if (compile_time) {
+ symbols[index++] = reader.ReadSymbol<6>(cdf[k]);
+ } else {
+ symbols[index++] = reader.ReadSymbol(cdf[k], 6);
+ }
+ }
+ }
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ if (compile_time) {
+ printf("TestReadSymbol6CompileTime(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ } else {
+ printf("TestReadSymbol6(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+
+ int index = 0;
+ for (int i = 0; i < 256; ++i) {
+ for (int j = 0; j < 12; ++j) { // NOLINT(modernize-loop-convert)
+ for (int k = 0; k < 4; ++k) {
+ ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+ }
+ }
+ }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol7(int num_runs) {
+ static constexpr int kSymbols[14][4] = {{0, 4, 6, 3}, //
+ {1, 5, 5, 2}, //
+ {2, 6, 4, 1}, //
+ {3, 0, 3, 0}, //
+ {4, 1, 2, 6}, //
+ {5, 2, 1, 5}, //
+ {6, 3, 0, 4}, //
+ {0, 0, 6, 5}, //
+ {2, 1, 4, 3}, //
+ {4, 3, 6, 1}, //
+ {6, 5, 2, 4}, //
+ {1, 0, 5, 2}, //
+ {3, 2, 3, 2}, //
+ {5, 4, 5, 3}};
+ absl::Duration elapsed_time;
+ int symbols[1024 * 14 * 4];
+ for (int run = 0; run < num_runs; ++run) {
+ EntropyDecoder reader(kBytesTestReadSymbol7, kNumBytesTestReadSymbol7,
+ /*allow_update_cdf=*/true);
+ uint16_t cdf[4][8] = {
+      // pmf: 1/7, 1/7, 1/7, 1/7, 1/7, 1/7, 1/7
+ {32768 - 4681, 32768 - 9362, 32768 - 14043, 32768 - 18725,
+ 32768 - 23406, 32768 - 28087, 0, 0},
+      // pmf: 3/14, 2/14, 2/14, 2/14, 2/14, 2/14, 1/14
+ {32768 - 7022, 32768 - 11703, 32768 - 16384, 32768 - 21065,
+ 32768 - 25746, 32768 - 30427, 0, 0},
+      // pmf: 1/14, 1/14, 2/14, 2/14, 2/14, 3/14, 3/14
+ {32768 - 2341, 32768 - 4681, 32768 - 9362, 32768 - 14043, 32768 - 18725,
+ 32768 - 25746, 0, 0},
+      // pmf: 1/14, 2/14, 3/14, 3/14, 2/14, 2/14, 1/14
+ {32768 - 2341, 32768 - 7022, 32768 - 14043, 32768 - 21065,
+ 32768 - 25746, 32768 - 30427, 0, 0},
+ };
+ const absl::Time start = absl::Now();
+ int index = 0;
+ for (int i = 0; i < 1024; ++i) {
+ for (int j = 0; j < 14; ++j) {
+ for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert)
+ if (compile_time) {
+ symbols[index++] = reader.ReadSymbol<7>(cdf[k]);
+ } else {
+ symbols[index++] = reader.ReadSymbol(cdf[k], 7);
+ }
+ }
+ }
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ if (compile_time) {
+ printf("TestReadSymbol7CompileTime(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ } else {
+ printf("TestReadSymbol7(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+
+ int index = 0;
+ for (int i = 0; i < 1024; ++i) {
+ for (int j = 0; j < 14; ++j) { // NOLINT(modernize-loop-convert)
+ for (int k = 0; k < 4; ++k) {
+ ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+ }
+ }
+ }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol8(int num_runs) {
+ static constexpr int kSymbols[16][4] = {{0, 4, 7, 3}, //
+ {1, 5, 6, 2}, //
+ {2, 6, 5, 1}, //
+ {3, 7, 4, 0}, //
+ {4, 0, 3, 7}, //
+ {5, 1, 2, 6}, //
+ {6, 2, 1, 5}, //
+ {7, 3, 0, 4}, //
+ {0, 0, 6, 5}, //
+ {2, 1, 4, 3}, //
+ {4, 3, 6, 4}, //
+ {6, 5, 2, 2}, //
+ {1, 0, 7, 3}, //
+ {3, 2, 5, 5}, //
+ {5, 4, 7, 2}, //
+ {7, 6, 3, 4}};
+ absl::Duration elapsed_time;
+ int symbols[1024 * 16 * 4];
+ for (int run = 0; run < num_runs; ++run) {
+ EntropyDecoder reader(kBytesTestReadSymbol8, kNumBytesTestReadSymbol8,
+ /*allow_update_cdf=*/true);
+ uint16_t cdf[4][9] = {
+      // pmf: 1/8, 1/8, 1/8, 1/8, 1/8, 1/8, 1/8, 1/8
+ {32768 - 4096, 32768 - 8192, 32768 - 12288, 32768 - 16384,
+ 32768 - 20480, 32768 - 24576, 32768 - 28672, 0, 0},
+      // pmf: 3/16, 2/16, 2/16, 2/16, 2/16, 2/16, 2/16, 1/16
+ {32768 - 6144, 32768 - 10240, 32768 - 14336, 32768 - 18432,
+ 32768 - 22528, 32768 - 26624, 32768 - 30720, 0, 0},
+      // pmf: 1/16, 1/16, 2/16, 2/16, 2/16, 2/16, 3/16, 3/16
+ {32768 - 2048, 32768 - 4096, 32768 - 8192, 32768 - 12288, 32768 - 16384,
+ 32768 - 20480, 32768 - 26624, 0, 0},
+      // pmf: 1/16, 1/16, 3/16, 3/16, 3/16, 3/16, 1/16, 1/16
+ {32768 - 2048, 32768 - 4096, 32768 - 10240, 32768 - 16384,
+ 32768 - 22528, 32768 - 28672, 32768 - 30720, 0, 0},
+ };
+ const absl::Time start = absl::Now();
+ int index = 0;
+ for (int i = 0; i < 1024; ++i) {
+ for (int j = 0; j < 16; ++j) {
+ for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert)
+ if (compile_time) {
+ symbols[index++] = reader.ReadSymbol<8>(cdf[k]);
+ } else {
+ symbols[index++] = reader.ReadSymbol(cdf[k], 8);
+ }
+ }
+ }
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ if (compile_time) {
+ printf("TestReadSymbol8CompileTime(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ } else {
+ printf("TestReadSymbol8(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+
+ int index = 0;
+ for (int i = 0; i < 1024; ++i) {
+ for (int j = 0; j < 16; ++j) { // NOLINT(modernize-loop-convert)
+ for (int k = 0; k < 4; ++k) {
+ ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+ }
+ }
+ }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol9(int num_runs) {
+ static constexpr int kSymbols[18][4] = {{0, 4, 8, 3}, //
+ {1, 5, 7, 2}, //
+ {2, 6, 6, 1}, //
+ {3, 7, 5, 0}, //
+ {4, 8, 4, 8}, //
+ {5, 0, 3, 7}, //
+ {6, 1, 2, 6}, //
+ {7, 2, 1, 5}, //
+ {8, 3, 0, 4}, //
+ {0, 0, 8, 7}, //
+ {2, 1, 6, 5}, //
+ {4, 3, 4, 3}, //
+ {6, 5, 2, 1}, //
+ {8, 7, 7, 6}, //
+ {1, 0, 5, 4}, //
+ {3, 2, 3, 2}, //
+ {5, 4, 1, 4}, //
+ {7, 6, 8, 4}};
+ absl::Duration elapsed_time;
+ int symbols[128 * 18 * 4];
+ for (int run = 0; run < num_runs; ++run) {
+ EntropyDecoder reader(kBytesTestReadSymbol9, kNumBytesTestReadSymbol9,
+ /*allow_update_cdf=*/true);
+ uint16_t cdf[4][10] = {
+ // pmf: 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9
+ {32768 - 3641, 32768 - 7282, 32768 - 10923, 32768 - 14564,
+ 32768 - 18204, 32768 - 21845, 32768 - 25486, 32768 - 29127, 0, 0},
+ // pmf: 3/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 1/18
+ {32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 16384,
+ 32768 - 20025, 32768 - 23666, 32768 - 27307, 32768 - 30948, 0, 0},
+ // pmf: 1/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 3/18
+ {32768 - 1820, 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 16384,
+ 32768 - 20025, 32768 - 23666, 32768 - 27307, 0, 0},
+ // pmf: 1/18, 2/18, 2/18, 2/18, 4/18, 2/18, 2/18, 2/18, 1/18
+ {32768 - 1820, 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 20025,
+ 32768 - 23666, 32768 - 27307, 32768 - 30948, 0, 0},
+ };
+ const absl::Time start = absl::Now();
+ int index = 0;
+ for (int i = 0; i < 128; ++i) {
+ for (int j = 0; j < 18; ++j) {
+ for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert)
+ if (compile_time) {
+ symbols[index++] = reader.ReadSymbol<9>(cdf[k]);
+ } else {
+ symbols[index++] = reader.ReadSymbol(cdf[k], 9);
+ }
+ }
+ }
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ if (compile_time) {
+ printf("TestReadSymbol9CompileTime(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ } else {
+ printf("TestReadSymbol9(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+
+ int index = 0;
+ for (int i = 0; i < 128; ++i) {
+ for (int j = 0; j < 18; ++j) { // NOLINT(modernize-loop-convert)
+ for (int k = 0; k < 4; ++k) {
+ ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+ }
+ }
+ }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol10(int num_runs) {
+ static constexpr int kSymbols[20][4] = {{0, 5, 9, 4}, //
+ {1, 6, 8, 3}, //
+ {2, 7, 7, 2}, //
+ {3, 8, 6, 1}, //
+ {4, 9, 5, 0}, //
+ {5, 0, 4, 9}, //
+ {6, 1, 3, 8}, //
+ {7, 2, 2, 7}, //
+ {8, 3, 1, 6}, //
+ {9, 4, 0, 5}, //
+ {0, 0, 9, 7}, //
+ {2, 1, 8, 5}, //
+ {4, 3, 6, 3}, //
+ {6, 5, 4, 1}, //
+ {8, 7, 2, 8}, //
+ {1, 0, 9, 6}, //
+ {3, 2, 7, 4}, //
+ {5, 4, 5, 2}, //
+ {7, 6, 3, 5}, //
+ {9, 8, 1, 4}};
+ absl::Duration elapsed_time;
+ int symbols[96 * 20 * 4];
+ for (int run = 0; run < num_runs; ++run) {
+ EntropyDecoder reader(kBytesTestReadSymbol10, kNumBytesTestReadSymbol10,
+ /*allow_update_cdf=*/true);
+ uint16_t cdf[4][11] = {
+ // pmf: 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10
+ {32768 - 3277, 32768 - 6554, 32768 - 9830, 32768 - 13107, 32768 - 16384,
+ 32768 - 19661, 32768 - 22938, 32768 - 26214, 32768 - 29491, 0, 0},
+ // pmf: 3/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 1/20
+ {32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 14746,
+ 32768 - 18022, 32768 - 21299, 32768 - 24576, 32768 - 27853,
+ 32768 - 31130, 0, 0},
+ // pmf: 1/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 3/20
+ {32768 - 1638, 32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 14746,
+ 32768 - 18022, 32768 - 21299, 32768 - 24576, 32768 - 27853, 0, 0},
+ // pmf: 1/20, 2/20, 2/20, 2/20, 3/20, 3/20, 2/20, 2/20, 2/20, 1/20
+ {32768 - 1638, 32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 16384,
+ 32768 - 21299, 32768 - 24576, 32768 - 27853, 32768 - 31130, 0, 0},
+ };
+ const absl::Time start = absl::Now();
+ int index = 0;
+ for (int i = 0; i < 96; ++i) {
+ for (int j = 0; j < 20; ++j) {
+ for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert)
+ if (compile_time) {
+ symbols[index++] = reader.ReadSymbol<10>(cdf[k]);
+ } else {
+ symbols[index++] = reader.ReadSymbol(cdf[k], 10);
+ }
+ }
+ }
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ if (compile_time) {
+ printf("TestReadSymbol10CompileTime(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ } else {
+ printf("TestReadSymbol10(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+
+ int index = 0;
+ for (int i = 0; i < 96; ++i) {
+ for (int j = 0; j < 20; ++j) { // NOLINT(modernize-loop-convert)
+ for (int k = 0; k < 4; ++k) {
+ ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+ }
+ }
+ }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol11(int num_runs) {
+ static constexpr int kSymbols[22][4] = {{0, 6, 10, 5}, //
+ {1, 7, 9, 4}, //
+ {2, 8, 8, 3}, //
+ {3, 9, 7, 2}, //
+ {4, 10, 6, 1}, //
+ {5, 0, 5, 0}, //
+ {6, 1, 4, 10}, //
+ {7, 2, 3, 9}, //
+ {8, 3, 2, 8}, //
+ {9, 4, 1, 7}, //
+ {10, 5, 0, 6}, //
+ {0, 0, 10, 9}, //
+ {2, 1, 8, 7}, //
+ {4, 3, 6, 5}, //
+ {6, 5, 4, 3}, //
+ {8, 7, 2, 1}, //
+ {10, 9, 10, 8}, //
+ {1, 0, 9, 6}, //
+ {3, 2, 7, 4}, //
+ {5, 4, 5, 2}, //
+ {7, 6, 3, 5}, //
+ {9, 8, 1, 5}};
+ absl::Duration elapsed_time;
+ int symbols[96 * 22 * 4];
+ for (int run = 0; run < num_runs; ++run) {
+ EntropyDecoder reader(kBytesTestReadSymbol11, kNumBytesTestReadSymbol11,
+ /*allow_update_cdf=*/true);
+ uint16_t cdf[4][12] = {
+ // pmf: 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11
+ {32768 - 2979, 32768 - 5958, 32768 - 8937, 32768 - 11916, 32768 - 14895,
+ 32768 - 17873, 32768 - 20852, 32768 - 23831, 32768 - 26810,
+ 32768 - 29789, 0, 0},
+ // pmf: 3/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 1/22
+ {32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405,
+ 32768 - 16384, 32768 - 19363, 32768 - 22342, 32768 - 25321,
+ 32768 - 28300, 32768 - 31279, 0, 0},
+ // pmf: 1/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 3/22
+ {32768 - 1489, 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405,
+ 32768 - 16384, 32768 - 19363, 32768 - 22342, 32768 - 25321,
+ 32768 - 28300, 0, 0},
+ // pmf: 1/22, 2/22, 2/22, 2/22, 2/22, 4/22, 2/22, 2/22, 2/22, 2/22, 1/22
+ {32768 - 1489, 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405,
+ 32768 - 19363, 32768 - 22342, 32768 - 25321, 32768 - 28300,
+ 32768 - 31279, 0, 0},
+ };
+ const absl::Time start = absl::Now();
+ int index = 0;
+ for (int i = 0; i < 96; ++i) {
+ for (int j = 0; j < 22; ++j) {
+ for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert)
+ if (compile_time) {
+ symbols[index++] = reader.ReadSymbol<11>(cdf[k]);
+ } else {
+ symbols[index++] = reader.ReadSymbol(cdf[k], 11);
+ }
+ }
+ }
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ if (compile_time) {
+ printf("TestReadSymbol11CompileTime(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ } else {
+ printf("TestReadSymbol11(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+
+ int index = 0;
+ for (int i = 0; i < 96; ++i) {
+ for (int j = 0; j < 22; ++j) { // NOLINT(modernize-loop-convert)
+ for (int k = 0; k < 4; ++k) {
+ ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+ }
+ }
+ }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol12(int num_runs) {
+ static constexpr int kSymbols[24][4] = {{0, 6, 11, 5}, //
+ {1, 7, 10, 4}, //
+ {2, 8, 9, 3}, //
+ {3, 9, 8, 2}, //
+ {4, 10, 7, 1}, //
+ {5, 11, 6, 0}, //
+ {6, 0, 5, 11}, //
+ {7, 1, 4, 10}, //
+ {8, 2, 3, 9}, //
+ {9, 3, 2, 8}, //
+ {10, 4, 1, 7}, //
+ {11, 5, 0, 6}, //
+ {0, 0, 11, 9}, //
+ {2, 1, 10, 7}, //
+ {4, 3, 8, 5}, //
+ {6, 5, 6, 3}, //
+ {8, 7, 4, 1}, //
+ {10, 9, 2, 10}, //
+ {1, 0, 11, 8}, //
+ {3, 2, 9, 6}, //
+ {5, 4, 7, 4}, //
+ {7, 6, 5, 2}, //
+ {9, 8, 3, 6}, //
+ {11, 10, 1, 5}};
+ absl::Duration elapsed_time;
+ int symbols[80 * 24 * 4];
+ for (int run = 0; run < num_runs; ++run) {
+ EntropyDecoder reader(kBytesTestReadSymbol12, kNumBytesTestReadSymbol12,
+ /*allow_update_cdf=*/true);
+ uint16_t cdf[4][13] = {
+      // pmf: 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12,
+      // 1/12, 1/12
+ {32768 - 2731, 32768 - 5461, 32768 - 8192, 32768 - 10923, 32768 - 13653,
+ 32768 - 16384, 32768 - 19115, 32768 - 21845, 32768 - 24576,
+ 32768 - 27307, 32768 - 30037, 0, 0},
+      // pmf: 3/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24,
+      // 2/24, 1/24
+ {32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288, 32768 - 15019,
+ 32768 - 17749, 32768 - 20480, 32768 - 23211, 32768 - 25941,
+ 32768 - 28672, 32768 - 31403, 0, 0},
+      // pmf: 1/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24,
+      // 2/24, 3/24
+ {32768 - 1365, 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288,
+ 32768 - 15019, 32768 - 17749, 32768 - 20480, 32768 - 23211,
+ 32768 - 25941, 32768 - 28672, 0, 0},
+      // pmf: 1/24, 2/24, 2/24, 2/24, 2/24, 3/24, 3/24, 2/24, 2/24, 2/24,
+      // 2/24, 1/24
+ {32768 - 1365, 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288,
+ 32768 - 16384, 32768 - 20480, 32768 - 23211, 32768 - 25941,
+ 32768 - 28672, 32768 - 31403, 0, 0},
+ };
+ const absl::Time start = absl::Now();
+ int index = 0;
+ for (int i = 0; i < 80; ++i) {
+ for (int j = 0; j < 24; ++j) {
+ for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert)
+ if (compile_time) {
+ symbols[index++] = reader.ReadSymbol<12>(cdf[k]);
+ } else {
+ symbols[index++] = reader.ReadSymbol(cdf[k], 12);
+ }
+ }
+ }
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ if (compile_time) {
+ printf("TestReadSymbol12CompileTime(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ } else {
+ printf("TestReadSymbol12(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+
+ int index = 0;
+ for (int i = 0; i < 80; ++i) {
+ for (int j = 0; j < 24; ++j) { // NOLINT(modernize-loop-convert)
+ for (int k = 0; k < 4; ++k) {
+ ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+ }
+ }
+ }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol13(int num_runs) {
+ static constexpr int kSymbols[26][4] = {{0, 6, 12, 5}, //
+ {1, 7, 11, 4}, //
+ {2, 8, 10, 3}, //
+ {3, 9, 9, 2}, //
+ {4, 10, 8, 1}, //
+ {5, 11, 7, 0}, //
+ {6, 12, 6, 12}, //
+ {7, 0, 5, 11}, //
+ {8, 1, 4, 10}, //
+ {9, 2, 3, 9}, //
+ {10, 3, 2, 8}, //
+ {11, 4, 1, 7}, //
+ {12, 5, 0, 6}, //
+ {0, 0, 12, 11}, //
+ {2, 1, 10, 9}, //
+ {4, 3, 8, 7}, //
+ {6, 5, 6, 5}, //
+ {8, 7, 4, 3}, //
+ {10, 9, 2, 1}, //
+ {12, 11, 12, 10}, //
+ {1, 0, 11, 8}, //
+ {3, 2, 9, 6}, //
+ {5, 4, 7, 4}, //
+ {7, 6, 5, 2}, //
+ {9, 8, 3, 6}, //
+ {11, 10, 1, 6}};
+ absl::Duration elapsed_time;
+ int symbols[64 * 26 * 4];
+ for (int run = 0; run < num_runs; ++run) {
+ EntropyDecoder reader(kBytesTestReadSymbol13, kNumBytesTestReadSymbol13,
+ /*allow_update_cdf=*/true);
+ uint16_t cdf[4][14] = {
+ // pmf: 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13,
+ // 1/13, 1/13, 1/13
+ {32768 - 2521, 32768 - 5041, 32768 - 7562, 32768 - 10082, 32768 - 12603,
+ 32768 - 15124, 32768 - 17644, 32768 - 20165, 32768 - 22686,
+ 32768 - 25206, 32768 - 27727, 32768 - 30247, 0, 0},
+ // pmf: 3/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26,
+ // 2/26, 2/26, 1/26
+ {32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343, 32768 - 13863,
+ 32768 - 16384, 32768 - 18905, 32768 - 21425, 32768 - 23946,
+ 32768 - 26466, 32768 - 28987, 32768 - 31508, 0, 0},
+ // pmf: 1/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26,
+ // 2/26, 2/26, 3/26
+ {32768 - 1260, 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343,
+ 32768 - 13863, 32768 - 16384, 32768 - 18905, 32768 - 21425,
+ 32768 - 23946, 32768 - 26466, 32768 - 28987, 0, 0},
+ // pmf: 1/26, 2/26, 2/26, 2/26, 2/26, 2/26, 4/26, 2/26, 2/26, 2/26,
+ // 2/26, 2/26, 1/26
+ {32768 - 1260, 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343,
+ 32768 - 13863, 32768 - 18905, 32768 - 21425, 32768 - 23946,
+ 32768 - 26466, 32768 - 28987, 32768 - 31508, 0, 0},
+ };
+ const absl::Time start = absl::Now();
+ int index = 0;
+ for (int i = 0; i < 64; ++i) {
+ for (int j = 0; j < 26; ++j) {
+ for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert)
+ if (compile_time) {
+ symbols[index++] = reader.ReadSymbol<13>(cdf[k]);
+ } else {
+ symbols[index++] = reader.ReadSymbol(cdf[k], 13);
+ }
+ }
+ }
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ if (compile_time) {
+ printf("TestReadSymbol13CompileTime(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ } else {
+ printf("TestReadSymbol13(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+
+ int index = 0;
+ for (int i = 0; i < 64; ++i) {
+ for (int j = 0; j < 26; ++j) { // NOLINT(modernize-loop-convert)
+ for (int k = 0; k < 4; ++k) {
+ ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+ }
+ }
+ }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol14(int num_runs) {
+ static constexpr int kSymbols[28][4] = {{0, 7, 13, 6}, //
+ {1, 8, 12, 5}, //
+ {2, 9, 11, 4}, //
+ {3, 10, 10, 3}, //
+ {4, 11, 9, 2}, //
+ {5, 12, 8, 1}, //
+ {6, 13, 7, 0}, //
+ {7, 0, 6, 13}, //
+ {8, 1, 5, 12}, //
+ {9, 2, 4, 11}, //
+ {10, 3, 3, 10}, //
+ {11, 4, 2, 9}, //
+ {12, 5, 1, 8}, //
+ {13, 6, 0, 7}, //
+ {0, 0, 13, 11}, //
+ {2, 1, 12, 9}, //
+ {4, 3, 10, 7}, //
+ {6, 5, 8, 5}, //
+ {8, 7, 6, 3}, //
+ {10, 9, 4, 1}, //
+ {12, 11, 2, 12}, //
+ {1, 0, 13, 10}, //
+ {3, 2, 11, 8}, //
+ {5, 4, 9, 6}, //
+ {7, 6, 7, 4}, //
+ {9, 8, 5, 2}, //
+ {11, 10, 3, 7}, //
+ {13, 12, 1, 6}};
+ absl::Duration elapsed_time;
+ int symbols[64 * 28 * 4];
+ for (int run = 0; run < num_runs; ++run) {
+ EntropyDecoder reader(kBytesTestReadSymbol14, kNumBytesTestReadSymbol14,
+ /*allow_update_cdf=*/true);
+ uint16_t cdf[4][15] = {
+ // pmf: 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14,
+ // 1/14, 1/14, 1/14, 1/14
+ {32768 - 2341, 32768 - 4681, 32768 - 7022, 32768 - 9362, 32768 - 11703,
+ 32768 - 14043, 32768 - 16384, 32768 - 18725, 32768 - 21065,
+ 32768 - 23406, 32768 - 25746, 32768 - 28087, 32768 - 30427, 0, 0},
+ // pmf: 3/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28,
+ // 2/28, 2/28, 2/28, 1/28
+ {32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533, 32768 - 12873,
+ 32768 - 15214, 32768 - 17554, 32768 - 19895, 32768 - 22235,
+ 32768 - 24576, 32768 - 26917, 32768 - 29257, 32768 - 31598, 0, 0},
+ // pmf: 1/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28,
+ // 2/28, 2/28, 2/28, 3/28
+ {32768 - 1170, 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533,
+ 32768 - 12873, 32768 - 15214, 32768 - 17554, 32768 - 19895,
+ 32768 - 22235, 32768 - 24576, 32768 - 26917, 32768 - 29257, 0, 0},
+ // pmf: 1/28, 2/28, 2/28, 2/28, 2/28, 2/28, 3/28, 3/28, 2/28, 2/28,
+ // 2/28, 2/28, 2/28, 1/28
+ {32768 - 1170, 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533,
+ 32768 - 12873, 32768 - 16384, 32768 - 19895, 32768 - 22235,
+ 32768 - 24576, 32768 - 26917, 32768 - 29257, 32768 - 31598, 0, 0},
+ };
+ const absl::Time start = absl::Now();
+ int index = 0;
+ for (int i = 0; i < 64; ++i) {
+ for (int j = 0; j < 28; ++j) {
+ for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert)
+ if (compile_time) {
+ symbols[index++] = reader.ReadSymbol<14>(cdf[k]);
+ } else {
+ symbols[index++] = reader.ReadSymbol(cdf[k], 14);
+ }
+ }
+ }
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ if (compile_time) {
+ printf("TestReadSymbol14CompileTime(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ } else {
+ printf("TestReadSymbol14(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+
+ int index = 0;
+ for (int i = 0; i < 64; ++i) {
+ for (int j = 0; j < 28; ++j) { // NOLINT(modernize-loop-convert)
+ for (int k = 0; k < 4; ++k) {
+ ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+ }
+ }
+ }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol16(int num_runs) {
+ static constexpr int kSymbols[32][4] = {{0, 8, 15, 7}, //
+ {1, 9, 14, 6}, //
+ {2, 10, 13, 5}, //
+ {3, 11, 12, 4}, //
+ {4, 12, 11, 3}, //
+ {5, 13, 10, 2}, //
+ {6, 14, 9, 1}, //
+ {7, 15, 8, 0}, //
+ {8, 0, 7, 15}, //
+ {9, 1, 6, 14}, //
+ {10, 2, 5, 13}, //
+ {11, 3, 4, 12}, //
+ {12, 4, 3, 11}, //
+ {13, 5, 2, 10}, //
+ {14, 6, 1, 9}, //
+ {15, 7, 0, 8}, //
+ {0, 0, 15, 13}, //
+ {2, 1, 14, 11}, //
+ {4, 3, 12, 9}, //
+ {6, 5, 10, 7}, //
+ {8, 7, 8, 5}, //
+ {10, 9, 6, 3}, //
+ {12, 11, 4, 1}, //
+ {14, 13, 2, 14}, //
+ {1, 0, 15, 12}, //
+ {3, 2, 13, 10}, //
+ {5, 4, 11, 8}, //
+ {7, 6, 9, 6}, //
+ {9, 8, 7, 4}, //
+ {11, 10, 5, 2}, //
+ {13, 12, 3, 8}, //
+ {15, 14, 1, 7}};
+ absl::Duration elapsed_time;
+ int symbols[48 * 32 * 4];
+ for (int run = 0; run < num_runs; ++run) {
+ EntropyDecoder reader(kBytesTestReadSymbol16, kNumBytesTestReadSymbol16,
+ /*allow_update_cdf=*/true);
+ uint16_t cdf[4][17] = {
+ // pmf: 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16,
+ // 1/16, 1/16, 1/16, 1/16, 1/16, 1/16
+ {32768 - 2048, 32768 - 4096, 32768 - 6144, 32768 - 8192, 32768 - 10240,
+ 32768 - 12288, 32768 - 14336, 32768 - 16384, 32768 - 18432,
+ 32768 - 20480, 32768 - 22528, 32768 - 24576, 32768 - 26624,
+ 32768 - 28672, 32768 - 30720, 0, 0},
+ // pmf: 3/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32,
+ // 2/32, 2/32, 2/32, 2/32, 2/32, 1/32
+ {32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216, 32768 - 11264,
+ 32768 - 13312, 32768 - 15360, 32768 - 17408, 32768 - 19456,
+ 32768 - 21504, 32768 - 23552, 32768 - 25600, 32768 - 27648,
+ 32768 - 29696, 32768 - 31744, 0, 0},
+ // pmf: 1/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32,
+ // 2/32, 2/32, 2/32, 2/32, 2/32, 3/32
+ {32768 - 1024, 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216,
+ 32768 - 11264, 32768 - 13312, 32768 - 15360, 32768 - 17408,
+ 32768 - 19456, 32768 - 21504, 32768 - 23552, 32768 - 25600,
+ 32768 - 27648, 32768 - 29696, 0, 0},
+ // pmf: 1/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 3/32, 3/32, 2/32,
+ // 2/32, 2/32, 2/32, 2/32, 2/32, 1/32
+ {32768 - 1024, 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216,
+ 32768 - 11264, 32768 - 13312, 32768 - 16384, 32768 - 19456,
+ 32768 - 21504, 32768 - 23552, 32768 - 25600, 32768 - 27648,
+ 32768 - 29696, 32768 - 31744, 0, 0},
+ };
+ const absl::Time start = absl::Now();
+ int index = 0;
+ for (int i = 0; i < 48; ++i) {
+ for (int j = 0; j < 32; ++j) {
+ for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert)
+ if (compile_time) {
+ symbols[index++] = reader.ReadSymbol<16>(cdf[k]);
+ } else {
+ symbols[index++] = reader.ReadSymbol(cdf[k], 16);
+ }
+ }
+ }
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ if (compile_time) {
+ printf("TestReadSymbol16CompileTime(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ } else {
+ printf("TestReadSymbol16(%d): %5d us\n", num_runs,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+
+ int index = 0;
+ for (int i = 0; i < 48; ++i) {
+ for (int j = 0; j < 32; ++j) { // NOLINT(modernize-loop-convert)
+ for (int k = 0; k < 4; ++k) {
+ ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+ }
+ }
+ }
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbolBoolean) {
+ TestReadSymbolBoolean</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbolBooleanCompileTime) {
+ TestReadSymbolBoolean</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol3) {
+ TestReadSymbol3</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol3CompileTime) {
+ TestReadSymbol3</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol4) {
+ TestReadSymbol4</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol4CompileTime) {
+ TestReadSymbol4</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol5) {
+ TestReadSymbol5</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol5CompileTime) {
+ TestReadSymbol5</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol6) {
+ TestReadSymbol6</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol6CompileTime) {
+ TestReadSymbol6</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol7) {
+ TestReadSymbol7</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol7CompileTime) {
+ TestReadSymbol7</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol8) {
+ TestReadSymbol8</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol8CompileTime) {
+ TestReadSymbol8</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol9) {
+ TestReadSymbol9</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol9CompileTime) {
+ TestReadSymbol9</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol10) {
+ TestReadSymbol10</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol10CompileTime) {
+ TestReadSymbol10</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol11) {
+ TestReadSymbol11</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol11CompileTime) {
+ TestReadSymbol11</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol12) {
+ TestReadSymbol12</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol12CompileTime) {
+ TestReadSymbol12</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol13) {
+ TestReadSymbol13</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol13CompileTime) {
+ TestReadSymbol13</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol14) {
+ TestReadSymbol14</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol14CompileTime) {
+ TestReadSymbol14</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol16) {
+ TestReadSymbol16</*compile_time=*/false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol16CompileTime) {
+ TestReadSymbol16</*compile_time=*/true>(1);
+}
+
+TEST_F(EntropyDecoderTest, DISABLED_Speed) {
+ // compile_time=true is only tested for those symbol_count values that have
+ // an instantiation of the EntropyDecoder::ReadSymbol<symbol_count> template
+ // method.
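+  // This test is disabled by default (DISABLED_ prefix); run with
+  // --gtest_also_run_disabled_tests to collect timings.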
+ TestReadSymbolBoolean</*compile_time=*/false>(10000);
+ TestReadSymbolBoolean</*compile_time=*/true>(10000);
+ TestReadSymbol3</*compile_time=*/false>(5000);
+ TestReadSymbol3</*compile_time=*/true>(5000);
+ TestReadSymbol4</*compile_time=*/false>(2000);
+ TestReadSymbol4</*compile_time=*/true>(2000);
+ TestReadSymbol5</*compile_time=*/false>(5000);
+ TestReadSymbol5</*compile_time=*/true>(5000);
+ TestReadSymbol6</*compile_time=*/false>(5000);
+ TestReadSymbol6</*compile_time=*/true>(5000);
+ TestReadSymbol7</*compile_time=*/false>(1000);
+ TestReadSymbol7</*compile_time=*/true>(1000);
+ TestReadSymbol8</*compile_time=*/false>(1000);
+ TestReadSymbol8</*compile_time=*/true>(1000);
+ TestReadSymbol9</*compile_time=*/false>(5000);
+ TestReadSymbol9</*compile_time=*/true>(5000);
+ TestReadSymbol10</*compile_time=*/false>(5000);
+ TestReadSymbol10</*compile_time=*/true>(5000);
+ TestReadSymbol11</*compile_time=*/false>(5000);
+ TestReadSymbol11</*compile_time=*/true>(5000);
+ TestReadSymbol12</*compile_time=*/false>(5000);
+ TestReadSymbol12</*compile_time=*/true>(5000);
+ TestReadSymbol13</*compile_time=*/false>(5000);
+ TestReadSymbol13</*compile_time=*/true>(5000);
+ TestReadSymbol14</*compile_time=*/false>(5000);
+ TestReadSymbol14</*compile_time=*/true>(5000);
+ TestReadSymbol16</*compile_time=*/false>(5000);
+ TestReadSymbol16</*compile_time=*/true>(5000);
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The kBytesTestReadSymbolBoolean[] array was encoded using the following
+// libaom code:
+//
+// aom_cdf_prob cdf[4][3] = {
+// { 16384, 0, 0 },
+// { 32768 - 8386, 0, 0 },
+// { 32768 - 24312, 0, 0 },
+// { 16384, 0, 0 },
+// };
+// constexpr int kSymbols[4][4] = { { 0, 0, 1, 1 }, //
+// { 0, 1, 1, 0 }, //
+// { 1, 0, 1, 0 }, //
+// { 1, 0, 0, 1 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 1024; ++i) {
+// for (int j = 0; j < 4; ++j) {
+// for (int k = 0; k < 4; ++k) {
+// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 2);
+// }
+// }
+// }
+// aom_stop_encode(&bw);
+// printf(" constexpr size_t kNumBytesTestReadSymbolBoolean = %u;\n", bw.pos);
+// printf(" constexpr uint8_t kBytesTestReadSymbolBoolean[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+// if (count++ % 12 == 0) {
+// printf("\n ");
+// } else {
+// printf(" ");
+// }
+// printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n };\n");
+
+constexpr size_t kNumBytesTestReadSymbolBoolean = 1880;
+constexpr uint8_t kBytesTestReadSymbolBoolean[] = {
+ 0x1e, 0xfe, 0x7c, 0xa2, 0x1e, 0xfc, 0xa1, 0x17, 0xee, 0xbf, 0x07, 0x76,
+ 0x2d, 0x11, 0x3a, 0xa5, 0x49, 0x65, 0xbb, 0x83, 0x89, 0x4b, 0xaa, 0x23,
+ 0x29, 0x0d, 0x81, 0x9f, 0x6a, 0xf2, 0x9f, 0x7e, 0x14, 0x9a, 0x86, 0x78,
+ 0x7f, 0xd5, 0x31, 0x14, 0x45, 0x8e, 0xf5, 0xc3, 0x36, 0x63, 0xcb, 0x4f,
+ 0xeb, 0x81, 0x19, 0x75, 0x3c, 0xda, 0x21, 0x71, 0x1d, 0x05, 0x34, 0x7e,
+ 0x43, 0xd4, 0x5b, 0xeb, 0x0a, 0x6d, 0xbe, 0xd2, 0x8f, 0xa5, 0x8f, 0xac,
+ 0x3b, 0x43, 0xb6, 0x8a, 0xf9, 0x86, 0xf7, 0x1a, 0x3c, 0x4b, 0x2b, 0x4c,
+ 0x4c, 0x4a, 0xff, 0xb9, 0x6f, 0x3c, 0xeb, 0xf6, 0x4c, 0xc8, 0x3c, 0x01,
+ 0x5f, 0x12, 0x76, 0x4f, 0x88, 0xa0, 0xa5, 0xe7, 0x1d, 0xb3, 0x97, 0xd8,
+ 0x31, 0x90, 0x8f, 0xd1, 0x46, 0xfd, 0xf7, 0xb1, 0x02, 0x0d, 0xf3, 0x9e,
+ 0xbe, 0xa2, 0xfb, 0xc2, 0x7e, 0xe8, 0x77, 0xff, 0xa8, 0x13, 0x59, 0xcd,
+ 0xba, 0xe7, 0xc2, 0x7e, 0xe8, 0x77, 0xff, 0xa8, 0x0e, 0xc3, 0x7b, 0x63,
+ 0x80, 0xfe, 0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e,
+ 0xbd, 0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd,
+ 0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33,
+ 0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8,
+ 0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30,
+ 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30, 0x37,
+ 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30, 0x37, 0xeb,
+ 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3,
+ 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e,
+ 0x85, 0x13, 0x83, 0xe9, 0x58, 0xaf, 0xe8, 0xff, 0x03, 0xb8, 0xf5, 0x08,
+ 0x63, 0x03, 0xea, 0xe9, 0x3a, 0x39, 0x6d, 0xb6, 0x32, 0xc5, 0xff, 0xf7,
+ 0x19, 0x19, 0x9c, 0x29, 0x3a, 0xc5, 0x87, 0x27, 0x2d, 0xfa, 0x18, 0x96,
+ 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+ 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+ 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+ 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+ 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+ 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+ 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+ 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+ 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+ 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+ 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+ 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+ 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+ 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+ 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+ 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+ 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+ 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+ 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+ 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+ 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+ 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+ 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+ 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+ 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+ 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+ 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+ 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+ 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+ 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+ 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+ 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+ 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+ 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+ 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+ 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+ 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+ 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+ 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+ 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+ 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+ 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+ 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+ 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+ 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+ 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+ 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+ 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+ 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+ 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+ 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+ 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+ 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+ 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+ 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+ 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+ 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+ 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+ 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+ 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+ 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+ 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+ 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+ 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+ 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+ 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+ 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+ 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+ 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+ 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+ 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+ 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+ 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+ 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+ 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+ 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+ 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+ 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+ 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+ 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+ 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+ 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+ 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+ 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+ 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+ 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+ 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+ 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+ 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+ 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+ 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+ 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+ 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+ 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+ 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+ 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+ 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+ 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+ 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+ 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+ 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+ 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+ 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+ 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+ 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+ 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+ 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+ 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+ 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+ 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+ 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+ 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+ 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+ 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+ 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+ 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+ 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+ 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+ 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+ 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+ 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+ 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13,
+ 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a,
+ 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf,
+ 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1,
+ 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89,
+ 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61,
+ 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35,
+ 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad,
+ 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa,
+ 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18,
+ 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96,
+ 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xac,
+};
+static_assert(sizeof(kBytesTestReadSymbolBoolean) ==
+ kNumBytesTestReadSymbolBoolean,
+ "");
+
+// The kBytesTestReadSymbol3[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][4] = {
+//   // pmf: 1/3, 1/3, 1/3
+// { 32768 - 10923, 32768 - 21845, 0, 0 },
+//   // pmf: 1/6, 2/6, 3/6
+// { 32768 - 5461, 32768 - 16384, 0, 0 },
+//   // pmf: 2/6, 3/6, 1/6
+// { 32768 - 10923, 32768 - 27307, 0, 0 },
+//   // pmf: 3/6, 1/6, 2/6
+// { 32768 - 16384, 32768 - 21845, 0, 0 },
+// };
+// constexpr int kSymbols[6][4] = { { 0, 2, 1, 2 }, //
+// { 1, 1, 2, 1 }, //
+// { 2, 0, 0, 0 }, //
+// { 0, 2, 0, 2 }, //
+// { 1, 2, 1, 0 }, //
+// { 2, 1, 1, 0 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 1024; ++i) {
+// for (int j = 0; j < 6; ++j) {
+// for (int k = 0; k < 4; ++k) {
+// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 3);
+// }
+// }
+// }
+// aom_stop_encode(&bw);
+// printf(" constexpr size_t kNumBytesTestReadSymbol3 = %u;\n", bw.pos);
+// printf(" constexpr uint8_t kBytesTestReadSymbol3[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+// if (count++ % 12 == 0) {
+// printf("\n ");
+// } else {
+// printf(" ");
+// }
+// printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n };\n");
+
+constexpr size_t kNumBytesTestReadSymbol3 = 4646;
+constexpr uint8_t kBytesTestReadSymbol3[] = {
+ 0x4a, 0xf9, 0x1a, 0x00, 0xef, 0x80, 0xd4, 0xcd, 0xc2, 0x55, 0x62, 0x76,
+ 0x3a, 0x60, 0x4e, 0xc9, 0x17, 0x91, 0x86, 0xb0, 0xa0, 0xcb, 0xf7, 0x7e,
+ 0x82, 0x1e, 0x92, 0xd9, 0xe5, 0xff, 0xaa, 0x0b, 0xa4, 0xc1, 0xfa, 0x0d,
+ 0xbe, 0x4f, 0x17, 0x4a, 0xfd, 0xee, 0xb6, 0x9b, 0x57, 0x3e, 0xdb, 0x60,
+ 0x19, 0xd2, 0xee, 0x35, 0x39, 0x73, 0xc9, 0x7b, 0x80, 0xc0, 0x9c, 0x9a,
+ 0xe8, 0x0f, 0x8b, 0xb8, 0x99, 0x02, 0xde, 0x68, 0x97, 0xab, 0xee, 0x2c,
+ 0xa0, 0xb1, 0x7b, 0x8e, 0x8a, 0x69, 0xd5, 0xcd, 0x40, 0x43, 0xa9, 0x4c,
+ 0xd5, 0xac, 0x33, 0x70, 0x64, 0x35, 0xa1, 0x18, 0xde, 0x31, 0x21, 0x2b,
+ 0xa1, 0xd2, 0x87, 0x63, 0x41, 0x4d, 0xd9, 0x0e, 0x17, 0xd8, 0x74, 0x19,
+ 0xbc, 0x33, 0xee, 0xd9, 0x21, 0x22, 0x16, 0xbb, 0x1e, 0x14, 0x46, 0xcf,
+ 0xfa, 0xee, 0xa2, 0xa0, 0xc0, 0x6b, 0xc5, 0xf0, 0xd8, 0x23, 0x6d, 0x20,
+ 0xda, 0x75, 0xff, 0x72, 0x3d, 0x41, 0x51, 0x21, 0x23, 0xa0, 0xce, 0xa0,
+ 0x46, 0xb0, 0x1d, 0x3d, 0xaf, 0x64, 0xf8, 0x57, 0xee, 0x81, 0x55, 0x3a,
+ 0xea, 0xd3, 0x3f, 0x96, 0x52, 0x31, 0xe5, 0xb5, 0x70, 0x01, 0x5a, 0xaf,
+ 0xbc, 0x69, 0x7e, 0x43, 0xdd, 0x2f, 0xe2, 0x40, 0xc7, 0x2d, 0x62, 0x8e,
+ 0xf0, 0x2a, 0xc0, 0x06, 0xe7, 0xe0, 0x63, 0x6e, 0x09, 0xa0, 0x57, 0x83,
+ 0x43, 0x5a, 0xe8, 0xb5, 0xc7, 0x1b, 0xf5, 0xe6, 0x3d, 0x19, 0xeb, 0xfa,
+ 0xda, 0x3d, 0x06, 0x3e, 0xa8, 0x96, 0x09, 0xad, 0x1d, 0xac, 0xf6, 0xef,
+ 0xc7, 0x32, 0x2f, 0x45, 0xe0, 0x4f, 0xa6, 0x9c, 0x2f, 0x66, 0x6b, 0xe3,
+ 0x36, 0xcf, 0x36, 0x41, 0xcb, 0xd9, 0xb8, 0xc3, 0x48, 0xf4, 0x18, 0xfa,
+ 0xa2, 0x58, 0x26, 0xb4, 0x76, 0xb3, 0xdb, 0xbf, 0x1c, 0xc8, 0xbd, 0x19,
+ 0xc1, 0x3e, 0x9a, 0x71, 0x85, 0x52, 0x94, 0x82, 0x48, 0x9c, 0x90, 0xcf,
+ 0x2f, 0xa0, 0xd1, 0x4b, 0x73, 0xcf, 0x73, 0xea, 0x89, 0x60, 0x93, 0xd1,
+ 0xda, 0xcf, 0x74, 0x5b, 0xd3, 0x22, 0xf4, 0x67, 0x04, 0xfa, 0x69, 0xc6,
+ 0x15, 0x4a, 0x52, 0x09, 0x22, 0x72, 0x43, 0x3c, 0xbe, 0x83, 0x45, 0x2d,
+ 0xcf, 0x3d, 0xcf, 0xaa, 0x25, 0x82, 0x4f, 0x47, 0x6b, 0x3d, 0xd1, 0x6f,
+ 0x4c, 0x8b, 0xd1, 0x9c, 0x13, 0xe9, 0xa7, 0x18, 0x55, 0x29, 0x48, 0x24,
+ 0x89, 0xc9, 0x0c, 0xf2, 0xfa, 0x0d, 0x14, 0xb7, 0x3c, 0xf7, 0x3e, 0xa8,
+ 0x96, 0x09, 0x3d, 0x1d, 0xac, 0xf7, 0x45, 0xbd, 0x32, 0x2f, 0x46, 0x70,
+ 0x4f, 0xa6, 0x9c, 0x61, 0x54, 0xa5, 0x20, 0x92, 0x27, 0x24, 0x33, 0xcb,
+ 0xe8, 0x34, 0x52, 0xdc, 0xf3, 0xdc, 0xfa, 0xa2, 0x58, 0x24, 0xf4, 0x76,
+ 0xb3, 0xdd, 0x16, 0xf4, 0xc8, 0xbd, 0x19, 0xc1, 0x3e, 0x9a, 0x71, 0x85,
+ 0x52, 0x94, 0x82, 0x48, 0x9c, 0x90, 0xcf, 0x2f, 0xa0, 0xd1, 0x4b, 0x73,
+ 0xcf, 0x73, 0xea, 0x89, 0x60, 0x93, 0xd1, 0xda, 0xcf, 0x74, 0x5b, 0xd3,
+ 0x22, 0xf4, 0x67, 0x04, 0xfa, 0x69, 0xc6, 0x15, 0x4a, 0x52, 0x09, 0x22,
+ 0x72, 0x43, 0x3c, 0xbe, 0x83, 0x45, 0x2d, 0xcf, 0x3d, 0xcf, 0xaa, 0x25,
+ 0x84, 0xaa, 0xde, 0xde, 0xba, 0x7e, 0x90, 0x92, 0xa0, 0xdc, 0xb3, 0x6c,
+ 0xaf, 0xe6, 0x2f, 0xeb, 0xc5, 0x33, 0xe7, 0x77, 0xcf, 0xda, 0xe7, 0x31,
+ 0x57, 0xb2, 0x8f, 0xde, 0x8f, 0x1d, 0xf4, 0xd3, 0x8c, 0xda, 0x94, 0xa4,
+ 0x12, 0xcd, 0xc9, 0x32, 0x6d, 0xf7, 0x2d, 0x0c, 0x2c, 0xf9, 0xd8, 0x0b,
+ 0x48, 0xf3, 0xb3, 0x2e, 0x80, 0xd7, 0x0a, 0xc4, 0x4f, 0x09, 0xfe, 0x84,
+ 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4,
+ 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8,
+ 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a,
+ 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67,
+ 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09,
+ 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c,
+ 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef,
+ 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01,
+ 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35,
+ 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01,
+ 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8,
+ 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8,
+ 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54,
+ 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d,
+ 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0,
+ 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a,
+ 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52,
+ 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41,
+ 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b,
+ 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39,
+ 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09,
+ 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58,
+ 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d,
+ 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82,
+ 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad,
+ 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43,
+ 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7,
+ 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d,
+ 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f,
+ 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07,
+ 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65,
+ 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf,
+ 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78,
+ 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca,
+ 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9,
+ 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58,
+ 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8,
+ 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb,
+ 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d,
+ 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1,
+ 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06,
+ 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0,
+ 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56,
+ 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e,
+ 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10,
+ 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3,
+ 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60,
+ 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a,
+ 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f,
+ 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25,
+ 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3,
+ 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf,
+ 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07,
+ 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5,
+ 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04,
+ 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1,
+ 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3,
+ 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50,
+ 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5,
+ 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0,
+ 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8,
+ 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a,
+ 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04,
+ 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f,
+ 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5,
+ 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27,
+ 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60,
+ 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36,
+ 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a,
+ 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7,
+ 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c,
+ 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde,
+ 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75,
+ 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe,
+ 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d,
+ 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97,
+ 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc,
+ 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0,
+ 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b,
+ 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5,
+ 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63,
+ 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1,
+ 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d,
+ 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6,
+ 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06,
+ 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19,
+ 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2,
+ 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b,
+ 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb,
+ 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40,
+ 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d,
+ 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80,
+ 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa,
+ 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e,
+ 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95,
+ 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf,
+ 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc,
+ 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e,
+ 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54,
+ 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10,
+ 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86,
+ 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e,
+ 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42,
+ 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6,
+ 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03,
+ 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0,
+ 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b,
+ 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10,
+ 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd,
+ 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97,
+ 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f,
+ 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81,
+ 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9,
+ 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b,
+ 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde,
+ 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32,
+ 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a,
+ 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6,
+ 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa,
+ 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76,
+ 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f,
+ 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0,
+ 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81,
+ 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac,
+ 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95,
+ 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f,
+ 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84,
+ 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4,
+ 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8,
+ 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a,
+ 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67,
+ 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09,
+ 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c,
+ 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef,
+ 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01,
+ 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35,
+ 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01,
+ 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8,
+ 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8,
+ 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54,
+ 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d,
+ 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0,
+ 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a,
+ 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52,
+ 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41,
+ 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b,
+ 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39,
+ 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09,
+ 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58,
+ 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d,
+ 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82,
+ 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad,
+ 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43,
+ 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7,
+ 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d,
+ 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f,
+ 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07,
+ 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65,
+ 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf,
+ 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78,
+ 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca,
+ 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9,
+ 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58,
+ 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8,
+ 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb,
+ 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d,
+ 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1,
+ 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06,
+ 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0,
+ 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56,
+ 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e,
+ 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10,
+ 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3,
+ 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60,
+ 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a,
+ 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f,
+ 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25,
+ 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3,
+ 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf,
+ 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07,
+ 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5,
+ 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04,
+ 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1,
+ 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3,
+ 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50,
+ 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5,
+ 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0,
+ 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8,
+ 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a,
+ 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04,
+ 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f,
+ 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5,
+ 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27,
+ 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60,
+ 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36,
+ 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a,
+ 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7,
+ 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c,
+ 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde,
+ 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75,
+ 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe,
+ 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d,
+ 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97,
+ 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc,
+ 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0,
+ 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b,
+ 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5,
+ 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63,
+ 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1,
+ 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d,
+ 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6,
+ 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06,
+ 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19,
+ 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2,
+ 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b,
+ 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb,
+ 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40,
+ 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d,
+ 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80,
+ 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa,
+ 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e,
+ 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95,
+ 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf,
+ 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc,
+ 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e,
+ 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54,
+ 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10,
+ 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86,
+ 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e,
+ 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42,
+ 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6,
+ 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03,
+ 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0,
+ 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b,
+ 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10,
+ 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd,
+ 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97,
+ 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f,
+ 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81,
+ 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9,
+ 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b,
+ 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde,
+ 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32,
+ 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a,
+ 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6,
+ 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa,
+ 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76,
+ 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f,
+ 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0,
+ 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81,
+ 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac,
+ 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95,
+ 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f,
+ 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84,
+ 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4,
+ 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8,
+ 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a,
+ 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67,
+ 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09,
+ 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c,
+ 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef,
+ 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01,
+ 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35,
+ 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01,
+ 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8,
+ 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8,
+ 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54,
+ 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d,
+ 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0,
+ 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a,
+ 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52,
+ 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41,
+ 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b,
+ 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39,
+ 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09,
+ 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58,
+ 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d,
+ 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82,
+ 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad,
+ 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43,
+ 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7,
+ 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d,
+ 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f,
+ 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07,
+ 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65,
+ 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf,
+ 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78,
+ 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca,
+ 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9,
+ 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58,
+ 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8,
+ 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb,
+ 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d,
+ 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1,
+ 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06,
+ 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0,
+ 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56,
+ 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e,
+ 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10,
+ 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3,
+ 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60,
+ 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a,
+ 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f,
+ 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25,
+ 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3,
+ 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf,
+ 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07,
+ 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5,
+ 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04,
+ 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1,
+ 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3,
+ 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50,
+ 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5,
+ 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0,
+ 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8,
+ 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a,
+ 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04,
+ 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f,
+ 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5,
+ 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27,
+ 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60,
+ 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36,
+ 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a,
+ 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7,
+ 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c,
+ 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde,
+ 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75,
+ 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe,
+ 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d,
+ 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97,
+ 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc,
+ 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0,
+ 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b,
+ 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5,
+ 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63,
+ 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1,
+ 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d,
+ 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6,
+ 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06,
+ 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19,
+ 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2,
+ 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b,
+ 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb,
+ 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40,
+ 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d,
+ 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80,
+ 0x41, 0x08,
+};
+static_assert(sizeof(kBytesTestReadSymbol3) == kNumBytesTestReadSymbol3, "");
+
+// The kBytesTestReadSymbol4[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][5] = {
+// // pdf: 1/4, 1/4, 1/4, 1/4
+// { 32768 - 8192, 32768 - 16384, 32768 - 24576, 0, 0 },
+// // pdf: 2/8, 1/8, 2/8, 3/8
+// { 32768 - 8192, 32768 - 12288, 32768 - 20480, 0, 0 },
+// // pdf: 1/4, 1/4, 1/4, 1/4
+// { 32768 - 8192, 32768 - 16384, 32768 - 24576, 0, 0 },
+// // pdf: 2/8, 3/8, 2/8, 1/8
+// { 32768 - 8192, 32768 - 20480, 32768 - 28672, 0, 0 },
+// };
+// constexpr int kSymbols[8][4] = { { 0, 0, 3, 3 }, //
+// { 0, 0, 2, 2 }, //
+// { 1, 1, 0, 0 }, //
+// { 1, 2, 1, 1 }, //
+// { 2, 2, 3, 2 }, //
+// { 2, 3, 2, 1 }, //
+// { 3, 3, 0, 0 }, //
+// { 3, 3, 1, 1 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 1024; ++i) {
+// for (int j = 0; j < 8; ++j) {
+// for (int k = 0; k < 4; ++k) {
+// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 4);
+// }
+// }
+// }
+// aom_stop_encode(&bw);
+// printf(" constexpr size_t kNumBytesTestReadSymbol4 = %u;\n", bw.pos);
+// printf(" constexpr uint8_t kBytesTestReadSymbol4[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+// if (count++ % 12 == 0) {
+// printf("\n ");
+// } else {
+// printf(" ");
+// }
+// printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n };\n");
+
+constexpr size_t kNumBytesTestReadSymbol4 = 8055;
+constexpr uint8_t kBytesTestReadSymbol4[] = {
+ 0x0f, 0x9b, 0x2a, 0xf6, 0x38, 0x26, 0xa1, 0xd1, 0x82, 0x5f, 0x34, 0xb5,
+ 0xc7, 0xda, 0x9c, 0xd8, 0x8d, 0x4b, 0xbc, 0x5c, 0x0b, 0x8a, 0x7f, 0x6c,
+ 0x46, 0x3f, 0xa2, 0x03, 0xee, 0x1f, 0xea, 0x25, 0xc7, 0xb7, 0xe2, 0xc9,
+ 0x51, 0x0f, 0x7c, 0x0c, 0xe3, 0x7d, 0x7b, 0xe4, 0xbe, 0xde, 0x41, 0x5c,
+ 0x5a, 0xcf, 0xe6, 0x12, 0x50, 0x7b, 0xcc, 0x83, 0x76, 0x61, 0x03, 0x3a,
+ 0x1e, 0x1b, 0xf8, 0x9d, 0x08, 0x96, 0x98, 0x0f, 0x16, 0xac, 0x7c, 0x25,
+ 0x6c, 0xd1, 0xe8, 0xd8, 0xd6, 0x1c, 0xbd, 0x48, 0xa5, 0x3f, 0xd3, 0x21,
+ 0x4c, 0x4e, 0x94, 0xe3, 0xe3, 0xed, 0x30, 0x70, 0xdb, 0x2e, 0x95, 0xd5,
+ 0x7f, 0xfe, 0xed, 0x0e, 0x73, 0xe3, 0x29, 0x09, 0x5f, 0xe3, 0x0e, 0xa6,
+ 0xe7, 0xc6, 0x52, 0x12, 0xba, 0xdb, 0xb5, 0x63, 0xd9, 0xd8, 0xa4, 0x25,
+ 0x75, 0xb7, 0x6a, 0xc7, 0xb3, 0xad, 0x88, 0x46, 0x64, 0x3a, 0x36, 0xb1,
+ 0x2f, 0xb1, 0x03, 0xdb, 0x88, 0x74, 0x6d, 0x62, 0x5f, 0x62, 0x07, 0xb7,
+ 0x10, 0xe8, 0xda, 0xc6, 0x1d, 0x6e, 0x8e, 0x12, 0x58, 0x6e, 0x98, 0x4c,
+ 0xa1, 0x23, 0xc0, 0x9b, 0xb0, 0xdd, 0x31, 0xef, 0x64, 0xf0, 0x91, 0x37,
+ 0x61, 0xba, 0x63, 0xde, 0xc9, 0xe1, 0x22, 0x6e, 0xc3, 0x74, 0xc7, 0xea,
+ 0xcb, 0x70, 0xf6, 0xe2, 0x1d, 0x1b, 0x6c, 0xd5, 0x4f, 0x91, 0xc2, 0x4b,
+ 0x0a, 0xeb, 0xb3, 0x0d, 0x59, 0x39, 0x13, 0x76, 0x15, 0xd7, 0x66, 0x1a,
+ 0xf2, 0x72, 0x26, 0xec, 0x05, 0x3e, 0xcc, 0x31, 0x3e, 0x60, 0x4d, 0xd8,
+ 0x0a, 0x7d, 0x98, 0x62, 0x7c, 0xc0, 0xcc, 0x5a, 0x24, 0xc8, 0xa6, 0xda,
+ 0xe3, 0x09, 0x35, 0x70, 0x9c, 0x4c, 0x85, 0xac, 0x6f, 0x8b, 0x76, 0x30,
+ 0xcc, 0x6f, 0xcb, 0x3e, 0x36, 0xd6, 0xec, 0x61, 0x98, 0xdf, 0x99, 0xa5,
+ 0x7e, 0x2d, 0xd8, 0xc3, 0x31, 0xbf, 0x33, 0x4a, 0xfc, 0x5b, 0xb1, 0x86,
+ 0x63, 0x7e, 0x66, 0x95, 0xf8, 0xb7, 0x63, 0x0c, 0xc6, 0xfc, 0xcd, 0x2b,
+ 0xf1, 0x6e, 0xc6, 0x19, 0x8d, 0xf9, 0x9a, 0x57, 0xe2, 0xdd, 0x8c, 0x33,
+ 0x1b, 0xf3, 0x34, 0xaf, 0xc5, 0xbb, 0x18, 0x66, 0x37, 0xe6, 0x69, 0x5f,
+ 0x8b, 0x76, 0x30, 0xcc, 0x6f, 0xcc, 0xd2, 0xbf, 0x16, 0xec, 0x61, 0x98,
+ 0xdf, 0x99, 0xa5, 0x7e, 0x2d, 0xd1, 0x27, 0xb1, 0xbf, 0x30, 0x0b, 0xfc,
+ 0x5b, 0xa2, 0x4f, 0x63, 0xa0, 0x9b, 0x7a, 0xb6, 0xb7, 0x44, 0x9e, 0xc7,
+ 0x41, 0x36, 0xf5, 0x6d, 0x6e, 0x89, 0x3d, 0x8e, 0x82, 0x6d, 0xea, 0xda,
+ 0xdd, 0x12, 0x7b, 0x1d, 0x04, 0xdb, 0xd5, 0xb5, 0xba, 0x24, 0xf6, 0x3a,
+ 0x09, 0xb7, 0xab, 0x6b, 0x74, 0x49, 0xec, 0x74, 0x13, 0x6f, 0x56, 0xd6,
+ 0xe8, 0x93, 0xd8, 0xe8, 0x26, 0xde, 0xad, 0xad, 0xd1, 0x27, 0xb1, 0xd0,
+ 0x4d, 0xbd, 0x5b, 0x5b, 0xa2, 0x4f, 0x63, 0xa0, 0x9b, 0x7a, 0xb6, 0xb7,
+ 0x44, 0x9e, 0xc7, 0x41, 0x36, 0xf5, 0x6d, 0x6e, 0x89, 0x3d, 0x8e, 0x82,
+ 0x6d, 0xea, 0xda, 0xdd, 0x12, 0x7b, 0x1d, 0x04, 0xdb, 0xd5, 0xb5, 0xba,
+ 0x24, 0xf6, 0x3a, 0x09, 0xb7, 0xab, 0x6b, 0x74, 0x49, 0xec, 0x74, 0x13,
+ 0x6f, 0x56, 0xd6, 0xdf, 0x45, 0xaa, 0x16, 0xb7, 0xb7, 0x14, 0x09, 0xdb,
+ 0x9f, 0x17, 0x97, 0xae, 0xa1, 0xbe, 0x34, 0x9d, 0x0e, 0x01, 0x9f, 0xdb,
+ 0x16, 0xa9, 0x6a, 0x63, 0xf2, 0x9f, 0x5b, 0x3b, 0x0b, 0xae, 0x17, 0xd6,
+ 0x4d, 0x75, 0x8f, 0xe3, 0xf0, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c,
+ 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96,
+ 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f,
+ 0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80,
+ 0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f,
+ 0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d,
+ 0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a,
+ 0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53,
+ 0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91,
+ 0xeb, 0x2c, 0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9,
+ 0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95,
+ 0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3,
+ 0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7,
+ 0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42,
+ 0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7,
+ 0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3,
+ 0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65,
+ 0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8,
+ 0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34,
+ 0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3,
+ 0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38,
+ 0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5,
+ 0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d,
+ 0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08,
+ 0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6,
+ 0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7,
+ 0x46, 0x06, 0x1f, 0x05, 0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2,
+ 0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59,
+ 0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52,
+ 0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81,
+ 0x3b, 0x73, 0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa,
+ 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29,
+ 0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f,
+ 0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0,
+ 0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a,
+ 0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e,
+ 0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65,
+ 0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e,
+ 0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4,
+ 0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8,
+ 0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b,
+ 0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c,
+ 0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e,
+ 0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01,
+ 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07,
+ 0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd,
+ 0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8,
+ 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca,
+ 0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe,
+ 0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab,
+ 0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a,
+ 0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d,
+ 0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09,
+ 0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89,
+ 0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61,
+ 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1,
+ 0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c,
+ 0xb3, 0x7a, 0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4,
+ 0x79, 0x25, 0x46, 0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4,
+ 0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3,
+ 0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a,
+ 0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53,
+ 0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa,
+ 0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94,
+ 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f,
+ 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24,
+ 0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7,
+ 0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd,
+ 0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a,
+ 0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f,
+ 0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e,
+ 0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb,
+ 0x2c, 0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe,
+ 0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19,
+ 0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad,
+ 0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49,
+ 0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad,
+ 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc,
+ 0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50,
+ 0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87,
+ 0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce,
+ 0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5,
+ 0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2,
+ 0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a,
+ 0x95, 0x59, 0x50, 0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14,
+ 0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad,
+ 0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71,
+ 0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1,
+ 0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46,
+ 0x06, 0x1f, 0x05, 0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b,
+ 0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb,
+ 0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab,
+ 0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b,
+ 0x73, 0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5,
+ 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d,
+ 0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0,
+ 0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3,
+ 0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0,
+ 0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00,
+ 0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40,
+ 0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66,
+ 0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd,
+ 0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e,
+ 0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf,
+ 0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15,
+ 0x5e, 0xad, 0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98,
+ 0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4,
+ 0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38,
+ 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84,
+ 0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb,
+ 0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5,
+ 0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32,
+ 0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5,
+ 0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae,
+ 0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe,
+ 0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a,
+ 0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12,
+ 0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f,
+ 0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64,
+ 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3,
+ 0x7a, 0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79,
+ 0x25, 0x46, 0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06,
+ 0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f,
+ 0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b,
+ 0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3,
+ 0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c,
+ 0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f,
+ 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f,
+ 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8,
+ 0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d,
+ 0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba,
+ 0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15,
+ 0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d,
+ 0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a,
+ 0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c,
+ 0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6,
+ 0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7,
+ 0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d,
+ 0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3,
+ 0x54, 0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8,
+ 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d,
+ 0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43,
+ 0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36,
+ 0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a,
+ 0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12,
+ 0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce,
+ 0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95,
+ 0x59, 0x50, 0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09,
+ 0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55,
+ 0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48,
+ 0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd,
+ 0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06,
+ 0x1f, 0x05, 0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6,
+ 0x03, 0xa6, 0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0,
+ 0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a,
+ 0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73,
+ 0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6,
+ 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2,
+ 0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd,
+ 0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0,
+ 0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74,
+ 0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e,
+ 0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39,
+ 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c,
+ 0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7,
+ 0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55,
+ 0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1,
+ 0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e,
+ 0xad, 0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55,
+ 0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f,
+ 0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b,
+ 0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48,
+ 0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a,
+ 0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b,
+ 0x25, 0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65,
+ 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3,
+ 0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0,
+ 0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b,
+ 0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3,
+ 0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e,
+ 0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4,
+ 0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4,
+ 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a,
+ 0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25,
+ 0x46, 0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38,
+ 0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed,
+ 0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0,
+ 0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9,
+ 0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74,
+ 0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59,
+ 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6,
+ 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd,
+ 0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c,
+ 0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e,
+ 0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d,
+ 0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3,
+ 0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82,
+ 0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39,
+ 0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71,
+ 0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8,
+ 0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96,
+ 0x76, 0xfc, 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54,
+ 0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0,
+ 0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a,
+ 0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a,
+ 0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f,
+ 0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30,
+ 0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e,
+ 0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf,
+ 0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59,
+ 0x50, 0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb,
+ 0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d,
+ 0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e,
+ 0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86,
+ 0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f,
+ 0x05, 0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03,
+ 0xa6, 0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00,
+ 0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01,
+ 0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33,
+ 0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e,
+ 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72,
+ 0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff,
+ 0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa,
+ 0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2,
+ 0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23,
+ 0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2,
+ 0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22,
+ 0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8,
+ 0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28,
+ 0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93,
+ 0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad,
+ 0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75,
+ 0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0,
+ 0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6,
+ 0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94,
+ 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e,
+ 0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25,
+ 0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b,
+ 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9,
+ 0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31,
+ 0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff,
+ 0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e,
+ 0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f,
+ 0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3,
+ 0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a,
+ 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f,
+ 0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46,
+ 0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb,
+ 0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2,
+ 0x70, 0xd5, 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab,
+ 0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef,
+ 0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4,
+ 0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61,
+ 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33,
+ 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d,
+ 0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec,
+ 0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a,
+ 0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45,
+ 0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b,
+ 0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c,
+ 0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4,
+ 0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1,
+ 0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92,
+ 0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76,
+ 0xfc, 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa,
+ 0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e,
+ 0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9,
+ 0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43,
+ 0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec,
+ 0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30,
+ 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0,
+ 0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80,
+ 0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50,
+ 0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99,
+ 0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33,
+ 0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13,
+ 0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef,
+ 0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05,
+ 0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6,
+ 0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71,
+ 0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce,
+ 0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61,
+ 0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e,
+ 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9,
+ 0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c,
+ 0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5,
+ 0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab,
+ 0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f,
+ 0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e,
+ 0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44,
+ 0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53,
+ 0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59,
+ 0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c,
+ 0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e,
+ 0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01,
+ 0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf,
+ 0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a,
+ 0xf4, 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4,
+ 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7,
+ 0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23,
+ 0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3,
+ 0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a,
+ 0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7,
+ 0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e,
+ 0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85,
+ 0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf,
+ 0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6,
+ 0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb,
+ 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1,
+ 0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69,
+ 0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67,
+ 0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70,
+ 0xd5, 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a,
+ 0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b,
+ 0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10,
+ 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd,
+ 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e,
+ 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44,
+ 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3,
+ 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5,
+ 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02,
+ 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55,
+ 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52,
+ 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f,
+ 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81,
+ 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5,
+ 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc,
+ 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca,
+ 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc,
+ 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69,
+ 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70,
+ 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37,
+ 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8,
+ 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d,
+ 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03,
+ 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e,
+ 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b,
+ 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71,
+ 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95,
+ 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc,
+ 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57,
+ 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15,
+ 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b,
+ 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12,
+ 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12,
+ 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2,
+ 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42,
+ 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99,
+ 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68,
+ 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8,
+ 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86,
+ 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4,
+ 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7,
+ 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5,
+ 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29,
+ 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde,
+ 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49,
+ 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e,
+ 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb,
+ 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4,
+ 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe,
+ 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d,
+ 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6,
+ 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd,
+ 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33,
+ 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b,
+ 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93,
+ 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b,
+ 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78,
+ 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0,
+ 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e,
+ 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c,
+ 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea,
+ 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65,
+ 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5,
+ 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28,
+ 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a,
+ 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2,
+ 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3,
+ 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c,
+ 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97,
+ 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7,
+ 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56,
+ 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76,
+ 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b,
+ 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b,
+ 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f, 0x61,
+ 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87,
+ 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80,
+ 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00,
+ 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80,
+ 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc,
+ 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b,
+ 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c,
+ 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f,
+ 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a,
+ 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30,
+ 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88,
+ 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70,
+ 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08,
+ 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6,
+ 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a,
+ 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64,
+ 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57, 0xab,
+ 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d,
+ 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc,
+ 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5,
+ 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12, 0x25,
+ 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f,
+ 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9,
+ 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66,
+ 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2,
+ 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c,
+ 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff,
+ 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7,
+ 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7,
+ 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38,
+ 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e,
+ 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f,
+ 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51,
+ 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a,
+ 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74,
+ 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a,
+ 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b,
+ 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35,
+ 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58,
+ 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c,
+ 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f,
+ 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b,
+ 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86,
+ 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51,
+ 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda,
+ 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87,
+ 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d,
+ 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c, 0x74,
+ 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea, 0x24,
+ 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d,
+ 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a,
+ 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13,
+ 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa,
+ 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90,
+ 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb,
+ 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c,
+ 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac,
+ 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0,
+ 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54,
+ 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6,
+ 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c,
+ 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84,
+ 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f, 0x61, 0xbb,
+ 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1,
+ 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9,
+ 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c,
+ 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73,
+ 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8,
+ 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f,
+ 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa,
+ 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3,
+ 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd,
+ 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa,
+ 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf,
+ 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70, 0x97,
+ 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91,
+ 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14,
+ 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16,
+ 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb,
+ 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57, 0xab, 0x47,
+ 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40,
+ 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37,
+ 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6,
+ 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12, 0x25, 0x3d,
+ 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9,
+ 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48,
+ 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4,
+ 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a,
+ 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71,
+ 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb,
+ 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1,
+ 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3,
+ 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9,
+ 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2,
+ 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec,
+ 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51, 0x9a,
+ 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9,
+ 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c,
+ 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda,
+ 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6,
+ 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04,
+ 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73,
+ 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3,
+ 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51,
+ 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c,
+ 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9,
+ 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40,
+ 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5,
+ 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14,
+ 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f,
+ 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c, 0x74, 0x60,
+ 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea, 0x24, 0xbd,
+ 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf,
+ 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2,
+ 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7,
+ 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a,
+ 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc,
+ 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d,
+ 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e,
+ 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07,
+ 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00,
+ 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03,
+ 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66,
+ 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc,
+ 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5,
+ 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff,
+ 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55,
+ 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85,
+ 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46,
+ 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73, 0x84,
+ 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44,
+ 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0,
+ 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50,
+ 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26,
+ 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a,
+ 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea,
+ 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1,
+ 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70, 0x97, 0xad,
+ 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29,
+ 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd,
+ 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a,
+ 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37,
+ 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57, 0xab, 0x47, 0x92,
+ 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40, 0x63,
+ 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe,
+ 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd,
+ 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f,
+ 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7,
+ 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5,
+ 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff,
+ 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c,
+ 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6,
+ 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4,
+ 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56,
+ 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde,
+ 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8,
+ 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3,
+ 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67,
+ 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51, 0x9a, 0x7a,
+ 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9,
+ 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35,
+ 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a,
+ 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6,
+ 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38,
+ 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73, 0x68,
+ 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3,
+ 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25,
+ 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed,
+ 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55,
+ 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d,
+ 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52,
+ 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86,
+ 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8,
+ 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61,
+ 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea, 0x24, 0xbd, 0x60,
+ 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00,
+ 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0,
+ 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33,
+ 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66,
+ 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27,
+ 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf,
+ 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a,
+ 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c,
+ 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2,
+ 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c,
+ 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2,
+ 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d,
+ 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52,
+ 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19,
+ 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea,
+ 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57,
+ 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46, 0xff,
+ 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd,
+ 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89,
+ 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7,
+ 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2,
+ 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59,
+ 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c,
+ 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03,
+ 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf,
+ 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70, 0x97, 0xad, 0x35,
+ 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9,
+ 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e,
+ 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47,
+ 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7,
+ 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57, 0xab, 0x47, 0x92, 0x54,
+ 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40, 0x63, 0x8e,
+ 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd,
+ 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a,
+ 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e,
+ 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d,
+ 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96,
+ 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63,
+ 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3,
+ 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce,
+ 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1,
+ 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4,
+ 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36,
+ 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21,
+ 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b,
+ 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d,
+ 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89,
+ 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67,
+ 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a,
+ 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04,
+ 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa,
+ 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4,
+ 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe,
+ 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03,
+ 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb,
+ 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8,
+ 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95,
+ 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9,
+ 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3,
+ 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1,
+ 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e,
+ 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0,
+ 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a,
+ 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07,
+ 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0, 0x1c,
+ 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36,
+ 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3,
+ 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a,
+ 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8,
+ 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf,
+ 0x56, 0x8f, 0x24,
+};
+static_assert(sizeof(kBytesTestReadSymbol4) == kNumBytesTestReadSymbol4, "");
+
+// The kBytesTestReadSymbol5[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][6] = {
+// // pmf: 1/5, 1/5, 1/5, 1/5, 1/5
+// { 32768 - 6554, 32768 - 13107, 32768 - 19661, 32768 - 26214, 0, 0 },
+// // pmf: 3/10, 2/10, 2/10, 2/10, 1/10
+// { 32768 - 9830, 32768 - 16384, 32768 - 22938, 32768 - 29491, 0, 0 },
+// // pmf: 1/10, 2/10, 2/10, 2/10, 3/10
+// { 32768 - 3277, 32768 - 9830, 32768 - 16384, 32768 - 22938, 0, 0 },
+// // pmf: 1/10, 2/10, 4/10, 2/10, 1/10
+// { 32768 - 3277, 32768 - 9830, 32768 - 22938, 32768 - 29491, 0, 0 },
+// };
+// constexpr int kSymbols[10][4] = { { 0, 0, 4, 4 }, //
+// { 0, 1, 3, 3 }, //
+// { 1, 2, 2, 2 }, //
+// { 1, 3, 1, 1 }, //
+// { 2, 4, 0, 0 }, //
+// { 2, 0, 4, 3 }, //
+// { 3, 1, 3, 2 }, //
+// { 3, 2, 2, 1 }, //
+// { 4, 3, 1, 2 }, //
+// { 4, 0, 4, 2 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 320; ++i) {
+// for (int j = 0; j < 10; ++j) {
+// for (int k = 0; k < 4; ++k) {
+// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 5);
+// }
+// }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+// if (count++ % 12 == 0) {
+// printf("\n ");
+// } else {
+// printf(" ");
+// }
+// printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
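+// As a sanity check on the cdf rows above: libaom stores an inverted CDF in
+// Q15, so each entry is 32768 minus the cumulative probability of the symbols
+// up to and including that index. A minimal sketch (not part of the test;
+// pmf/expected are local names used only for illustration) verifying the
+// first row:
+//
+// #include <cassert>
+// int main() {
+//   const double pmf[5] = {0.2, 0.2, 0.2, 0.2, 0.2};  // 1/5 each
+//   const int expected[4] = {6554, 13107, 19661, 26214};
+//   double cum = 0.0;
+//   for (int i = 0; i < 4; ++i) {
+//     cum += pmf[i];
+//     // Round the running total to Q15 (1 << 15 == 32768).
+//     assert(static_cast<int>(cum * 32768 + 0.5) == expected[i]);
+//   }
+//   return 0;
+// }
+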
+constexpr size_t kNumBytesTestReadSymbol5 = 3612;
+constexpr uint8_t kBytesTestReadSymbol5[] = {
+ 0x0f, 0x1c, 0x16, 0x78, 0x6f, 0x83, 0xfe, 0x29, 0x95, 0x9a, 0x42, 0xcc,
+ 0x70, 0x9a, 0x0d, 0x72, 0xe0, 0x7d, 0x63, 0x9e, 0x05, 0x3c, 0x88, 0x22,
+ 0x40, 0x57, 0x83, 0xa8, 0x69, 0x6f, 0xc3, 0xb2, 0x58, 0x6c, 0xa9, 0x41,
+ 0x3c, 0x2f, 0x3f, 0xa3, 0xe6, 0x4e, 0x5e, 0xaf, 0x42, 0x56, 0x9d, 0x3f,
+ 0x70, 0xeb, 0x00, 0x02, 0x86, 0x23, 0x5f, 0x8e, 0x1b, 0x35, 0x71, 0x7d,
+ 0x50, 0xbe, 0xb1, 0x1e, 0xe9, 0x2f, 0x08, 0x5a, 0x04, 0xc0, 0x7b, 0x98,
+ 0x20, 0xbd, 0xc5, 0x39, 0xf7, 0x93, 0x5c, 0x6c, 0x4a, 0x0f, 0x50, 0x24,
+ 0xe1, 0xf3, 0x2a, 0x8d, 0x53, 0x55, 0x9a, 0xd6, 0x3a, 0xd3, 0xd6, 0x9c,
+ 0x41, 0xa2, 0x2c, 0x05, 0x1c, 0x5a, 0x28, 0x8d, 0xc0, 0x4f, 0x8d, 0xc1,
+ 0x40, 0xaa, 0x19, 0xbf, 0xa7, 0x93, 0x48, 0xdf, 0x54, 0xcf, 0xb4, 0x47,
+ 0xc4, 0x39, 0x90, 0xbb, 0xff, 0xb4, 0x47, 0x65, 0x33, 0x34, 0x45, 0x23,
+ 0x5e, 0x79, 0xc5, 0xbd, 0x24, 0x30, 0x58, 0x8a, 0x19, 0x68, 0xbb, 0x08,
+ 0xaa, 0xff, 0xce, 0x68, 0x37, 0xb4, 0x62, 0x44, 0x31, 0xe8, 0x3e, 0x4d,
+ 0x05, 0x1d, 0xe2, 0x48, 0x56, 0xd5, 0x53, 0x19, 0xcc, 0xfd, 0x82, 0xa7,
+ 0x06, 0xc4, 0x66, 0x95, 0x6c, 0x43, 0x3d, 0x43, 0x86, 0xe3, 0x62, 0x51,
+ 0x26, 0x1c, 0x57, 0xed, 0x9a, 0x1a, 0x14, 0x4f, 0x41, 0x96, 0xc0, 0x72,
+ 0x38, 0x59, 0xff, 0x69, 0xae, 0x2b, 0x59, 0x65, 0x30, 0xfd, 0xa5, 0x6f,
+ 0x1b, 0xab, 0x01, 0x72, 0xb4, 0xcd, 0xba, 0x44, 0x73, 0x12, 0x31, 0xee,
+ 0x83, 0x08, 0x5c, 0x35, 0x41, 0x17, 0xf1, 0x80, 0x55, 0xdd, 0x67, 0xb2,
+ 0xd3, 0xe1, 0x04, 0x51, 0x69, 0x9b, 0x4b, 0x98, 0xcf, 0x17, 0x0a, 0xd4,
+ 0xdc, 0x61, 0xf2, 0xb9, 0x4b, 0x23, 0xb6, 0xe8, 0x0c, 0x0d, 0xda, 0x68,
+ 0xac, 0xd9, 0xf4, 0x11, 0x63, 0x4a, 0x7f, 0x17, 0x69, 0xdb, 0x91, 0x1b,
+ 0x1d, 0xfb, 0x74, 0x58, 0x69, 0xcc, 0xf5, 0xce, 0x0d, 0x1e, 0xdd, 0x6d,
+ 0x2e, 0x87, 0xf2, 0x36, 0x39, 0x22, 0x59, 0x78, 0x01, 0x2c, 0xf0, 0xe6,
+ 0x8c, 0xd1, 0xdb, 0xa4, 0xf4, 0xc4, 0x09, 0x0e, 0xfe, 0x93, 0x88, 0x90,
+ 0x3e, 0x55, 0x60, 0x51, 0x6a, 0xe9, 0x26, 0x41, 0x1f, 0x18, 0xab, 0xc1,
+ 0xa4, 0x66, 0x57, 0xdd, 0xe6, 0x88, 0xbd, 0x74, 0xa0, 0xd3, 0x65, 0x0d,
+ 0x04, 0xe3, 0x97, 0x1e, 0x9b, 0x59, 0xfc, 0xe2, 0x45, 0x9b, 0x90, 0xe1,
+ 0x80, 0x20, 0x85, 0x03, 0x06, 0x1f, 0x46, 0xb1, 0x69, 0xb4, 0xf3, 0x06,
+ 0xa8, 0xb5, 0x78, 0x2c, 0x21, 0xd1, 0x67, 0x8d, 0x91, 0xef, 0x6f, 0xec,
+ 0xed, 0x2c, 0xd7, 0x40, 0x32, 0x09, 0xed, 0x4e, 0x92, 0xbb, 0x28, 0x67,
+ 0xac, 0x09, 0x50, 0x7f, 0x30, 0xed, 0xde, 0x56, 0xeb, 0xc9, 0x23, 0x2f,
+ 0x13, 0x07, 0xef, 0x80, 0x9e, 0x83, 0x6a, 0x24, 0xd4, 0xd1, 0x84, 0xbe,
+ 0xf8, 0x1f, 0xb0, 0xaa, 0x6a, 0xf0, 0xda, 0x02, 0x0c, 0x94, 0xc9, 0xbc,
+ 0x0f, 0xe8, 0x76, 0x95, 0x79, 0x0e, 0x24, 0x1e, 0x4c, 0xdb, 0xe5, 0xd5,
+ 0x20, 0xee, 0x13, 0xff, 0xba, 0x1f, 0x7f, 0x67, 0x89, 0x4b, 0x6b, 0x28,
+ 0x33, 0x61, 0xfb, 0x53, 0xed, 0xf7, 0x13, 0x3f, 0x64, 0xc9, 0x26, 0x19,
+ 0xde, 0xe6, 0xec, 0x74, 0xe0, 0x0e, 0x7b, 0x07, 0xeb, 0xd9, 0xac, 0x7e,
+ 0x1d, 0xac, 0xba, 0xa0, 0x50, 0xc4, 0x12, 0xee, 0x58, 0xe5, 0xe9, 0x7c,
+ 0xa3, 0x40, 0xbd, 0x92, 0x6d, 0xa8, 0x08, 0x3c, 0x9e, 0xdb, 0xd3, 0x08,
+ 0x3d, 0xb3, 0x1c, 0x25, 0x09, 0x51, 0x55, 0xbb, 0x51, 0xc8, 0xe6, 0xd6,
+ 0x30, 0x86, 0x25, 0xa9, 0x01, 0xed, 0x55, 0x11, 0xa4, 0x5e, 0x3f, 0x57,
+ 0xb7, 0x9b, 0x64, 0xec, 0x3d, 0x93, 0x28, 0x34, 0xea, 0xe9, 0x53, 0xec,
+ 0x71, 0x7c, 0x1c, 0xee, 0x03, 0x26, 0x1a, 0x15, 0x9f, 0x6c, 0x74, 0xa5,
+ 0xe1, 0x04, 0x76, 0xcb, 0x0b, 0xf9, 0x96, 0x4f, 0x4e, 0xb6, 0x7e, 0xad,
+ 0xc5, 0x4b, 0x37, 0x44, 0x91, 0xfd, 0x1d, 0x69, 0x11, 0x17, 0x82, 0xc4,
+ 0x17, 0x39, 0x29, 0x99, 0x8f, 0xe1, 0x35, 0x4d, 0x9e, 0x4f, 0xc9, 0x98,
+ 0x71, 0x6b, 0xa9, 0x0d, 0x0a, 0xf8, 0xb6, 0x3a, 0x52, 0xf0, 0x82, 0x3b,
+ 0x65, 0x79, 0x60, 0x16, 0xa5, 0xa4, 0xf8, 0x0e, 0xc2, 0x3e, 0xf3, 0x23,
+ 0x82, 0x4d, 0x1f, 0x9d, 0x7b, 0xe1, 0xb8, 0xd3, 0x79, 0xc4, 0x04, 0x1d,
+ 0xfc, 0xbc, 0xdb, 0x37, 0x73, 0x27, 0xe3, 0x8d, 0x65, 0xcb, 0x72, 0xd2,
+ 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x13, 0x0d, 0x80, 0xf6, 0xaa, 0x90,
+ 0xd2, 0x30, 0x87, 0x1b, 0xdb, 0xcd, 0xb9, 0xea, 0x28, 0xfa, 0x10, 0xd5,
+ 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d, 0x42, 0x15, 0xc2, 0xf4,
+ 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9, 0x69, 0x57, 0xf2, 0x3d,
+ 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90, 0x8e, 0xc9, 0x18, 0x43,
+ 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08, 0x6a, 0xf8, 0x79, 0xd9,
+ 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1, 0x7a, 0x62, 0x93, 0xdc,
+ 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9, 0x1e, 0xa6, 0xf1, 0x75,
+ 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c, 0x21, 0xc6, 0xf6, 0xf2,
+ 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c, 0xec, 0xc9, 0xad, 0x4a,
+ 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, 0x49, 0xee, 0x30, 0x68, 0xe5,
+ 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78, 0xba, 0xe0, 0x2c, 0x34,
+ 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b, 0x79, 0x25, 0x39, 0xba,
+ 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6, 0xa5, 0x69, 0x6d, 0xa8,
+ 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34, 0x72, 0xbc, 0xb7, 0x2d,
+ 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16, 0x1a, 0x59, 0x92, 0x11,
+ 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c, 0xdd, 0x2a, 0xa1, 0x0d,
+ 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6, 0xd4, 0x21, 0x5c, 0x2f,
+ 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b, 0x96, 0x95, 0x7f, 0x23,
+ 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9, 0x08, 0xec, 0x91, 0x84,
+ 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50, 0x86, 0xaf, 0x87, 0x9d,
+ 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae, 0x17, 0xa6, 0x29, 0x3d,
+ 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, 0xbf, 0x91, 0xea, 0x6f, 0x17,
+ 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48, 0xc2, 0x1c, 0x6f, 0x6f,
+ 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3, 0xce, 0xcc, 0x9a, 0xd4,
+ 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14, 0x9e, 0xe3, 0x06, 0x8e,
+ 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37, 0x8b, 0xae, 0x02, 0xc3,
+ 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37, 0xb7, 0x92, 0x53, 0x9b,
+ 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, 0x4d, 0x6a, 0x56, 0x96, 0xda,
+ 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83, 0x47, 0x2b, 0xcb, 0x72,
+ 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01, 0x61, 0xa5, 0x99, 0x21,
+ 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29, 0xcd, 0xd2, 0xaa, 0x10,
+ 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d, 0x42, 0x15, 0xc2,
+ 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9, 0x69, 0x57, 0xf2,
+ 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90, 0x8e, 0xc9, 0x18,
+ 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08, 0x6a, 0xf8, 0x79,
+ 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1, 0x7a, 0x62, 0x93,
+ 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9, 0x1e, 0xa6, 0xf1,
+ 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c, 0x21, 0xc6, 0xf6,
+ 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c, 0xec, 0xc9, 0xad,
+ 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, 0x49, 0xee, 0x30, 0x68,
+ 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78, 0xba, 0xe0, 0x2c,
+ 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b, 0x79, 0x25, 0x39,
+ 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6, 0xa5, 0x69, 0x6d,
+ 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34, 0x72, 0xbc, 0xb7,
+ 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16, 0x1a, 0x59, 0x92,
+ 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c, 0xdd, 0x2a, 0xa1,
+ 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6, 0xd4, 0x21, 0x5c,
+ 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b, 0x96, 0x95, 0x7f,
+ 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9, 0x08, 0xec, 0x91,
+ 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50, 0x86, 0xaf, 0x87,
+ 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae, 0x17, 0xa6, 0x29,
+ 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, 0xbf, 0x91, 0xea, 0x6f,
+ 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48, 0xc2, 0x1c, 0x6f,
+ 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3, 0xce, 0xcc, 0x9a,
+ 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14, 0x9e, 0xe3, 0x06,
+ 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37, 0x8b, 0xae, 0x02,
+ 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37, 0xb7, 0x92, 0x53,
+ 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, 0x4d, 0x6a, 0x56, 0x96,
+ 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83, 0x47, 0x2b, 0xcb,
+ 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01, 0x61, 0xa5, 0x99,
+ 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29, 0xcd, 0xd2, 0xaa,
+ 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d, 0x42, 0x15,
+ 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9, 0x69, 0x57,
+ 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90, 0x8e, 0xc9,
+ 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08, 0x6a, 0xf8,
+ 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1, 0x7a, 0x62,
+ 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9, 0x1e, 0xa6,
+ 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c, 0x21, 0xc6,
+ 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c, 0xec, 0xc9,
+ 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, 0x49, 0xee, 0x30,
+ 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78, 0xba, 0xe0,
+ 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b, 0x79, 0x25,
+ 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6, 0xa5, 0x69,
+ 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34, 0x72, 0xbc,
+ 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16, 0x1a, 0x59,
+ 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c, 0xdd, 0x2a,
+ 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6, 0xd4, 0x21,
+ 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b, 0x96, 0x95,
+ 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9, 0x08, 0xec,
+ 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50, 0x86, 0xaf,
+ 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae, 0x17, 0xa6,
+ 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, 0xbf, 0x91, 0xea,
+ 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48, 0xc2, 0x1c,
+ 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3, 0xce, 0xcc,
+ 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14, 0x9e, 0xe3,
+ 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37, 0x8b, 0xae,
+ 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37, 0xb7, 0x92,
+ 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, 0x4d, 0x6a, 0x56,
+ 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83, 0x47, 0x2b,
+ 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01, 0x61, 0xa5,
+ 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29, 0xcd, 0xd2,
+ 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d, 0x42,
+ 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9, 0x69,
+ 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90, 0x8e,
+ 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08, 0x6a,
+ 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1, 0x7a,
+ 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9, 0x1e,
+ 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c, 0x21,
+ 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c, 0xec,
+ 0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, 0x49, 0xee,
+ 0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78, 0xba,
+ 0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b, 0x79,
+ 0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6, 0xa5,
+ 0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34, 0x72,
+ 0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16, 0x1a,
+ 0x59, 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c, 0xdd,
+ 0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6, 0xd4,
+ 0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b, 0x96,
+ 0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9, 0x08,
+ 0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50, 0x86,
+ 0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae, 0x17,
+ 0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, 0xbf, 0x91,
+ 0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48, 0xc2,
+ 0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3, 0xce,
+ 0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14, 0x9e,
+ 0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37, 0x8b,
+ 0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37, 0xb7,
+ 0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, 0x4d, 0x6a,
+ 0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83, 0x47,
+ 0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01, 0x61,
+ 0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29, 0xcd,
+ 0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d,
+ 0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9,
+ 0x69, 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90,
+ 0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08,
+ 0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1,
+ 0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9,
+ 0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c,
+ 0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c,
+ 0xec, 0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, 0x49,
+ 0xee, 0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78,
+ 0xba, 0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b,
+ 0x79, 0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6,
+ 0xa5, 0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34,
+ 0x72, 0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16,
+ 0x1a, 0x59, 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c,
+ 0xdd, 0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6,
+ 0xd4, 0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b,
+ 0x96, 0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9,
+ 0x08, 0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50,
+ 0x86, 0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae,
+ 0x17, 0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, 0xbf,
+ 0x91, 0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48,
+ 0xc2, 0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3,
+ 0xce, 0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14,
+ 0x9e, 0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37,
+ 0x8b, 0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37,
+ 0xb7, 0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, 0x4d,
+ 0x6a, 0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83,
+ 0x47, 0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01,
+ 0x61, 0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29,
+ 0xcd, 0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b,
+ 0x6d, 0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5,
+ 0xb9, 0x69, 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc,
+ 0x90, 0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55,
+ 0x08, 0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a,
+ 0xe1, 0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab,
+ 0xf9, 0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64,
+ 0x8c, 0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c,
+ 0x3c, 0xec, 0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31,
+ 0x49, 0xee, 0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53,
+ 0x78, 0xba, 0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3,
+ 0x7b, 0x79, 0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64,
+ 0xd6, 0xa5, 0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18,
+ 0x34, 0x72, 0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70,
+ 0x16, 0x1a, 0x59, 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92,
+ 0x9c, 0xdd, 0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4,
+ 0xb6, 0xd4, 0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e,
+ 0x5b, 0x96, 0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c,
+ 0xc9, 0x08, 0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95,
+ 0x50, 0x86, 0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10,
+ 0xae, 0x17, 0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a,
+ 0xbf, 0x91, 0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76,
+ 0x48, 0xc2, 0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57,
+ 0xc3, 0xce, 0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3,
+ 0x14, 0x9e, 0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5,
+ 0x37, 0x8b, 0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e,
+ 0x37, 0xb7, 0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66,
+ 0x4d, 0x6a, 0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71,
+ 0x83, 0x47, 0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7,
+ 0x01, 0x61, 0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9,
+ 0x29, 0xcd, 0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b,
+ 0x4b, 0x6d, 0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95,
+ 0xe5, 0xb9, 0x69, 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2,
+ 0xcc, 0x90, 0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9,
+ 0x55, 0x08, 0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1,
+ 0x0a, 0xe1, 0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4,
+ 0xab, 0xf9, 0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47,
+ 0x64, 0x8c, 0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35,
+ 0x7c, 0x3c, 0xec, 0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd,
+ 0x31, 0x49, 0xee, 0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f,
+ 0x53, 0x78, 0xba, 0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10,
+ 0xe3, 0x7b, 0x79, 0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76,
+ 0x64, 0xd6, 0xa5, 0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7,
+ 0x18, 0x34, 0x72, 0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d,
+ 0x70, 0x16, 0x1a, 0x59, 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc,
+ 0x92, 0x9c, 0xdd, 0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52,
+ 0xb4, 0xb6, 0xd4, 0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39,
+ 0x5e, 0x5b, 0x96, 0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d,
+ 0x2c, 0xc9, 0x08, 0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e,
+ 0x95, 0x50, 0x86, 0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a,
+ 0x10, 0xae, 0x17, 0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb,
+ 0x4a, 0xbf, 0x91, 0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84,
+ 0x76, 0x48, 0xc2, 0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43,
+ 0x57, 0xc3, 0xce, 0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b,
+ 0xd3, 0x14, 0x9e, 0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8,
+ 0xf5, 0x37, 0x8b, 0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61,
+ 0x0e, 0x37, 0xb7, 0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7,
+ 0x66, 0x4d, 0x6a, 0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f,
+ 0x71, 0x83, 0x47, 0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5,
+ 0xd7, 0x01, 0x61, 0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb,
+ 0xc9, 0x29, 0xcd, 0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5,
+ 0x2b, 0x4b, 0x6d, 0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3,
+ 0x95, 0xe5, 0xb9, 0x69, 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0,
+ 0xd2, 0xcc, 0x90, 0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6,
+ 0xe9, 0x55, 0x08, 0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6,
+ 0xa1, 0x0a, 0xe1, 0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc,
+ 0xb4, 0xab, 0xf9, 0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48,
+ 0x47, 0x64, 0x8c, 0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84,
+ 0x35, 0x7c, 0x3c, 0xec, 0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70,
+ 0xbd, 0x31, 0x49, 0xee, 0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc,
+ 0x8f, 0x53, 0x78, 0xba, 0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46,
+ 0x10, 0xe3, 0x7b, 0x79, 0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e,
+ 0x76, 0x64, 0xd6, 0xa5, 0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4,
+ 0xf7, 0x18, 0x34, 0x72, 0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc,
+ 0x5d, 0x70, 0x16, 0x1a, 0x59, 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd,
+ 0xbc, 0x92, 0x9c, 0xdd, 0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b,
+ 0x52, 0xb4, 0xb6, 0xd4, 0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a,
+ 0x39, 0x5e, 0x5b, 0x96, 0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b,
+ 0x0d, 0x2c, 0xc9, 0x08, 0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e,
+ 0x6e, 0x95, 0x50, 0x86, 0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b,
+ 0x6a, 0x10, 0xae, 0x17, 0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d,
+ 0xcb, 0x4a, 0xbf, 0x91, 0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64,
+ 0x84, 0x76, 0x48, 0xc2, 0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8,
+ 0x43, 0x57, 0xc3, 0xce, 0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57,
+ 0x0b, 0xd3, 0x14, 0x9e, 0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f,
+ 0xc8, 0xf5, 0x37, 0x8b, 0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24,
+ 0x61, 0x0e, 0x37, 0xb7, 0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1,
+ 0xe7, 0x66, 0x4d, 0x6a, 0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a,
+ 0x4f, 0x71, 0x83, 0x47, 0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b,
+ 0xc5, 0xd7, 0x01, 0x61, 0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b,
+ 0xdb, 0xc9, 0x29, 0xcd, 0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26,
+ 0xb5, 0x2b, 0x4b, 0x6d, 0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1,
+ 0xa3, 0x95, 0xe5, 0xb9, 0x69, 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80,
+ 0xb0, 0xd2, 0xcc, 0x90, 0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94,
+ 0xe6, 0xe9, 0x55, 0x08, 0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5,
+ 0xb6, 0xa1, 0x0a, 0xe1, 0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2,
+ 0xdc, 0xb4, 0xab, 0xf9, 0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66,
+ 0x48, 0x47, 0x64, 0x8c, 0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x75, 0x80,
+};
+static_assert(sizeof(kBytesTestReadSymbol5) == kNumBytesTestReadSymbol5, "");
+
+// The kBytesTestReadSymbol6[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][7] = {
+// // pmf: 1/6, 1/6, 1/6, 1/6, 1/6, 1/6
+// { 32768 - 5461, 32768 - 10923, 32768 - 16384, 32768 - 21845, 32768 - 27307,
+// 0, 0 },
+// // pmf: 3/12, 2/12, 2/12, 2/12, 2/12, 1/12
+// { 32768 - 8192, 32768 - 13653, 32768 - 19115, 32768 - 24576, 32768 - 30037,
+// 0, 0 },
+// // pmf: 1/12, 2/12, 2/12, 2/12, 2/12, 3/12
+// { 32768 - 2731, 32768 - 8192, 32768 - 13653, 32768 - 19115, 32768 - 24576,
+// 0, 0 },
+// // pmf: 1/12, 2/12, 3/12, 3/12, 2/12, 1/12
+// { 32768 - 2731, 32768 - 8192, 32768 - 16384, 32768 - 24576, 32768 - 30037,
+// 0, 0 },
+// };
+// constexpr int kSymbols[12][4] = { { 0, 0, 5, 5 }, //
+// { 0, 1, 4, 4 }, //
+// { 1, 2, 3, 3 }, //
+// { 1, 3, 2, 2 }, //
+// { 2, 4, 1, 1 }, //
+// { 2, 5, 0, 0 }, //
+// { 3, 0, 5, 4 }, //
+// { 3, 1, 4, 3 }, //
+// { 4, 2, 3, 2 }, //
+// { 4, 3, 2, 1 }, //
+// { 5, 4, 1, 3 }, //
+// { 5, 0, 5, 2 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 256; ++i) {
+// for (int j = 0; j < 12; ++j) {
+// for (int k = 0; k < 4; ++k) {
+// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 6);
+// }
+// }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+// if (count++ % 12 == 0) {
+// printf("\n ");
+// } else {
+// printf(" ");
+// }
+// printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
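+// A round-trip check could mirror the writer loop with libaom's reader. The
+// sketch below is hedged: the aom_reader_init()/aom_read_symbol() signatures
+// are quoted from memory and should be checked against aom_dsp/bitreader.h,
+// and cdf[] must first be reset to the initial values above, since
+// aom_write_symbol() adapted it during encoding.
+//
+// aom_reader br;
+// aom_reader_init(&br, bw_buffer, bw.pos);
+// br.allow_update_cdf = 1;
+// for (int i = 0; i < 256; ++i) {
+//   for (int j = 0; j < 12; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       assert(aom_read_symbol(&br, cdf[k], 6, NULL) == kSymbols[j][k]);
+//     }
+//   }
+// }
+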
+constexpr size_t kNumBytesTestReadSymbol6 = 3917;
+constexpr uint8_t kBytesTestReadSymbol6[] = {
+ 0x0a, 0x8e, 0xb8, 0x15, 0xd5, 0x69, 0x63, 0x06, 0x48, 0x75, 0xf4, 0x4c,
+ 0xfa, 0x13, 0xba, 0x68, 0x61, 0xa6, 0x9f, 0x39, 0x63, 0xba, 0x63, 0x26,
+ 0xa8, 0xaa, 0xd0, 0x10, 0x4a, 0x05, 0xaf, 0x5f, 0x65, 0x57, 0x2f, 0x68,
+ 0x48, 0x2c, 0x64, 0xdf, 0x0a, 0x93, 0xcc, 0x84, 0x43, 0x97, 0x34, 0x79,
+ 0x10, 0x05, 0x4d, 0x58, 0xe9, 0xc3, 0xb4, 0x4a, 0x70, 0xd4, 0x81, 0x71,
+ 0x9f, 0x6b, 0x18, 0xb3, 0x72, 0xdf, 0x37, 0x87, 0x3e, 0x40, 0xd0, 0xff,
+ 0x10, 0x32, 0x22, 0xe4, 0x36, 0xef, 0xa2, 0x5e, 0x39, 0x5d, 0x42, 0x59,
+ 0x8c, 0x3f, 0x1b, 0x41, 0xdb, 0xc2, 0x8c, 0x64, 0xaf, 0xd2, 0x49, 0x45,
+ 0xd8, 0xad, 0x85, 0x3b, 0x70, 0x13, 0x83, 0x63, 0x49, 0x86, 0x35, 0xfe,
+ 0x93, 0x6b, 0x51, 0x0e, 0x32, 0x3d, 0xf0, 0x30, 0xe0, 0xf5, 0x42, 0x59,
+ 0x33, 0x8e, 0x63, 0x62, 0x46, 0x00, 0x69, 0x06, 0x52, 0x83, 0x37, 0x0b,
+ 0x37, 0x12, 0x38, 0x3b, 0x9c, 0xc3, 0x00, 0xed, 0x0a, 0xd4, 0xed, 0x69,
+ 0x01, 0xc5, 0x3a, 0x14, 0x29, 0xaf, 0x3e, 0x9c, 0x0a, 0xaf, 0x56, 0x50,
+ 0x56, 0xcd, 0xa1, 0xb0, 0x88, 0xef, 0xa7, 0x57, 0xe6, 0xe8, 0x2c, 0x42,
+ 0x60, 0x55, 0x22, 0x1f, 0xcc, 0x50, 0xa9, 0xda, 0xc2, 0x73, 0x19, 0x2e,
+ 0xfb, 0x74, 0x88, 0x42, 0x0d, 0x49, 0x12, 0x5e, 0x36, 0x43, 0xe7, 0x33,
+ 0x00, 0x7d, 0xd5, 0x35, 0xa3, 0xaf, 0x1e, 0x93, 0x5e, 0xe6, 0xae, 0x23,
+ 0x41, 0x55, 0x05, 0x19, 0xde, 0xa7, 0xf1, 0x07, 0xbd, 0x58, 0xc1, 0x10,
+ 0x0a, 0x4b, 0x5c, 0xee, 0xe3, 0xfb, 0xe5, 0xf5, 0xfc, 0x1a, 0x4e, 0x51,
+ 0xda, 0x3e, 0xc5, 0x36, 0xda, 0x3e, 0x83, 0xfd, 0x6b, 0x6f, 0x54, 0xdb,
+ 0x68, 0x5a, 0x9c, 0x46, 0xbf, 0x86, 0x23, 0xf1, 0xbd, 0xe1, 0x79, 0x5e,
+ 0xf7, 0x1c, 0xe0, 0xf7, 0xa6, 0xd5, 0x9f, 0x0b, 0x74, 0xd8, 0xf2, 0x0a,
+ 0x97, 0x71, 0xa2, 0xd2, 0x37, 0x05, 0x7e, 0x3e, 0xa4, 0xec, 0x16, 0x92,
+ 0x37, 0xdd, 0x45, 0x0c, 0x17, 0x42, 0xf0, 0x34, 0xf7, 0x38, 0x04, 0xdf,
+ 0xb8, 0xb4, 0xd6, 0xa0, 0x2c, 0x56, 0x96, 0x10, 0x30, 0x34, 0x10, 0x39,
+ 0x9e, 0x95, 0x3b, 0x13, 0xf3, 0x60, 0xa1, 0x48, 0xca, 0x9f, 0x91, 0xfe,
+ 0x42, 0xfb, 0xdf, 0x37, 0xf8, 0x5d, 0x49, 0x82, 0x42, 0x4f, 0x90, 0xdf,
+ 0xae, 0x32, 0x20, 0x9e, 0xb6, 0xcc, 0xa0, 0x30, 0x07, 0x15, 0x64, 0xb8,
+ 0x56, 0x84, 0x1e, 0x16, 0xa3, 0x35, 0xad, 0x14, 0x9d, 0x62, 0x65, 0x0c,
+ 0x77, 0x82, 0x74, 0x41, 0x9c, 0x68, 0x95, 0x03, 0x4f, 0xfc, 0x1c, 0xc7,
+ 0xd6, 0xe6, 0xe7, 0xb3, 0x54, 0x66, 0x87, 0xb6, 0x41, 0x03, 0xe2, 0x20,
+ 0xf7, 0xdb, 0x2a, 0x0a, 0x25, 0x20, 0x60, 0xdf, 0xfd, 0x9f, 0x5f, 0x2c,
+ 0x72, 0x5f, 0x2b, 0xf4, 0x07, 0x9f, 0xf3, 0x8a, 0xde, 0xf0, 0x4f, 0x8a,
+ 0xa7, 0x75, 0xe3, 0xe8, 0xc9, 0xa1, 0xa0, 0x01, 0xa1, 0x20, 0xc8, 0xfb,
+ 0xf9, 0x91, 0xd2, 0x23, 0x4f, 0x6c, 0x53, 0x3b, 0x12, 0x01, 0xac, 0x1f,
+ 0x89, 0x84, 0x98, 0xcd, 0x3c, 0x74, 0x51, 0x92, 0xbe, 0x87, 0x06, 0x62,
+ 0x49, 0xd2, 0x1b, 0x27, 0xfa, 0x28, 0xf8, 0xbd, 0xbb, 0x7a, 0x7d, 0xde,
+ 0xa2, 0x9c, 0x1b, 0x7c, 0x80, 0xe8, 0xe0, 0x43, 0x64, 0xdd, 0x22, 0x7e,
+ 0x2c, 0xe4, 0x79, 0x2e, 0xbd, 0x98, 0x1a, 0x59, 0x7e, 0xbe, 0xfd, 0x9e,
+ 0x0c, 0x31, 0x50, 0x10, 0xdd, 0x62, 0x3c, 0x47, 0x9a, 0x11, 0x1b, 0x48,
+ 0xf3, 0xd1, 0x2c, 0x1b, 0xc2, 0xb5, 0x57, 0x7c, 0xe5, 0x97, 0x6d, 0x78,
+ 0xe7, 0xa2, 0xd6, 0x57, 0x61, 0x95, 0xed, 0x8d, 0xda, 0xc6, 0xdf, 0x2c,
+ 0x1d, 0x48, 0xee, 0x53, 0xd8, 0x1e, 0x80, 0x41, 0xce, 0x58, 0x08, 0x96,
+ 0x6f, 0x82, 0x6e, 0x28, 0x6a, 0x5a, 0x2b, 0x4f, 0x02, 0x4d, 0x99, 0x32,
+ 0xea, 0x60, 0xce, 0x75, 0x57, 0x0c, 0x63, 0xf0, 0xda, 0x51, 0x1d, 0xcc,
+ 0xb8, 0x21, 0x35, 0x10, 0x56, 0xaf, 0x80, 0xb3, 0x0f, 0x17, 0x29, 0x0c,
+ 0x16, 0x07, 0x66, 0xe9, 0xcb, 0x52, 0xcd, 0xec, 0xb1, 0x79, 0xf8, 0xb9,
+ 0x05, 0x08, 0xa1, 0xd7, 0x03, 0x6f, 0x8e, 0x9a, 0x6e, 0xfb, 0x38, 0x3a,
+ 0xff, 0xa7, 0xa1, 0xd8, 0xb1, 0x56, 0x06, 0xde, 0xb1, 0xe7, 0x47, 0xc2,
+ 0xc2, 0xab, 0xa9, 0x5f, 0x01, 0x65, 0x5d, 0x4c, 0xac, 0xd8, 0x1c, 0xfd,
+ 0x2d, 0x55, 0x74, 0x8a, 0x2b, 0x41, 0x2d, 0x50, 0x0c, 0x9c, 0x64, 0xb2,
+ 0xed, 0xaf, 0x2a, 0xb4, 0x58, 0x93, 0xd8, 0xc2, 0xab, 0x04, 0x45, 0xfc,
+ 0xd7, 0x02, 0x1e, 0x14, 0xd4, 0x38, 0xba, 0x24, 0x07, 0x9a, 0x25, 0x52,
+ 0x13, 0xe1, 0xe4, 0x26, 0x66, 0x12, 0xba, 0x13, 0x11, 0x25, 0xea, 0x29,
+ 0xc5, 0xff, 0x34, 0xca, 0x18, 0x34, 0x97, 0x4a, 0x92, 0x00, 0xe8, 0x61,
+ 0x18, 0x85, 0x0b, 0x56, 0x83, 0x48, 0xf9, 0xdb, 0x26, 0x7b, 0x54, 0xc8,
+ 0xd2, 0x63, 0x1e, 0x7b, 0x25, 0x3c, 0x4a, 0xa6, 0xda, 0x10, 0x92, 0xca,
+ 0x8a, 0x2c, 0x89, 0x60, 0x8e, 0xda, 0xf2, 0xab, 0x45, 0x89, 0x3d, 0x8c,
+ 0x2d, 0x35, 0xda, 0xc1, 0x7c, 0x3d, 0x05, 0x8e, 0xad, 0x5b, 0xff, 0x7d,
+ 0x46, 0x7b, 0x74, 0x71, 0xec, 0x05, 0x9a, 0x85, 0xa4, 0x4f, 0xc3, 0x54,
+ 0x64, 0x90, 0xe5, 0x97, 0x89, 0x1a, 0xb0, 0x56, 0x30, 0x13, 0xda, 0x44,
+ 0x2c, 0xb0, 0x50, 0x0c, 0x64, 0x43, 0x4a, 0xd2, 0x2a, 0xb4, 0x8f, 0x9d,
+ 0xa6, 0xe5, 0x3c, 0x0c, 0x7a, 0x44, 0xb3, 0xeb, 0xa7, 0x92, 0xe5, 0x59,
+ 0xa6, 0x43, 0xe9, 0x2b, 0x1f, 0x69, 0x4a, 0xc4, 0x89, 0xe7, 0xe0, 0x04,
+ 0x9f, 0x1d, 0x33, 0x61, 0xe8, 0xab, 0x75, 0x8d, 0x30, 0xd6, 0x7c, 0xca,
+ 0x02, 0xbe, 0xf9, 0x1d, 0x02, 0x4e, 0x0f, 0x88, 0xc9, 0x3f, 0x54, 0x9d,
+ 0x93, 0x0d, 0x44, 0xf8, 0xf6, 0xa7, 0x1a, 0xb6, 0x8b, 0xf5, 0x14, 0xca,
+ 0xbd, 0x6c, 0x2d, 0x9e, 0xfa, 0x80, 0x36, 0x53, 0x06, 0xac, 0x39, 0x0f,
+ 0x6b, 0xdb, 0x2e, 0xe0, 0x4f, 0xf0, 0xa4, 0x44, 0x5a, 0xbb, 0xaa, 0x72,
+ 0x59, 0x3f, 0x58, 0x38, 0xe5, 0x5c, 0x76, 0x31, 0xe6, 0xfe, 0x08, 0x20,
+ 0xbe, 0x3f, 0xea, 0x00, 0x0d, 0x34, 0xd9, 0x4d, 0x06, 0x0a, 0xb5, 0x04,
+ 0x7b, 0x48, 0x22, 0xa9, 0x94, 0x47, 0x44, 0xfd, 0x65, 0x81, 0x45, 0x56,
+ 0x91, 0xf3, 0xb4, 0xdc, 0xa7, 0x6e, 0xb1, 0xa4, 0xc5, 0xd6, 0x81, 0x6a,
+ 0x78, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d,
+ 0xa7, 0xa7, 0xf2, 0x17, 0x92, 0x06, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12,
+ 0x7f, 0x99, 0x40, 0x57, 0xdf, 0x23, 0xa0, 0x49, 0xc1, 0xf1, 0x19, 0x27,
+ 0xea, 0x93, 0xb2, 0x61, 0xa8, 0x9f, 0x1e, 0xd4, 0xe3, 0x56, 0xd1, 0x7e,
+ 0xa2, 0x99, 0x57, 0xad, 0x85, 0xb3, 0xdf, 0x50, 0x06, 0xca, 0x60, 0xd5,
+ 0x87, 0x21, 0xed, 0x7b, 0x65, 0xdc, 0x09, 0xfe, 0x14, 0x88, 0x8b, 0x57,
+ 0x75, 0x4e, 0x4b, 0x27, 0xeb, 0x07, 0x1c, 0xab, 0x8e, 0xc6, 0x3c, 0xdf,
+ 0xc1, 0x04, 0x17, 0xc7, 0xfd, 0x40, 0x01, 0xa6, 0x9b, 0x29, 0xa0, 0xc1,
+ 0x56, 0xa0, 0x8f, 0x69, 0x04, 0x55, 0x32, 0x88, 0xe8, 0x9f, 0x8d, 0x2b,
+ 0x48, 0xaa, 0xd2, 0x3e, 0x76, 0x9b, 0x94, 0xed, 0xd6, 0x34, 0x98, 0xba,
+ 0x16, 0x3c, 0x29, 0xce, 0x3d, 0x14, 0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa,
+ 0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59,
+ 0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5,
+ 0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46,
+ 0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb, 0x49, 0x54, 0x61, 0x97,
+ 0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04,
+ 0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5,
+ 0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a,
+ 0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3, 0x77,
+ 0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc,
+ 0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f,
+ 0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4,
+ 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0,
+ 0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40, 0x57,
+ 0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca,
+ 0x65, 0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48, 0x42, 0x4a,
+ 0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c,
+ 0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf, 0x61, 0x84,
+ 0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca, 0x61,
+ 0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4,
+ 0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e,
+ 0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f,
+ 0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1,
+ 0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c,
+ 0x32, 0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43,
+ 0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f,
+ 0xfc, 0xbe, 0x41, 0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1,
+ 0x29, 0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda,
+ 0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87, 0x41, 0x27,
+ 0xf9, 0x94, 0x05, 0x7b, 0x0c, 0x20, 0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c,
+ 0x83, 0xf3, 0xcc, 0xa6, 0x53, 0x08, 0x53, 0x84, 0x23, 0x42, 0x52, 0x91,
+ 0x54, 0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3,
+ 0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28,
+ 0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7,
+ 0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08,
+ 0x49, 0x58, 0xfb, 0x49, 0x54, 0x61, 0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c,
+ 0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec,
+ 0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99,
+ 0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1,
+ 0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3, 0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23,
+ 0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06,
+ 0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42,
+ 0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25,
+ 0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa,
+ 0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40, 0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7,
+ 0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca, 0x65, 0x30, 0x85, 0x38, 0x42,
+ 0x34, 0x25, 0x29, 0x15, 0x48, 0x42, 0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c,
+ 0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8,
+ 0x24, 0xff, 0x32, 0x80, 0xaf, 0x61, 0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff,
+ 0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca, 0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a,
+ 0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b,
+ 0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe,
+ 0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f, 0x9c, 0xbf, 0xfe, 0x5f, 0x20,
+ 0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55,
+ 0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c, 0x32, 0xed, 0x37, 0x78, 0xde,
+ 0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02,
+ 0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f, 0xfc, 0xbe, 0x41, 0xf9, 0xe6,
+ 0x53, 0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12,
+ 0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c,
+ 0x64, 0x65, 0xba, 0xac, 0x87, 0x41, 0x27, 0xf9, 0x94, 0x05, 0x7b, 0x0c,
+ 0x20, 0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c, 0x83, 0xf3, 0xcc, 0xa6, 0x53,
+ 0x08, 0x53, 0x84, 0x23, 0x42, 0x52, 0x91, 0x54, 0x84, 0x24, 0xac, 0x7d,
+ 0xa4, 0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb,
+ 0x75, 0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0,
+ 0x7c, 0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7,
+ 0x08, 0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb, 0x49, 0x54,
+ 0x61, 0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2,
+ 0x1d, 0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb,
+ 0xff, 0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d,
+ 0x09, 0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e,
+ 0xd3, 0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09,
+ 0x3f, 0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb,
+ 0xe4, 0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94,
+ 0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef,
+ 0x1b, 0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99,
+ 0x40, 0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f,
+ 0x3c, 0xca, 0x65, 0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48,
+ 0x42, 0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0,
+ 0xe7, 0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf,
+ 0x61, 0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94,
+ 0xca, 0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95,
+ 0x8f, 0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19,
+ 0x19, 0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08,
+ 0x34, 0x0f, 0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2,
+ 0x14, 0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69,
+ 0x2a, 0x8c, 0x32, 0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd,
+ 0x56, 0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f,
+ 0x39, 0x7f, 0xfc, 0xbe, 0x41, 0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2,
+ 0x11, 0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18,
+ 0x65, 0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87,
+ 0x41, 0x27, 0xf9, 0x94, 0x05, 0x7b, 0x0c, 0x20, 0xd0, 0x3e, 0x72, 0xff,
+ 0xf9, 0x7c, 0x83, 0xf3, 0xcc, 0xa6, 0x53, 0x08, 0x53, 0x84, 0x23, 0x42,
+ 0x52, 0x91, 0x54, 0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa, 0x30, 0xcb, 0xb4,
+ 0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59, 0x0e, 0x82, 0x4f,
+ 0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5, 0xff, 0xf2, 0xf9,
+ 0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46, 0x84, 0xa5, 0x22,
+ 0xa9, 0x08, 0x49, 0x58, 0xfb, 0x49, 0x54, 0x61, 0x97, 0x69, 0xbb, 0xc6,
+ 0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04, 0x9f, 0xe6, 0x50,
+ 0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5, 0xf2, 0x0f, 0xcf,
+ 0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a, 0x45, 0x52, 0x10,
+ 0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3, 0x77, 0x8d, 0xe8, 0x39,
+ 0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc, 0xa0, 0x2b, 0xd8,
+ 0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f, 0x9e, 0x65, 0x32,
+ 0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63,
+ 0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0, 0x73, 0xc6, 0x46,
+ 0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40, 0x57, 0xb0, 0xc2, 0x0d,
+ 0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca, 0x65, 0x30, 0x85,
+ 0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48, 0x42, 0x4a, 0xc7, 0xda, 0x4a,
+ 0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c, 0x8c, 0xb7, 0x55,
+ 0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf, 0x61, 0x84, 0x1a, 0x07, 0xce,
+ 0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca, 0x61, 0x0a, 0x70, 0x84,
+ 0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4, 0x95, 0x46, 0x19,
+ 0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e, 0xab, 0x21, 0xd0,
+ 0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f, 0x9c, 0xbf, 0xfe,
+ 0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1, 0x08, 0xd0, 0x94,
+ 0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c, 0x32, 0xed, 0x37,
+ 0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43, 0xa0, 0x93, 0xfc,
+ 0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f, 0xfc, 0xbe, 0x41,
+ 0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1, 0x29, 0x48, 0xaa,
+ 0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda, 0x6e, 0xf1, 0xbd,
+ 0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87, 0x41, 0x27, 0xf9, 0x94, 0x05,
+ 0x7b, 0x0c, 0x20, 0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c, 0x83, 0xf3, 0xcc,
+ 0xa6, 0x53, 0x08, 0x53, 0x84, 0x23, 0x42, 0x52, 0x91, 0x54, 0x84, 0x24,
+ 0xac, 0x7d, 0xa4, 0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78,
+ 0xc8, 0xcb, 0x75, 0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18,
+ 0x41, 0xa0, 0x7c, 0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6,
+ 0x10, 0xa7, 0x08, 0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb,
+ 0x49, 0x54, 0x61, 0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96,
+ 0xea, 0xb2, 0x1d, 0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40,
+ 0xf9, 0xcb, 0xff, 0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e,
+ 0x10, 0x8d, 0x09, 0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8,
+ 0xc3, 0x2e, 0xd3, 0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64,
+ 0x3a, 0x09, 0x3f, 0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97,
+ 0xff, 0xcb, 0xe4, 0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a,
+ 0x12, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d,
+ 0xa6, 0xef, 0x1b, 0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12,
+ 0x7f, 0x99, 0x40, 0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97,
+ 0xc8, 0x3f, 0x3c, 0xca, 0x65, 0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29,
+ 0x15, 0x48, 0x42, 0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde,
+ 0x37, 0xa0, 0xe7, 0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32,
+ 0x80, 0xaf, 0x61, 0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e,
+ 0x79, 0x94, 0xca, 0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90,
+ 0x84, 0x95, 0x8f, 0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41,
+ 0xcf, 0x19, 0x19, 0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e,
+ 0xc3, 0x08, 0x34, 0x0f, 0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29,
+ 0x94, 0xc2, 0x14, 0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b,
+ 0x1f, 0x69, 0x2a, 0x8c, 0x32, 0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32,
+ 0x32, 0xdd, 0x56, 0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10,
+ 0x68, 0x1f, 0x39, 0x7f, 0xfc, 0xbe, 0x41, 0xf9, 0xe6, 0x53, 0x29, 0x84,
+ 0x29, 0xc2, 0x11, 0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2,
+ 0x55, 0x18, 0x65, 0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba,
+ 0xac, 0x87, 0x41, 0x27, 0xf9, 0x94, 0x05, 0x7b, 0x0c, 0x20, 0xd0, 0x3e,
+ 0x72, 0xff, 0xf9, 0x7c, 0x83, 0xf3, 0xcc, 0xa6, 0x53, 0x08, 0x53, 0x84,
+ 0x23, 0x42, 0x52, 0x91, 0x54, 0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa, 0x30,
+ 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59, 0x0e,
+ 0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5, 0xff,
+ 0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46, 0x84,
+ 0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb, 0x49, 0x54, 0x61, 0x97, 0x69,
+ 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04, 0x9f,
+ 0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5, 0xf2,
+ 0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a, 0x45,
+ 0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3, 0x77, 0x8d,
+ 0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc, 0xa0,
+ 0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f, 0x9e,
+ 0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4, 0x21,
+ 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0, 0x73,
+ 0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40, 0x57, 0xb0,
+ 0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca, 0x65,
+ 0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48, 0x42, 0x4a, 0xc7,
+ 0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c, 0x8c,
+ 0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf, 0x61, 0x84, 0x1a,
+ 0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca, 0x61, 0x0a,
+ 0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4, 0x95,
+ 0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e, 0xab,
+ 0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f, 0x9c,
+ 0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1, 0x08,
+ 0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c, 0x32,
+ 0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43, 0xa0,
+ 0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f, 0xfc,
+ 0xbe, 0x41, 0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1, 0x29,
+ 0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda, 0x6e,
+ 0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87, 0x41, 0x27, 0xf9,
+ 0x94, 0x05, 0x7b, 0x0c, 0x20, 0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c, 0x83,
+ 0xf3, 0xcc, 0xa6, 0x53, 0x08, 0x53, 0x84, 0x23, 0x42, 0x52, 0x91, 0x54,
+ 0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a,
+ 0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a,
+ 0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99,
+ 0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49,
+ 0x58, 0xfb, 0x49, 0x54, 0x61, 0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1,
+ 0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30,
+ 0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c,
+ 0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6,
+ 0x92, 0xa8, 0xc3, 0x2e, 0xd3, 0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d,
+ 0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81,
+ 0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c,
+ 0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51,
+ 0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8,
+ 0x74, 0x12, 0x7f, 0x99, 0x40, 0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f,
+ 0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca, 0x65, 0x30, 0x85, 0x38, 0x42, 0x34,
+ 0x25, 0x29, 0x15, 0x48, 0x42, 0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb,
+ 0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24,
+ 0xff, 0x32, 0x80, 0xaf, 0x61, 0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f,
+ 0x90, 0x7e, 0x79, 0x94, 0xca, 0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52,
+ 0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc,
+ 0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65,
+ 0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f, 0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc,
+ 0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21,
+ 0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c, 0x32, 0xed, 0x37, 0x78, 0xde, 0x83,
+ 0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd,
+ 0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f, 0xfc, 0xbe, 0x41, 0xf9, 0xe6, 0x53,
+ 0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12, 0x56,
+ 0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64,
+ 0x65, 0xba, 0xac, 0x87, 0x41, 0x27, 0xf9, 0x94, 0x05, 0x7b, 0x0c, 0x20,
+ 0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c, 0x83, 0xf3, 0xcc, 0xa6, 0x53, 0x08,
+ 0x53, 0x84, 0x23, 0x42, 0x52, 0x91, 0x54, 0x84, 0x24, 0xac, 0x7d, 0xa4,
+ 0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75,
+ 0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c,
+ 0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08,
+ 0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb, 0x49, 0x54, 0x61,
+ 0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d,
+ 0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff,
+ 0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09,
+ 0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3,
+ 0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f,
+ 0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4,
+ 0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a,
+ 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b,
+ 0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40,
+ 0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c,
+ 0xca, 0x65, 0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48, 0x42,
+ 0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7,
+ 0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf, 0x61,
+ 0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca,
+ 0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f,
+ 0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19,
+ 0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34,
+ 0x0f, 0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14,
+ 0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a,
+ 0x8c, 0x32, 0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56,
+ 0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39,
+ 0x7f, 0xfc, 0xbe, 0x41, 0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2, 0x11,
+ 0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65,
+ 0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87, 0x41,
+ 0x27, 0xf9, 0x94, 0x05, 0xa0,
+};
+static_assert(sizeof(kBytesTestReadSymbol6) == kNumBytesTestReadSymbol6, "");
+
+// The kBytesTestReadSymbol7[] array was encoded by using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][8] = {
+// // pdf: 1/7, 1/7, 1/7, 1/7, 1/7, 1/7, 1/7
+// { 32768 - 4681, 32768 - 9362, 32768 - 14043, 32768 - 18725,
+// 32768 - 23406, 32768 - 28087, 0, 0 },
+// // pdf: 3/14, 2/14, 2/14, 2/14, 2/14, 2/14, 1/14
+// { 32768 - 7022, 32768 - 11703, 32768 - 16384, 32768 - 21065,
+// 32768 - 25746, 32768 - 30427, 0, 0 },
+// // pdf: 1/14, 1/14, 2/14, 2/14, 2/14, 3/14, 3/14
+// { 32768 - 2341, 32768 - 4681, 32768 - 9362, 32768 - 14043,
+// 32768 - 18725, 32768 - 25746, 0, 0 },
+// // pdf: 1/14, 2/14, 3/14, 3/14, 2/14, 2/14, 1/14
+// { 32768 - 2341, 32768 - 7022, 32768 - 14043, 32768 - 21065,
+// 32768 - 25746, 32768 - 30427, 0, 0 },
+// };
+// constexpr int kSymbols[14][4] = { { 0, 4, 6, 3 }, //
+// { 1, 5, 5, 2 }, //
+// { 2, 6, 4, 1 }, //
+// { 3, 0, 3, 0 }, //
+// { 4, 1, 2, 6 }, //
+// { 5, 2, 1, 5 }, //
+// { 6, 3, 0, 4 }, //
+// { 0, 0, 6, 5 }, //
+// { 2, 1, 4, 3 }, //
+// { 4, 3, 6, 1 }, //
+// { 6, 5, 2, 4 }, //
+// { 1, 0, 5, 2 }, //
+// { 3, 2, 3, 2 }, //
+// { 5, 4, 5, 3 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 1024; ++i) {
+// for (int j = 0; j < 14; ++j) {
+// for (int k = 0; k < 4; ++k) {
+// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 7);
+// }
+// }
+// }
+// aom_stop_encode(&bw);
+// printf(" constexpr size_t kNumBytesTestReadSymbol7 = %u;\n", bw.pos);
+// printf(" constexpr uint8_t kBytesTestReadSymbol7[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+// if (count++ % 12 == 0) {
+// printf("\n ");
+// } else {
+// printf(" ");
+// }
+// printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n };\n");
+
+constexpr size_t kNumBytesTestReadSymbol7 = 19874;
+constexpr uint8_t kBytesTestReadSymbol7[] = {
+ 0x1c, 0x6a, 0xfc, 0x4b, 0xd1, 0xb5, 0x8c, 0x20, 0x72, 0x45, 0x48, 0x21,
+ 0x9e, 0x71, 0xe8, 0xc4, 0x91, 0x51, 0xab, 0xfd, 0x9c, 0x61, 0xf7, 0x98,
+ 0xd4, 0x87, 0x71, 0xe6, 0x23, 0x37, 0x7e, 0xa3, 0xe0, 0x83, 0x48, 0x2e,
+ 0xfe, 0xc3, 0xcb, 0x4f, 0x26, 0x9a, 0xd7, 0xe4, 0xca, 0xf4, 0x94, 0xb7,
+ 0xbc, 0x03, 0xc9, 0xc3, 0x5e, 0x7f, 0xef, 0x9b, 0x37, 0xff, 0x8f, 0x62,
+ 0xec, 0xb6, 0x09, 0x50, 0xa9, 0xc1, 0x4a, 0x97, 0xf4, 0xe7, 0x08, 0x57,
+ 0x87, 0x2d, 0x10, 0xca, 0xbc, 0x93, 0x85, 0xfb, 0xc8, 0xc7, 0x8f, 0xc1,
+ 0x4e, 0x1f, 0x50, 0xad, 0xba, 0x09, 0x9c, 0xf8, 0x94, 0x75, 0xdd, 0x2c,
+ 0x78, 0x5d, 0xa0, 0x4a, 0xf3, 0x7b, 0xc0, 0xa7, 0x71, 0xa5, 0x20, 0xe6,
+ 0xb0, 0xca, 0x09, 0xf2, 0x38, 0xfc, 0x61, 0x49, 0xdc, 0x83, 0x35, 0x1e,
+ 0xdd, 0x08, 0xd7, 0xaa, 0x50, 0x0e, 0xc5, 0x57, 0x05, 0x44, 0xd7, 0xdb,
+ 0x56, 0x2b, 0x1e, 0xe5, 0x33, 0x08, 0x7c, 0x3d, 0x25, 0x29, 0x05, 0x14,
+ 0x3a, 0x93, 0xff, 0xe7, 0x40, 0x25, 0x30, 0x17, 0xc3, 0x50, 0xad, 0xec,
+ 0xb3, 0x64, 0x87, 0x35, 0xb2, 0x5a, 0x1e, 0xa9, 0x48, 0xc8, 0x53, 0x30,
+ 0xf1, 0x43, 0x6f, 0xe1, 0x2a, 0x8b, 0x81, 0x49, 0xbc, 0xa8, 0x8a, 0x8b,
+ 0x2d, 0x1a, 0xc5, 0xcb, 0x47, 0xc1, 0xbc, 0xe0, 0x54, 0x98, 0xcc, 0x82,
+ 0xe9, 0xa6, 0x3f, 0x70, 0x55, 0xe3, 0xe0, 0x7d, 0x5f, 0xa9, 0xc4, 0xc1,
+ 0x62, 0x04, 0x2d, 0x15, 0xce, 0xab, 0x7c, 0xd9, 0x88, 0xc1, 0x67, 0x88,
+ 0x3d, 0x6e, 0x96, 0x03, 0x6f, 0xa7, 0x6a, 0xc2, 0x6f, 0x20, 0x8c, 0xf4,
+ 0xfb, 0x96, 0x0c, 0xb7, 0x14, 0xef, 0xa6, 0x83, 0xbd, 0x2b, 0x07, 0x8a,
+ 0x2a, 0x66, 0xb8, 0x0d, 0xa8, 0x72, 0x2a, 0x78, 0x90, 0x2a, 0xe4, 0x46,
+ 0x71, 0x8c, 0xcb, 0xcb, 0xbd, 0xfb, 0xc7, 0xa8, 0x9e, 0x9b, 0x6e, 0x6d,
+ 0x2b, 0xc2, 0x1c, 0xea, 0x16, 0x3a, 0x06, 0xc0, 0xbc, 0xd7, 0x30, 0x8d,
+ 0x87, 0x03, 0x04, 0x0d, 0x58, 0x58, 0x7b, 0x40, 0xf5, 0xe5, 0x7a, 0x51,
+ 0x80, 0x7a, 0x16, 0xc2, 0xaf, 0x83, 0x43, 0x16, 0xb3, 0x3a, 0x1b, 0x24,
+ 0x29, 0x80, 0x60, 0xee, 0x00, 0x91, 0x15, 0xdb, 0x28, 0x0d, 0xc2, 0xfb,
+ 0x74, 0x48, 0xd9, 0x54, 0x97, 0x66, 0xa4, 0xba, 0xc8, 0x19, 0xff, 0x25,
+ 0xca, 0xdf, 0x09, 0x66, 0xe4, 0xfe, 0xbb, 0x2b, 0x3f, 0x4a, 0x81, 0x5a,
+ 0xa6, 0x54, 0x5c, 0xf0, 0xe4, 0x49, 0x38, 0x13, 0xfb, 0xa2, 0xee, 0xf9,
+ 0x7d, 0x72, 0xa9, 0x37, 0x12, 0xf4, 0x04, 0x4e, 0x50, 0x19, 0x6f, 0x29,
+ 0x9d, 0x0d, 0xe7, 0xc3, 0x6d, 0x65, 0x0b, 0x04, 0x53, 0x57, 0x0c, 0xb5,
+ 0x71, 0xb4, 0xd6, 0xb0, 0xaa, 0xed, 0x38, 0x9e, 0x58, 0x55, 0x0d, 0xe4,
+ 0xe6, 0x43, 0x16, 0x93, 0x46, 0x73, 0x39, 0x87, 0xaa, 0x69, 0x07, 0x9f,
+ 0xd7, 0xb6, 0x77, 0x7d, 0xef, 0xc7, 0x19, 0x5d, 0x4f, 0x60, 0x20, 0x7e,
+ 0xf0, 0x34, 0xbe, 0xe4, 0x31, 0xf3, 0x72, 0xe0, 0x89, 0xfb, 0xc8, 0x0a,
+ 0xa9, 0xe6, 0x2c, 0x6b, 0xa5, 0xaa, 0xd5, 0x42, 0x69, 0xc0, 0x27, 0x3b,
+ 0x17, 0x98, 0x73, 0xa3, 0x66, 0x10, 0xd7, 0xac, 0xf9, 0x7f, 0xb2, 0xf3,
+ 0x38, 0x45, 0x23, 0xe2, 0xd4, 0xd2, 0x63, 0x1c, 0x84, 0xde, 0x25, 0xd4,
+ 0x3c, 0x76, 0x58, 0x1a, 0xb6, 0x07, 0x22, 0x74, 0xc2, 0xf7, 0x2c, 0xe1,
+ 0xc0, 0x51, 0x8c, 0xfa, 0xde, 0x6b, 0x35, 0x8c, 0x0f, 0x45, 0xf8, 0x5e,
+ 0x61, 0x2d, 0x4e, 0x90, 0x2d, 0xb7, 0x6c, 0xaf, 0x71, 0x72, 0xdf, 0x68,
+ 0xa9, 0xa2, 0x36, 0x79, 0xbd, 0xee, 0x88, 0xb0, 0xc8, 0xc9, 0xa6, 0x7e,
+ 0x8e, 0xe8, 0x16, 0xbc, 0xd6, 0x82, 0x54, 0xac, 0x81, 0x42, 0x0f, 0xc9,
+ 0x38, 0xd2, 0xe1, 0x17, 0x17, 0x4f, 0xc9, 0x0c, 0x39, 0xc0, 0x70, 0xd8,
+ 0xd8, 0x17, 0x37, 0x4a, 0x93, 0x40, 0x83, 0xe3, 0x3f, 0x05, 0x25, 0xab,
+ 0x6e, 0x58, 0xc1, 0x30, 0x62, 0x4d, 0xad, 0xcd, 0x1b, 0x7a, 0x4b, 0x08,
+ 0xf8, 0x69, 0x85, 0xf1, 0x10, 0x84, 0x22, 0x54, 0x3a, 0x0c, 0x2d, 0x1b,
+ 0xcd, 0x2d, 0xed, 0x95, 0x63, 0x1a, 0x9e, 0xbc, 0xb8, 0x76, 0x48, 0x65,
+ 0xd1, 0xa6, 0x22, 0x98, 0x3e, 0xda, 0x00, 0x56, 0xf4, 0xd3, 0xc5, 0xb0,
+ 0xb3, 0xb0, 0xfa, 0x0c, 0x84, 0x43, 0xfb, 0xa1, 0x1a, 0xba, 0x23, 0xc6,
+ 0x72, 0xea, 0x83, 0x96, 0xff, 0xfd, 0x0d, 0xba, 0x40, 0x32, 0x3e, 0x1a,
+ 0x61, 0x7b, 0xd5, 0x50, 0xfe, 0x41, 0xc8, 0x67, 0x71, 0xb4, 0xff, 0x24,
+ 0xf8, 0x7b, 0xa2, 0x6d, 0x97, 0x84, 0x8e, 0x36, 0x30, 0x05, 0xc3, 0x60,
+ 0x3b, 0x1c, 0xee, 0x34, 0x57, 0x05, 0x0f, 0x9e, 0xc2, 0xfd, 0xc8, 0x03,
+ 0xab, 0x8a, 0x54, 0xde, 0x6a, 0x22, 0xa5, 0xb7, 0x38, 0xf5, 0x91, 0x08,
+ 0xd4, 0xce, 0xe3, 0xa7, 0xb4, 0xcb, 0x58, 0x79, 0xe2, 0x34, 0x79, 0xfa,
+ 0xc2, 0x85, 0x01, 0xeb, 0x53, 0xf1, 0xca, 0x5c, 0xa1, 0xfc, 0x35, 0xa2,
+ 0x7b, 0x8f, 0x29, 0x1c, 0x67, 0xb0, 0x01, 0x1b, 0x5a, 0xa1, 0xc9, 0x3b,
+ 0x2c, 0xc6, 0x35, 0xbb, 0x29, 0x46, 0x13, 0xfa, 0xd9, 0x40, 0x63, 0x3e,
+ 0x6c, 0xa2, 0x36, 0x70, 0xe7, 0xc8, 0x76, 0x55, 0x70, 0xd2, 0x3f, 0xd1,
+ 0xae, 0x83, 0x9d, 0xb9, 0x60, 0x47, 0x3e, 0x38, 0x0d, 0x08, 0x3f, 0xe0,
+ 0x6b, 0x16, 0x7f, 0x7d, 0x7d, 0x40, 0x98, 0x99, 0xc1, 0x27, 0xf2, 0xb5,
+ 0xfe, 0x33, 0xce, 0x83, 0x8c, 0x7d, 0xa7, 0xe6, 0xeb, 0x06, 0xdb, 0x4f,
+ 0xca, 0x10, 0x82, 0x7b, 0x5e, 0xe8, 0xa9, 0x2e, 0xe0, 0x7a, 0xc2, 0x03,
+ 0x75, 0x6e, 0x4e, 0x2b, 0xb6, 0xc3, 0x99, 0xf5, 0x41, 0xe9, 0x75, 0xe5,
+ 0xc5, 0xae, 0x4f, 0xa8, 0x57, 0xf5, 0xf5, 0x89, 0x60, 0xae, 0x41, 0x13,
+ 0x91, 0x77, 0x84, 0xb6, 0x79, 0xea, 0xcb, 0xeb, 0x8d, 0x05, 0xe2, 0x18,
+ 0xfd, 0x36, 0x1f, 0x68, 0x34, 0xd1, 0x3c, 0xc3, 0xe1, 0x87, 0xd3, 0x2a,
+ 0xb1, 0xc5, 0xac, 0xe2, 0xc3, 0xaf, 0xd1, 0x53, 0x61, 0x5e, 0xba, 0xcb,
+ 0x32, 0xde, 0x97, 0xee, 0x4e, 0x58, 0xda, 0xda, 0x9d, 0x12, 0xe2, 0x75,
+ 0x20, 0xd5, 0xb4, 0x64, 0x82, 0x75, 0x3e, 0xee, 0xb9, 0x13, 0x54, 0x54,
+ 0x95, 0x36, 0x36, 0xa9, 0x85, 0x34, 0xa2, 0x37, 0xa0, 0x55, 0xe7, 0x1e,
+ 0x9e, 0xb8, 0xbf, 0x36, 0x96, 0x1b, 0x1c, 0xa9, 0x16, 0xa9, 0x66, 0xb6,
+ 0x30, 0x91, 0xc6, 0xfb, 0x51, 0x30, 0xc8, 0x19, 0x91, 0xca, 0x9e, 0x99,
+ 0x88, 0x5a, 0x29, 0xbc, 0x10, 0x8e, 0x21, 0x93, 0x4b, 0xd1, 0x10, 0x10,
+ 0x10, 0xca, 0x1a, 0x4d, 0x95, 0xd5, 0x0a, 0x08, 0xe4, 0xbc, 0xbc, 0xd4,
+ 0xc4, 0x48, 0xaa, 0xb7, 0x55, 0x88, 0x55, 0x59, 0xfa, 0x05, 0x17, 0xae,
+ 0x2f, 0xcd, 0xa5, 0x86, 0xc7, 0x2a, 0x45, 0xaa, 0x59, 0xad, 0x8c, 0x24,
+ 0x71, 0xbe, 0xd4, 0x4c, 0x32, 0x06, 0x64, 0x72, 0xa7, 0xa6, 0x62, 0x16,
+ 0x8a, 0x6f, 0x04, 0x23, 0x88, 0x64, 0xd2, 0xf4, 0x44, 0x04, 0x04, 0x32,
+ 0x86, 0x93, 0x65, 0x75, 0x42, 0x82, 0x39, 0x2f, 0x2f, 0x35, 0x31, 0x12,
+ 0x2a, 0xad, 0xd5, 0x62, 0x15, 0x56, 0x7e, 0x81, 0x48, 0x8e, 0xd3, 0x5e,
+ 0x73, 0x9d, 0xa3, 0xec, 0xca, 0xdd, 0xbe, 0x89, 0xd7, 0xb8, 0xa3, 0x59,
+ 0xeb, 0x97, 0xb3, 0xf2, 0xf1, 0xa6, 0x4b, 0x8e, 0x89, 0xe6, 0xe9, 0x0a,
+ 0x84, 0x9b, 0xbf, 0xd3, 0x6b, 0xd5, 0xbf, 0x1e, 0x7f, 0x87, 0x55, 0x76,
+ 0x5e, 0xa7, 0xe6, 0x3e, 0xcf, 0x6c, 0x16, 0x5f, 0xf1, 0xf6, 0xf0, 0x3e,
+ 0xd4, 0x4f, 0x71, 0xe5, 0x23, 0x8c, 0xf6, 0xa6, 0x11, 0xc3, 0xf8, 0x7b,
+ 0xc7, 0xea, 0x1a, 0x6a, 0xc7, 0x13, 0x2e, 0x5a, 0xf6, 0x61, 0x9b, 0x71,
+ 0x61, 0x3b, 0x66, 0x37, 0xd4, 0x28, 0xa6, 0xbf, 0xd6, 0xc6, 0x2e, 0x29,
+ 0xd6, 0x38, 0xb5, 0x9c, 0x58, 0x75, 0xfa, 0x2a, 0x6c, 0x2f, 0xa3, 0x8b,
+ 0x02, 0xbe, 0xdd, 0x38, 0xdb, 0x4f, 0xca, 0x25, 0x43, 0x09, 0x44, 0x79,
+ 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x45, 0xaa, 0x53, 0x29, 0x8e, 0xd7, 0x81,
+ 0x74, 0xdd, 0xfa, 0x65, 0x18, 0xd5, 0xc5, 0xae, 0x4f, 0xa8, 0x57, 0xf6,
+ 0x04, 0xf5, 0xcd, 0xd8, 0xa0, 0x26, 0xb4, 0x41, 0xe3, 0x02, 0xc9, 0x95,
+ 0xfe, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0xe6, 0x35, 0xff, 0x03,
+ 0x5f, 0x8c, 0xac, 0x56, 0x1e, 0xec, 0x29, 0xfc, 0x45, 0x97, 0x61, 0x74,
+ 0xa6, 0xed, 0x7c, 0x67, 0x7a, 0xf5, 0xdd, 0x80, 0xaf, 0x42, 0x04, 0x7f,
+ 0x82, 0x46, 0x15, 0x56, 0xea, 0xb1, 0x0a, 0xab, 0x3f, 0x40, 0xa4, 0x47,
+ 0x69, 0xaf, 0x39, 0xce, 0xd1, 0xf6, 0x65, 0x6e, 0xf0, 0x45, 0x5e, 0xfc,
+ 0x51, 0xac, 0xf5, 0xcb, 0xd9, 0xf9, 0x78, 0xd3, 0x25, 0xc7, 0x44, 0xf3,
+ 0x74, 0x85, 0x42, 0x4d, 0xdf, 0xe9, 0xb5, 0xea, 0xdf, 0x8f, 0x3f, 0xc3,
+ 0xaa, 0xbb, 0x2f, 0x53, 0xf3, 0x1f, 0x67, 0xb6, 0x0b, 0x2f, 0xf8, 0xfb,
+ 0x78, 0x1f, 0x6a, 0x27, 0xb8, 0xf2, 0x91, 0xc6, 0x7b, 0x53, 0x08, 0xe1,
+ 0xfc, 0x3d, 0xe3, 0xf5, 0x0d, 0x35, 0x63, 0x89, 0x97, 0x2d, 0x7b, 0x30,
+ 0xcd, 0xb8, 0xb0, 0x9d, 0xb3, 0x1b, 0xea, 0x14, 0x53, 0x5f, 0xeb, 0x63,
+ 0x17, 0x14, 0xeb, 0x1c, 0x5a, 0xce, 0x2c, 0x3a, 0xfd, 0x15, 0x36, 0x17,
+ 0xd1, 0xc5, 0x81, 0x5f, 0x6e, 0x9c, 0x6d, 0xa7, 0xe5, 0x12, 0xa1, 0x84,
+ 0xa2, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x22, 0xd5, 0x29, 0x94, 0xc7,
+ 0x6b, 0xc0, 0xba, 0x6e, 0xfd, 0x32, 0x8c, 0x6a, 0xe2, 0xd7, 0x27, 0xd4,
+ 0x2b, 0xfb, 0x02, 0x7a, 0xe6, 0xec, 0x50, 0x13, 0x5a, 0x20, 0xf1, 0x81,
+ 0x64, 0xca, 0xff, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x73, 0x1a,
+ 0xff, 0x81, 0xaf, 0xc6, 0x56, 0x2b, 0x0f, 0x76, 0x14, 0xfe, 0x22, 0xcb,
+ 0xb0, 0xba, 0x53, 0x76, 0xbe, 0x33, 0xbd, 0x7a, 0xee, 0xc0, 0x57, 0xa1,
+ 0x02, 0x3f, 0xc1, 0x23, 0x0a, 0xab, 0x75, 0x58, 0x85, 0x55, 0x9f, 0xa0,
+ 0x52, 0x23, 0xb4, 0xd7, 0x9c, 0xe7, 0x68, 0xfb, 0x32, 0xb7, 0x78, 0x22,
+ 0xaf, 0x7e, 0x28, 0xd6, 0x7a, 0xe5, 0xec, 0xfc, 0xbc, 0x69, 0x92, 0xe3,
+ 0xa2, 0x79, 0xba, 0x42, 0xa1, 0x26, 0xef, 0xf4, 0xda, 0xf5, 0x6f, 0xc7,
+ 0x9f, 0xe1, 0xd5, 0x5d, 0x97, 0xa9, 0xf9, 0x8f, 0xb3, 0xdb, 0x05, 0x97,
+ 0xfc, 0x7d, 0xbc, 0x0f, 0xb5, 0x13, 0xdc, 0x79, 0x48, 0xe3, 0x3d, 0xa9,
+ 0x84, 0x70, 0xfe, 0x1e, 0xf1, 0xfa, 0x86, 0x9a, 0xb1, 0xc4, 0xcb, 0x96,
+ 0xbd, 0x98, 0x66, 0xdc, 0x58, 0x4e, 0xd9, 0x8d, 0xf5, 0x0a, 0x29, 0xaf,
+ 0xf5, 0xb1, 0x8b, 0x8a, 0x75, 0x8e, 0x2d, 0x67, 0x16, 0x1d, 0x7e, 0x8a,
+ 0x9b, 0x0b, 0xe8, 0xe2, 0xc0, 0xaf, 0xb7, 0x4e, 0x36, 0xd3, 0xf2, 0x89,
+ 0x50, 0xc2, 0x51, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x91, 0x6a, 0x94,
+ 0xca, 0x63, 0xb5, 0xe0, 0x5d, 0x37, 0x7e, 0x99, 0x46, 0x35, 0x71, 0x6b,
+ 0x93, 0xea, 0x15, 0xfd, 0x81, 0x3d, 0x73, 0x76, 0x28, 0x09, 0xad, 0x10,
+ 0x78, 0xc0, 0xb2, 0x65, 0x7f, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70,
+ 0x39, 0x8d, 0x7f, 0xc0, 0xd7, 0xe3, 0x2b, 0x15, 0x87, 0xbb, 0x0a, 0x7f,
+ 0x11, 0x65, 0xd8, 0x5d, 0x29, 0xbb, 0x5f, 0x19, 0xde, 0xbd, 0x77, 0x60,
+ 0x2b, 0xd0, 0x81, 0x1f, 0xe0, 0x91, 0x85, 0x55, 0xba, 0xac, 0x42, 0xaa,
+ 0xcf, 0xd0, 0x29, 0x11, 0xda, 0x6b, 0xce, 0x73, 0xb4, 0x7d, 0x99, 0x5b,
+ 0xbc, 0x11, 0x57, 0xbf, 0x14, 0x6b, 0x3d, 0x72, 0xf6, 0x7e, 0x5e, 0x34,
+ 0xc9, 0x71, 0xd1, 0x3c, 0xdd, 0x21, 0x50, 0x93, 0x77, 0xfa, 0x6d, 0x7a,
+ 0xb7, 0xe3, 0xcf, 0xf0, 0xea, 0xae, 0xe7, 0x1d, 0xfb, 0x2a, 0x2f, 0x0e,
+ 0xe3, 0xde, 0xf4, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+ 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+ 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+ 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+ 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+ 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+ 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+ 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+ 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+ 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+ 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+ 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+ 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+ 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+ 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+ 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+ 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+ 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+ 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+ 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+ 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+ 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+ 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+ 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+ 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+ 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+ 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+ 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+ 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+ 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+ 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+ 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+ 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+ 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+ 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+ 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+ 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+ 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+ 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+ 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+ 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+ 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+ 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+ 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+ 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+ 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+ 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+ 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+ 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+ 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+ 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+ 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+ 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+ 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+ 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+ 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+ 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+ 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+ 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+ 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+ 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+ 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+ 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+ 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+ 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+ 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+ 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+ 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+ 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+ 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+ 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+ 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+ 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+ 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f,
+ 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90,
+ 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d,
+ 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13,
+ 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49,
+ 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc,
+ 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5,
+ 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a,
+ 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04,
+ 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd,
+ 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba,
+ 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b,
+ 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88,
+ 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7,
+ 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17,
+ 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68,
+ 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf,
+ 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73,
+ 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32,
+ 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61,
+ 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc,
+ 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a,
+ 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a,
+ 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1,
+ 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7,
+ 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b,
+ 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6,
+ 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff,
+ 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18,
+ 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8,
+ 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa,
+ 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa,
+ 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62,
+ 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d,
+ 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6,
+ 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97,
+ 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41,
+ 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb,
+ 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb,
+ 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9,
+ 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda,
+ 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f,
+ 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c,
+ 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84,
+ 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84,
+ 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6,
+ 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74,
+ 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92,
+ 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc,
+ 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51,
+ 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2,
+ 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2,
+ 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a,
+ 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb,
+ 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3,
+ 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c,
+ 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23,
+ 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01,
+ 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe,
+ 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c,
+ 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e,
+ 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c,
+ 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d,
+ 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86,
+ 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c,
+ 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f,
+ 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1,
+ 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc,
+ 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06,
+ 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3,
+ 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d,
+ 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0,
+ 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32,
+ 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6,
+ 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31,
+ 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64,
+ 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a,
+ 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c,
+ 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea,
+ 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41,
+ 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d,
+ 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5,
+ 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca,
+ 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60,
+ 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01,
+ 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b,
+ 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98,
+ 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66,
+ 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a,
+ 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf,
+ 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07,
+ 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e,
+ 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47,
+ 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f,
+ 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42,
+ 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a,
+ 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21,
+ 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41,
+ 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2,
+ 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21,
+ 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2,
+ 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9,
+ 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7,
+ 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4,
+ 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7,
+ 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0,
+ 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7,
+ 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7,
+ 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b,
+ 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71,
+ 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e,
+ 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82,
+ 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d,
+ 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99,
+ 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e,
+ 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6,
+ 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c,
+ 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7,
+ 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23,
+ 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef,
+ 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6,
+ 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8,
+ 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47,
+ 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc,
+ 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf,
+ 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43,
+ 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79,
+ 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f,
+ 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95,
+ 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c,
+ 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7,
+ 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6,
+ 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2,
+ 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88,
+ 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+ 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+ 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+ 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+ 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+ 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+ 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+ 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+ 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+ 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+ 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+ 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+ 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+ 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+ 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+ 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+ 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+ 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+ 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+ 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+ 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+ 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+ 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+ 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+ 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+ 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+ 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+ 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+ 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+ 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+ 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+ 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+ 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+ 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+ 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+ 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+ 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+ 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+ 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+ 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+ 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+ 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+ 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+ 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+ 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+ 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+ 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+ 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+ 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+ 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+ 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+ 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+ 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+ 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+ 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+ 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+ 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+ 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+ 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+ 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+ 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+ 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+ 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+ 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+ 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+ 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+ 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+ 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+ 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+ 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+ 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+ 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+ 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+ 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f,
+ 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90,
+ 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d,
+ 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13,
+ 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49,
+ 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc,
+ 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5,
+ 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a,
+ 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04,
+ 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd,
+ 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba,
+ 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b,
+ 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88,
+ 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7,
+ 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17,
+ 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68,
+ 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf,
+ 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73,
+ 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32,
+ 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61,
+ 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc,
+ 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a,
+ 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a,
+ 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1,
+ 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7,
+ 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b,
+ 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6,
+ 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff,
+ 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18,
+ 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8,
+ 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa,
+ 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa,
+ 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62,
+ 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d,
+ 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6,
+ 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97,
+ 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41,
+ 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb,
+ 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb,
+ 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9,
+ 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda,
+ 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f,
+ 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c,
+ 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84,
+ 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84,
+ 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6,
+ 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74,
+ 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92,
+ 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc,
+ 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51,
+ 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2,
+ 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2,
+ 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a,
+ 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb,
+ 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3,
+ 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c,
+ 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23,
+ 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01,
+ 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe,
+ 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c,
+ 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e,
+ 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c,
+ 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d,
+ 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86,
+ 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c,
+ 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f,
+ 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1,
+ 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc,
+ 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06,
+ 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3,
+ 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d,
+ 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0,
+ 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32,
+ 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6,
+ 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31,
+ 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64,
+ 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a,
+ 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c,
+ 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea,
+ 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41,
+ 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d,
+ 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5,
+ 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca,
+ 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60,
+ 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01,
+ 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b,
+ 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98,
+ 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66,
+ 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a,
+ 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf,
+ 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07,
+ 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e,
+ 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47,
+ 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f,
+ 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42,
+ 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a,
+ 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21,
+ 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41,
+ 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2,
+ 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21,
+ 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2,
+ 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9,
+ 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7,
+ 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4,
+ 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7,
+ 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0,
+ 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7,
+ 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7,
+ 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b,
+ 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71,
+ 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e,
+ 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82,
+ 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d,
+ 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99,
+ 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e,
+ 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6,
+ 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c,
+ 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7,
+ 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23,
+ 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef,
+ 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6,
+ 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8,
+ 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47,
+ 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc,
+ 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf,
+ 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43,
+ 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79,
+ 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f,
+ 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95,
+ 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c,
+ 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7,
+ 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6,
+ 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2,
+ 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88,
+ 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+ 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+ 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+ 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+ 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+ 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+ 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+ 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+ 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+ 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+ 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+ 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+ 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+ 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+ 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+ 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+ 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+ 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+ 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+ 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+ 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+ 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+ 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+ 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+ 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+ 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+ 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+ 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+ 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+ 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+ 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+ 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+ 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+ 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+ 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+ 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+ 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+ 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+ 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+ 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+ 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+ 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+ 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+ 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+ 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+ 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+ 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+ 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+ 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+ 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+ 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+ 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+ 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+ 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+ 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+ 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+ 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+ 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+ 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+ 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+ 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+ 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+ 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+ 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+ 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+ 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+ 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+ 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+ 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+ 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+ 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+ 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+ 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+ 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f,
+ 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90,
+ 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d,
+ 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13,
+ 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49,
+ 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc,
+ 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5,
+ 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a,
+ 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04,
+ 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd,
+ 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba,
+ 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b,
+ 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88,
+ 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7,
+ 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17,
+ 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68,
+ 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf,
+ 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73,
+ 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32,
+ 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61,
+ 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc,
+ 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a,
+ 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a,
+ 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1,
+ 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7,
+ 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b,
+ 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6,
+ 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff,
+ 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18,
+ 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8,
+ 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa,
+ 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa,
+ 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62,
+ 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d,
+ 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6,
+ 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97,
+ 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41,
+ 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb,
+ 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb,
+ 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9,
+ 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda,
+ 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f,
+ 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c,
+ 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84,
+ 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84,
+ 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6,
+ 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74,
+ 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92,
+ 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc,
+ 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51,
+ 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2,
+ 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2,
+ 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a,
+ 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb,
+ 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3,
+ 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c,
+ 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23,
+ 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01,
+ 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe,
+ 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c,
+ 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e,
+ 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c,
+ 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d,
+ 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86,
+ 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c,
+ 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f,
+ 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1,
+ 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc,
+ 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06,
+ 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3,
+ 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d,
+ 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0,
+ 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32,
+ 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6,
+ 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31,
+ 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64,
+ 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a,
+ 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c,
+ 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea,
+ 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41,
+ 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d,
+ 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5,
+ 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca,
+ 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60,
+ 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01,
+ 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b,
+ 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98,
+ 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66,
+ 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a,
+ 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf,
+ 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07,
+ 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e,
+ 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47,
+ 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f,
+ 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42,
+ 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a,
+ 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21,
+ 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41,
+ 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2,
+ 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21,
+ 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2,
+ 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9,
+ 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7,
+ 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4,
+ 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7,
+ 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0,
+ 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7,
+ 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7,
+ 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b,
+ 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71,
+ 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e,
+ 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82,
+ 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d,
+ 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99,
+ 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e,
+ 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6,
+ 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c,
+ 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7,
+ 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23,
+ 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef,
+ 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6,
+ 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8,
+ 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47,
+ 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc,
+ 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf,
+ 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43,
+ 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79,
+ 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f,
+ 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95,
+ 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c,
+ 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7,
+ 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6,
+ 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2,
+ 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88,
+ 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+ 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+ 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+ 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+ 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+ 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+ 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+ 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+ 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+ 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+ 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+ 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+ 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+ 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+ 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+ 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+ 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+ 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+ 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+ 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+ 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+ 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+ 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+ 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+ 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+ 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+ 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+ 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+ 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+ 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+ 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+ 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+ 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+ 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+ 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+ 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+ 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+ 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+ 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+ 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+ 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+ 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+ 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+ 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+ 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+ 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+ 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+ 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+ 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+ 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+ 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+ 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+ 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+ 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+ 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+ 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+ 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+ 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+ 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+ 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+ 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+ 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+ 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+ 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+ 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+ 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+ 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+ 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+ 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+ 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+ 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+ 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+ 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+ 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f,
+ 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90,
+ 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d,
+ 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13,
+ 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49,
+ 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc,
+ 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5,
+ 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a,
+ 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04,
+ 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd,
+ 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba,
+ 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b,
+ 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88,
+ 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7,
+ 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17,
+ 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68,
+ 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf,
+ 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73,
+ 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32,
+ 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61,
+ 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc,
+ 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a,
+ 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a,
+ 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1,
+ 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7,
+ 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b,
+ 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6,
+ 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff,
+ 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18,
+ 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8,
+ 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa,
+ 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa,
+ 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62,
+ 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d,
+ 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6,
+ 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97,
+ 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41,
+ 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb,
+ 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb,
+ 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9,
+ 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda,
+ 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f,
+ 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c,
+ 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84,
+ 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84,
+ 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6,
+ 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74,
+ 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92,
+ 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc,
+ 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51,
+ 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2,
+ 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2,
+ 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a,
+ 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb,
+ 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3,
+ 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c,
+ 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23,
+ 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01,
+ 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe,
+ 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c,
+ 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e,
+ 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c,
+ 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d,
+ 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86,
+ 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c,
+ 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f,
+ 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1,
+ 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc,
+ 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06,
+ 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3,
+ 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d,
+ 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0,
+ 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32,
+ 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6,
+ 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31,
+ 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64,
+ 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a,
+ 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c,
+ 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea,
+ 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41,
+ 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d,
+ 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5,
+ 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca,
+ 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60,
+ 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01,
+ 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b,
+ 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98,
+ 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66,
+ 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a,
+ 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf,
+ 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07,
+ 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e,
+ 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47,
+ 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f,
+ 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42,
+ 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a,
+ 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21,
+ 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41,
+ 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2,
+ 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21,
+ 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2,
+ 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9,
+ 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7,
+ 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4,
+ 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7,
+ 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0,
+ 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7,
+ 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7,
+ 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b,
+ 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71,
+ 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e,
+ 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82,
+ 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d,
+ 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99,
+ 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e,
+ 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6,
+ 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c,
+ 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7,
+ 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23,
+ 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef,
+ 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6,
+ 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8,
+ 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47,
+ 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc,
+ 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf,
+ 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43,
+ 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79,
+ 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f,
+ 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95,
+ 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c,
+ 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7,
+ 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6,
+ 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2,
+ 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88,
+ 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+ 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+ 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+ 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+ 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+ 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+ 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+ 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+ 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+ 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+ 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+ 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+ 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+ 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+ 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+ 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+ 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+ 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+ 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+ 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+ 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+ 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+ 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+ 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+ 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+ 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+ 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+ 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+ 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+ 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+ 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+ 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+ 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+ 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+ 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+ 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+ 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+ 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+ 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+ 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+ 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+ 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+ 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+ 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+ 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+ 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+ 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+ 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+ 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+ 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+ 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+ 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+ 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+ 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+ 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+ 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+ 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+ 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+ 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+ 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+ 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+ 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+ 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+ 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+ 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+ 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+ 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+ 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+ 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+ 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+ 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+ 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+ 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+ 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f,
+ 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90,
+ 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d,
+ 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13,
+ 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49,
+ 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc,
+ 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5,
+ 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a,
+ 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04,
+ 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd,
+ 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba,
+ 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b,
+ 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88,
+ 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7,
+ 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17,
+ 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68,
+ 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf,
+ 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73,
+ 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32,
+ 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61,
+ 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc,
+ 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a,
+ 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a,
+ 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1,
+ 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7,
+ 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b,
+ 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6,
+ 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff,
+ 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18,
+ 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8,
+ 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa,
+ 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa,
+ 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62,
+ 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d,
+ 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6,
+ 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97,
+ 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41,
+ 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb,
+ 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb,
+ 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9,
+ 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda,
+ 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f,
+ 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c,
+ 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84,
+ 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84,
+ 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6,
+ 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74,
+ 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92,
+ 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc,
+ 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51,
+ 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2,
+ 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2,
+ 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a,
+ 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb,
+ 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3,
+ 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c,
+ 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23,
+ 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01,
+ 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe,
+ 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c,
+ 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e,
+ 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c,
+ 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d,
+ 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86,
+ 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c,
+ 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f,
+ 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1,
+ 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc,
+ 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06,
+ 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3,
+ 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d,
+ 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0,
+ 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32,
+ 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6,
+ 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31,
+ 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64,
+ 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a,
+ 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c,
+ 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea,
+ 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41,
+ 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d,
+ 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5,
+ 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca,
+ 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60,
+ 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01,
+ 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b,
+ 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98,
+ 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66,
+ 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a,
+ 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf,
+ 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07,
+ 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e,
+ 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47,
+ 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f,
+ 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42,
+ 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a,
+ 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21,
+ 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41,
+ 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2,
+ 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21,
+ 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2,
+ 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9,
+ 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7,
+ 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4,
+ 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7,
+ 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0,
+ 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7,
+ 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7,
+ 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b,
+ 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71,
+ 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e,
+ 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82,
+ 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d,
+ 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99,
+ 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e,
+ 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6,
+ 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c,
+ 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7,
+ 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23,
+ 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef,
+ 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6,
+ 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8,
+ 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47,
+ 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc,
+ 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf,
+ 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43,
+ 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79,
+ 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f,
+ 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95,
+ 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c,
+ 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7,
+ 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6,
+ 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2,
+ 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88,
+ 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+ 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+ 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+ 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+ 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+ 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+ 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+ 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+ 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+ 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+ 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+ 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+ 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+ 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+ 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+ 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+ 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+ 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+ 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+ 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+ 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+ 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+ 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+ 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+ 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+ 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+ 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+ 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+ 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+ 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+ 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+ 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+ 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+ 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+ 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+ 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+ 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+ 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+ 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+ 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+ 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+ 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+ 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+ 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+ 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+ 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+ 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+ 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+ 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+ 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+ 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+ 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+ 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+ 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+ 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+ 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+ 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+ 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+ 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+ 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+ 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+ 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+ 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+ 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+ 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+ 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+ 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+ 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+ 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+ 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+ 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+ 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+ 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+ 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f,
+ 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90,
+ 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d,
+ 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13,
+ 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49,
+ 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc,
+ 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5,
+ 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a,
+ 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04,
+ 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd,
+ 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba,
+ 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b,
+ 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88,
+ 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7,
+ 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17,
+ 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68,
+ 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf,
+ 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73,
+ 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32,
+ 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61,
+ 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc,
+ 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a,
+ 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a,
+ 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1,
+ 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7,
+ 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b,
+ 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6,
+ 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff,
+ 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18,
+ 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8,
+ 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa,
+ 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa,
+ 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62,
+ 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d,
+ 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6,
+ 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97,
+ 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41,
+ 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb,
+ 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb,
+ 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9,
+ 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda,
+ 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f,
+ 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c,
+ 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84,
+ 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84,
+ 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6,
+ 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74,
+ 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92,
+ 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc,
+ 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51,
+ 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2,
+ 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2,
+ 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a,
+ 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb,
+ 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3,
+ 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c,
+ 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23,
+ 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01,
+ 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe,
+ 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c,
+ 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e,
+ 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c,
+ 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d,
+ 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86,
+ 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c,
+ 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f,
+ 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1,
+ 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc,
+ 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06,
+ 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3,
+ 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d,
+ 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0,
+ 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32,
+ 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6,
+ 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31,
+ 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64,
+ 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a,
+ 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c,
+ 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea,
+ 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41,
+ 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d,
+ 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5,
+ 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca,
+ 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60,
+ 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01,
+ 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b,
+ 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98,
+ 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66,
+ 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a,
+ 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf,
+ 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07,
+ 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e,
+ 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47,
+ 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f,
+ 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42,
+ 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a,
+ 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21,
+ 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41,
+ 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2,
+ 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21,
+ 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2,
+ 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9,
+ 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7,
+ 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4,
+ 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7,
+ 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0,
+ 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7,
+ 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7,
+ 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b,
+ 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71,
+ 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e,
+ 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82,
+ 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d,
+ 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99,
+ 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e,
+ 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6,
+ 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c,
+ 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7,
+ 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23,
+ 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef,
+ 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6,
+ 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8,
+ 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47,
+ 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc,
+ 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf,
+ 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43,
+ 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79,
+ 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f,
+ 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95,
+ 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c,
+ 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7,
+ 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6,
+ 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2,
+ 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88,
+ 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+ 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+ 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+ 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+ 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+ 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+ 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+ 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+ 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+ 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+ 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+ 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+ 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+ 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+ 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+ 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+ 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+ 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+ 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+ 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+ 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+ 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+ 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+ 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+ 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+ 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+ 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+ 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+ 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+ 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+ 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+ 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+ 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+ 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+ 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+ 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+ 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+ 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+ 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+ 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+ 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+ 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+ 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+ 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+ 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+ 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+ 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+ 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+ 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+ 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+ 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+ 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+ 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+ 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+ 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+ 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+ 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+ 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+ 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+ 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+ 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+ 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+ 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+ 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+ 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+ 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+ 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+ 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+ 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+ 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+ 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+ 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+ 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+ 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f,
+ 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90,
+ 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d,
+ 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13,
+ 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49,
+ 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc,
+ 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5,
+ 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a,
+ 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04,
+ 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd,
+ 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba,
+ 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b,
+ 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88,
+ 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7,
+ 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17,
+ 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68,
+ 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf,
+ 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73,
+ 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32,
+ 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61,
+ 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc,
+ 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a,
+ 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a,
+ 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1,
+ 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7,
+ 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b,
+ 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6,
+ 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff,
+ 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18,
+ 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8,
+ 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa,
+ 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa,
+ 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62,
+ 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d,
+ 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6,
+ 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97,
+ 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41,
+ 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb,
+ 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb,
+ 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9,
+ 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda,
+ 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f,
+ 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c,
+ 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84,
+ 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84,
+ 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6,
+ 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74,
+ 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92,
+ 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc,
+ 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51,
+ 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2,
+ 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2,
+ 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a,
+ 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb,
+ 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3,
+ 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c,
+ 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23,
+ 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01,
+ 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe,
+ 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c,
+ 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e,
+ 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c,
+ 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d,
+ 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86,
+ 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c,
+ 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f,
+ 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1,
+ 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc,
+ 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06,
+ 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3,
+ 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d,
+ 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0,
+ 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32,
+ 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6,
+ 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31,
+ 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64,
+ 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a,
+ 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c,
+ 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea,
+ 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41,
+ 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d,
+ 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5,
+ 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca,
+ 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60,
+ 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01,
+ 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b,
+ 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98,
+ 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66,
+ 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a,
+ 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf,
+ 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07,
+ 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e,
+ 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47,
+ 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f,
+ 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42,
+ 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a,
+ 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21,
+ 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41,
+ 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2,
+ 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21,
+ 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2,
+ 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9,
+ 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7,
+ 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4,
+ 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7,
+ 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0,
+ 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7,
+ 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7,
+ 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b,
+ 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71,
+ 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e,
+ 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82,
+ 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d,
+ 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99,
+ 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e,
+ 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6,
+ 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c,
+ 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7,
+ 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23,
+ 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef,
+ 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6,
+ 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8,
+ 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47,
+ 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc,
+ 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf,
+ 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43,
+ 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79,
+ 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f,
+ 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95,
+ 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c,
+ 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7,
+ 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6,
+ 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2,
+ 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88,
+ 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d,
+ 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9,
+ 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37,
+ 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb,
+ 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3,
+ 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43,
+ 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0,
+ 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50,
+ 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14,
+ 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee,
+ 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2,
+ 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf,
+ 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a,
+ 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58,
+ 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c,
+ 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9,
+ 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7,
+ 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa,
+ 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7,
+ 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24,
+ 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20,
+ 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37,
+ 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5,
+ 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93,
+ 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5,
+ 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f,
+ 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10,
+ 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13,
+ 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51,
+ 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc,
+ 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99,
+ 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0,
+ 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e,
+ 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f,
+ 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6,
+ 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66,
+ 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4,
+ 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66,
+ 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec,
+ 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31,
+ 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1,
+ 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d,
+ 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08,
+ 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65,
+ 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36,
+ 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99,
+ 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec,
+ 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80,
+ 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93,
+ 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33,
+ 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c,
+ 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21,
+ 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55,
+ 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60,
+ 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51,
+ 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08,
+ 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69,
+ 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8,
+ 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53,
+ 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04,
+ 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08,
+ 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda,
+ 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4,
+ 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34,
+ 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55,
+ 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e,
+ 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a,
+ 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74,
+ 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38,
+ 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a,
+ 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16,
+ 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1,
+ 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e,
+ 0x23, 0xb3,
+};
+static_assert(sizeof(kBytesTestReadSymbol7) == kNumBytesTestReadSymbol7, "");
+
+// The kBytesTestReadSymbol8[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][9] = {
+// // pdf: 1/8, 1/8, 1/8, 1/8, 1/8, 1/8, 1/8, 1/8
+// { 32768 - 4096, 32768 - 8192, 32768 - 12288, 32768 - 16384,
+// 32768 - 20480, 32768 - 24576, 32768 - 28672, 0, 0 },
+// // pdf: 3/16, 2/16, 2/16, 2/16, 2/16, 2/16, 2/16, 1/16
+// { 32768 - 6144, 32768 - 10240, 32768 - 14336, 32768 - 18432,
+// 32768 - 22528, 32768 - 26624, 32768 - 30720, 0, 0 },
+// // pdf: 1/16, 1/16, 2/16, 2/16, 2/16, 2/16, 3/16, 3/16
+// { 32768 - 2048, 32768 - 4096, 32768 - 8192, 32768 - 12288,
+// 32768 - 16384, 32768 - 20480, 32768 - 26624, 0, 0 },
+// // pdf: 1/16, 1/16, 3/16, 3/16, 3/16, 3/16, 1/16, 1/16
+// { 32768 - 2048, 32768 - 4096, 32768 - 10240, 32768 - 16384,
+// 32768 - 22528, 32768 - 28672, 32768 - 30720, 0, 0 },
+// };
+// constexpr int kSymbols[16][4] = { { 0, 4, 7, 3 }, //
+// { 1, 5, 6, 2 }, //
+// { 2, 6, 5, 1 }, //
+// { 3, 7, 4, 0 }, //
+// { 4, 0, 3, 7 }, //
+// { 5, 1, 2, 6 }, //
+// { 6, 2, 1, 5 }, //
+// { 7, 3, 0, 4 }, //
+// { 0, 0, 6, 5 }, //
+// { 2, 1, 4, 3 }, //
+// { 4, 3, 6, 4 }, //
+// { 6, 5, 2, 2 }, //
+// { 1, 0, 7, 3 }, //
+// { 3, 2, 5, 5 }, //
+// { 5, 4, 7, 2 }, //
+// { 7, 6, 3, 4 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 1024; ++i) {
+// for (int j = 0; j < 16; ++j) {
+// for (int k = 0; k < 4; ++k) {
+// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 8);
+// }
+// }
+// }
+// aom_stop_encode(&bw);
+// printf(" constexpr size_t kNumBytesTestReadSymbol8 = %u;\n", bw.pos);
+// printf(" constexpr uint8_t kBytesTestReadSymbol8[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+// if (count++ % 12 == 0) {
+// printf("\n ");
+// } else {
+// printf(" ");
+// }
+// printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n };\n");
+
+constexpr size_t kNumBytesTestReadSymbol8 = 24195;
+constexpr uint8_t kBytesTestReadSymbol8[] = {
+ 0x15, 0x60, 0xa8, 0x52, 0xf4, 0x88, 0xdd, 0x23, 0x40, 0xb1, 0xd6, 0xd2,
+ 0xc2, 0xa2, 0x4c, 0x0a, 0x5d, 0xba, 0xfe, 0xd2, 0x36, 0xd9, 0xcd, 0x51,
+ 0x10, 0x25, 0x13, 0x29, 0xfa, 0x0d, 0x87, 0xf9, 0xd1, 0x6f, 0xf2, 0x0d,
+ 0x3a, 0xbe, 0xd9, 0x83, 0x99, 0xd1, 0xdf, 0x24, 0x70, 0x28, 0xdb, 0x63,
+ 0xf6, 0x7c, 0x07, 0x2b, 0x68, 0xa3, 0x7a, 0x85, 0xd1, 0x47, 0xba, 0x59,
+ 0x18, 0x7e, 0x64, 0x3b, 0xac, 0xaf, 0xe3, 0x3a, 0x99, 0x82, 0x30, 0x92,
+ 0x7a, 0x93, 0x67, 0x9f, 0xac, 0x53, 0xf8, 0xdb, 0x03, 0x71, 0xc7, 0x4a,
+ 0xa9, 0xec, 0x10, 0xc9, 0xed, 0x5b, 0xa6, 0xd5, 0xc3, 0xdd, 0x81, 0x8d,
+ 0x25, 0xbe, 0x57, 0xcd, 0x01, 0x65, 0x33, 0x6c, 0x12, 0xe1, 0x37, 0x8b,
+ 0xf1, 0x08, 0x27, 0x3c, 0x5a, 0x30, 0x9f, 0x2d, 0x41, 0x2e, 0x75, 0x49,
+ 0xab, 0xa6, 0xb6, 0x4c, 0xbe, 0xe0, 0xd0, 0x20, 0x74, 0xeb, 0x05, 0x79,
+ 0x91, 0x60, 0xfd, 0xb2, 0x39, 0x54, 0xd9, 0x0c, 0x11, 0x04, 0x1f, 0x7b,
+ 0x5d, 0x2d, 0xe3, 0x3f, 0x48, 0xe4, 0x56, 0x11, 0x3d, 0x48, 0xdb, 0x5c,
+ 0x1c, 0x8b, 0x81, 0xbb, 0x8a, 0x53, 0xb7, 0x48, 0x5b, 0x15, 0x9b, 0x35,
+ 0xc1, 0x18, 0x0f, 0xc3, 0x1e, 0x1c, 0x16, 0x7e, 0x0a, 0xbf, 0x16, 0x0a,
+ 0xf5, 0x3f, 0xbe, 0x19, 0xc0, 0x0f, 0xa4, 0x59, 0xae, 0x0a, 0xcf, 0xf4,
+ 0x00, 0xb2, 0xff, 0x3a, 0xd8, 0x7f, 0x6c, 0xcf, 0x4f, 0xca, 0xa1, 0x40,
+ 0x47, 0x8e, 0xd0, 0x44, 0x49, 0x5a, 0x48, 0xe6, 0x86, 0x80, 0xbb, 0x57,
+ 0x36, 0x6e, 0x80, 0xf1, 0xd1, 0xd8, 0xb8, 0xad, 0xb7, 0x6b, 0x11, 0x79,
+ 0x02, 0x95, 0x20, 0xcf, 0x6f, 0x21, 0xe6, 0x5c, 0x65, 0x69, 0x4a, 0xf2,
+ 0x6f, 0x87, 0x68, 0xf1, 0xda, 0x3b, 0xe1, 0x64, 0x5c, 0xfc, 0x21, 0x02,
+ 0x7b, 0xf6, 0x39, 0x77, 0x36, 0x29, 0x3d, 0xda, 0x16, 0x2e, 0xdb, 0x55,
+ 0xac, 0x5a, 0x3a, 0x94, 0x9c, 0x79, 0x2c, 0x92, 0xa4, 0xe3, 0xe2, 0x87,
+ 0xd8, 0x14, 0x21, 0x76, 0xae, 0xf1, 0x8d, 0x7d, 0xdc, 0xde, 0x46, 0xd9,
+ 0xbd, 0xb6, 0x5f, 0xae, 0x77, 0xd0, 0xd7, 0x01, 0xed, 0xbe, 0x5f, 0xee,
+ 0x1a, 0x20, 0x0f, 0x88, 0x5c, 0x8a, 0x44, 0xad, 0x8f, 0x8f, 0x66, 0x9d,
+ 0x43, 0xf4, 0x41, 0x0a, 0xa1, 0xc8, 0x5c, 0xbc, 0x37, 0xe2, 0xca, 0xd2,
+ 0xd8, 0x27, 0x54, 0xdb, 0xdf, 0x7f, 0x0a, 0xd7, 0x65, 0x19, 0x99, 0x1a,
+ 0x92, 0x53, 0xdd, 0x1e, 0x5f, 0xad, 0x24, 0x8a, 0x8d, 0x76, 0xc4, 0xf7,
+ 0x7e, 0x74, 0xfe, 0x68, 0x99, 0x42, 0xfa, 0xaa, 0x6e, 0xdd, 0x91, 0xd4,
+ 0x71, 0x10, 0xb7, 0x45, 0xa8, 0x5f, 0x84, 0x0d, 0xeb, 0x38, 0x3e, 0xaa,
+ 0xf1, 0xad, 0x86, 0x8f, 0x1a, 0x3e, 0x9a, 0x29, 0xc7, 0x7b, 0xa7, 0xdf,
+ 0x51, 0x3d, 0x49, 0x08, 0x09, 0x69, 0x40, 0x9d, 0x45, 0xb8, 0x55, 0xce,
+ 0x96, 0x6c, 0x8b, 0xc6, 0xc9, 0x25, 0x70, 0xc9, 0xb3, 0xa8, 0xa8, 0x08,
+ 0x33, 0x7b, 0xca, 0x21, 0x9e, 0x5b, 0xb5, 0x02, 0x7f, 0xa3, 0x34, 0x7c,
+ 0x3d, 0xba, 0x91, 0x2e, 0xae, 0xc3, 0x1f, 0x9e, 0xc2, 0x4f, 0xdf, 0xa9,
+ 0x39, 0x9b, 0x9d, 0x6e, 0xc7, 0x90, 0xeb, 0x2b, 0xb0, 0x3f, 0xde, 0x37,
+ 0xb7, 0x94, 0x3d, 0x4b, 0x2c, 0x42, 0x3f, 0x47, 0xad, 0xc9, 0x23, 0xcb,
+ 0x4d, 0xc4, 0xdd, 0x5e, 0x67, 0x11, 0x9d, 0x45, 0xb8, 0x55, 0xce, 0x98,
+ 0x05, 0xce, 0x97, 0x99, 0x57, 0x84, 0x8d, 0x79, 0x97, 0x81, 0x4b, 0x8a,
+ 0x9c, 0x76, 0x73, 0x9a, 0xf7, 0x59, 0x54, 0x07, 0x6c, 0x11, 0x41, 0x44,
+ 0xf0, 0xa6, 0x2a, 0x5e, 0xb1, 0x48, 0x47, 0x39, 0xbb, 0x1b, 0xf0, 0x25,
+ 0x07, 0xe7, 0xd2, 0xbb, 0x9b, 0x9b, 0xd7, 0x7e, 0xc8, 0xdd, 0xae, 0xb6,
+ 0x23, 0x5e, 0xe0, 0xa5, 0xb0, 0xc6, 0xb6, 0x81, 0xe9, 0x51, 0x20, 0xe9,
+ 0x2f, 0x89, 0xcd, 0x13, 0x96, 0x21, 0x19, 0xc5, 0xd1, 0x65, 0x65, 0x88,
+ 0xd9, 0x7b, 0x87, 0xdc, 0xfb, 0x38, 0x54, 0x22, 0x27, 0xc4, 0xc4, 0x16,
+ 0x56, 0xff, 0x76, 0x69, 0xa6, 0x3b, 0xa0, 0x6d, 0xab, 0xb8, 0xdf, 0xc1,
+ 0xc2, 0xff, 0x65, 0x8f, 0x85, 0xbc, 0x69, 0xc0, 0xa5, 0x9a, 0xef, 0xf1,
+ 0x37, 0x57, 0x99, 0xc4, 0x67, 0x51, 0x6e, 0xdf, 0x30, 0xa4, 0x86, 0x47,
+ 0x34, 0x5f, 0x5e, 0x3c, 0xde, 0x6e, 0x96, 0x74, 0x5c, 0xbd, 0xca, 0xa3,
+ 0x50, 0xe4, 0xe8, 0x63, 0xdf, 0xb0, 0xf1, 0xbe, 0xa2, 0x58, 0x23, 0x7a,
+ 0x4a, 0x29, 0x62, 0x1f, 0x03, 0xf1, 0xe9, 0x19, 0xdd, 0x68, 0xe8, 0x1a,
+ 0x7a, 0x9b, 0x40, 0x0d, 0xb0, 0x15, 0x8b, 0x14, 0x63, 0x08, 0xa4, 0x21,
+ 0xa6, 0x0b, 0x34, 0x8a, 0x3e, 0x76, 0x7a, 0xa8, 0x11, 0x81, 0x16, 0x12,
+ 0xa5, 0xc6, 0x7a, 0xf1, 0xa0, 0x20, 0xff, 0x33, 0x3b, 0xa5, 0x43, 0xc7,
+ 0x42, 0xd3, 0x22, 0x90, 0x16, 0xa2, 0x28, 0x18, 0xa4, 0xc7, 0x24, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22,
+ 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93,
+ 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15,
+ 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b,
+ 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab,
+ 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf,
+ 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58,
+ 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd,
+ 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3,
+ 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8,
+ 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f,
+ 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41,
+ 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe,
+ 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b,
+ 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3,
+ 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59,
+ 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99,
+ 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8,
+ 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc,
+ 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42,
+ 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60,
+ 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15,
+ 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00,
+ 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae,
+ 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04,
+ 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73,
+ 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24,
+ 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+ 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+ 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+ 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+ 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+ 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+ 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+ 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+ 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+ 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+ 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+ 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+ 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+ 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+ 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+ 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+ 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+ 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+ 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+ 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+ 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+ 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+ 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+ 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+ 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+ 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+ 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+ 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+ 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+ 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+ 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+ 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+ 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+ 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+ 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+ 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+ 0x6f, 0xd0, 0xc0,
+};
+static_assert(sizeof(kBytesTestReadSymbol8) == kNumBytesTestReadSymbol8, "");
+
+// The kBytesTestReadSymbol9[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][10] = {
+// // pmf: 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9
+// { 32768 - 3641, 32768 - 7282, 32768 - 10923, 32768 - 14564, 32768 - 18204,
+// 32768 - 21845, 32768 - 25486, 32768 - 29127, 0, 0 },
+// // pmf: 3/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 1/18
+// { 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 16384, 32768 - 20025,
+// 32768 - 23666, 32768 - 27307, 32768 - 30948, 0, 0 },
+// // pmf: 1/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 3/18
+// { 32768 - 1820, 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 16384,
+// 32768 - 20025, 32768 - 23666, 32768 - 27307, 0, 0 },
+// // pmf: 1/18, 2/18, 2/18, 2/18, 4/18, 2/18, 2/18, 2/18, 1/18
+// { 32768 - 1820, 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 20025,
+// 32768 - 23666, 32768 - 27307, 32768 - 30948, 0, 0 },
+// };
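+// (Each aom_cdf_prob entry above is stored in AV1's inverse-CDF Q15 form,
+// 32768 minus the cumulative probability, which is why the initializers are
+// written as 32768 - <cumulative>. Each row has nsymbs + 1 = 10 slots: nine
+// inverse-CDF values, of which the last is 0 because the full cumulative is
+// 32768, plus a trailing 0 that libaom uses as the adaptation counter when
+// allow_update_cdf is set.)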
+// constexpr int kSymbols[18][4] = { { 0, 4, 8, 3 }, //
+// { 1, 5, 7, 2 }, //
+// { 2, 6, 6, 1 }, //
+// { 3, 7, 5, 0 }, //
+// { 4, 8, 4, 8 }, //
+// { 5, 0, 3, 7 }, //
+// { 6, 1, 2, 6 }, //
+// { 7, 2, 1, 5 }, //
+// { 8, 3, 0, 4 }, //
+// { 0, 0, 8, 7 }, //
+// { 2, 1, 6, 5 }, //
+// { 4, 3, 4, 3 }, //
+// { 6, 5, 2, 1 }, //
+// { 8, 7, 7, 6 }, //
+// { 1, 0, 5, 4 }, //
+// { 3, 2, 3, 2 }, //
+// { 5, 4, 1, 4 }, //
+// { 7, 6, 8, 4 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 128; ++i) {
+// for (int j = 0; j < 18; ++j) {
+// for (int k = 0; k < 4; ++k) {
+// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 9);
+// }
+// }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+// if (count++ % 12 == 0) {
+// printf("\n ");
+// } else {
+// printf(" ");
+// }
+// printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
+constexpr size_t kNumBytesTestReadSymbol9 = 3650;
+constexpr uint8_t kBytesTestReadSymbol9[] = {
+ 0x10, 0xe6, 0x62, 0x17, 0x4c, 0x5e, 0xe0, 0x8c, 0x41, 0x75, 0x38, 0xda,
+ 0xb6, 0x33, 0xc7, 0x0e, 0x0f, 0x62, 0x87, 0x29, 0xbe, 0x28, 0x8b, 0x81,
+ 0x71, 0xab, 0x0d, 0xfe, 0x61, 0xf9, 0x96, 0x85, 0xfe, 0x78, 0x18, 0xe6,
+ 0x57, 0xa7, 0xf0, 0xd3, 0xd5, 0x62, 0x37, 0x9a, 0x3d, 0xc4, 0xad, 0x75,
+ 0x35, 0xc1, 0xe9, 0x63, 0xeb, 0x9c, 0xd3, 0xf4, 0xdb, 0xc0, 0xf3, 0x67,
+ 0x14, 0xbd, 0xde, 0xf7, 0xd1, 0x51, 0xf1, 0x62, 0x28, 0xd5, 0x39, 0x99,
+ 0x82, 0x5b, 0x9c, 0x3a, 0x37, 0x85, 0xe7, 0x48, 0x28, 0x02, 0x2d, 0xf1,
+ 0x15, 0x55, 0x77, 0x02, 0x2e, 0x62, 0x53, 0xf6, 0x8a, 0x53, 0x44, 0xfa,
+ 0xe0, 0xff, 0x05, 0xae, 0xdc, 0x30, 0xee, 0x36, 0x29, 0x80, 0xd5, 0x0a,
+ 0xa6, 0x5f, 0x53, 0xa2, 0x31, 0xc0, 0x5b, 0x2a, 0xa5, 0xa5, 0xd2, 0xc0,
+ 0x8d, 0x96, 0x66, 0x25, 0x93, 0x9e, 0xdc, 0x0b, 0x2f, 0xea, 0xe2, 0x51,
+ 0x0b, 0x12, 0x87, 0x90, 0x79, 0xe7, 0x8e, 0x6f, 0xc6, 0x99, 0x4b, 0x6a,
+ 0x50, 0x06, 0xf3, 0x3d, 0xf5, 0x25, 0x72, 0xc5, 0x9e, 0xab, 0x7b, 0x5b,
+ 0x15, 0xf5, 0xeb, 0xae, 0x02, 0xe4, 0x90, 0x2b, 0x15, 0x66, 0xf7, 0x50,
+ 0xfa, 0x46, 0x74, 0xae, 0xd4, 0x7f, 0xd4, 0x0b, 0xbf, 0xbc, 0x83, 0x60,
+ 0x6f, 0x25, 0x87, 0xde, 0xce, 0xb3, 0x86, 0x5a, 0x13, 0x00, 0x31, 0xf2,
+ 0x75, 0xca, 0x08, 0x71, 0xd2, 0xf4, 0xa9, 0xf9, 0x40, 0x23, 0xa7, 0x5e,
+ 0x50, 0x63, 0x64, 0x1d, 0xa2, 0x50, 0x2f, 0x01, 0x4c, 0x11, 0x8b, 0xcb,
+ 0x92, 0x40, 0x9d, 0x94, 0x50, 0x0a, 0xf5, 0x3b, 0xfc, 0x32, 0x1a, 0xbd,
+ 0x48, 0x73, 0xe7, 0x93, 0x0f, 0x53, 0xb2, 0x8e, 0xac, 0xef, 0x22, 0x2f,
+ 0x3e, 0xb0, 0x81, 0xc0, 0x06, 0x9b, 0x14, 0x5c, 0xa6, 0x16, 0xca, 0xa5,
+ 0x79, 0xd2, 0x6a, 0xd3, 0xfe, 0x93, 0x33, 0x2f, 0xdb, 0xcb, 0xca, 0xb3,
+ 0x1d, 0xc5, 0x56, 0x65, 0x53, 0x7f, 0xb9, 0x41, 0xe1, 0x54, 0x31, 0xa2,
+ 0x8c, 0x92, 0xc8, 0x04, 0xf7, 0x9d, 0x26, 0xad, 0x35, 0x00, 0x5a, 0xb2,
+ 0x78, 0x43, 0x14, 0xc2, 0xeb, 0x3a, 0x26, 0x4d, 0x49, 0x5d, 0x33, 0xe4,
+ 0xa9, 0xea, 0xd3, 0x67, 0xbf, 0xbc, 0xb6, 0x2e, 0x1c, 0xf7, 0xd0, 0x98,
+ 0x13, 0x0d, 0x7c, 0x94, 0x02, 0x28, 0x3e, 0x8a, 0xe5, 0x0c, 0x75, 0x82,
+ 0xe5, 0x81, 0x98, 0x87, 0x88, 0x97, 0x86, 0xd6, 0x46, 0x2c, 0x9c, 0x85,
+ 0xc2, 0x99, 0xfd, 0x0a, 0x68, 0xbf, 0x67, 0xfc, 0x17, 0xc7, 0x11, 0x54,
+ 0xd1, 0x20, 0x9d, 0x83, 0x52, 0x84, 0x5d, 0x4b, 0x62, 0xbf, 0x16, 0x5d,
+ 0x8e, 0x72, 0x46, 0xde, 0xb1, 0x77, 0xfb, 0x39, 0x98, 0xf0, 0x4d, 0xa6,
+ 0x7a, 0x7d, 0x1c, 0x16, 0xe9, 0x1e, 0x86, 0x7e, 0xf9, 0x22, 0x58, 0x93,
+ 0xea, 0x2e, 0x26, 0xc7, 0xfb, 0xd1, 0xb3, 0xc7, 0x99, 0xb1, 0x91, 0x67,
+ 0xf1, 0xa3, 0xe0, 0xd2, 0xe8, 0x17, 0x17, 0xd7, 0x0b, 0x7a, 0xd4, 0xed,
+ 0x9e, 0x72, 0x4e, 0xa2, 0x37, 0xc9, 0xd2, 0x16, 0x5d, 0x8b, 0xda, 0xdb,
+ 0x5c, 0x46, 0x05, 0x3e, 0xf7, 0xc8, 0x3a, 0xd5, 0xaf, 0xd9, 0x72, 0x82,
+ 0xbf, 0x96, 0xea, 0x09, 0xd3, 0xd5, 0xfe, 0x43, 0x24, 0xae, 0x95, 0x3d,
+ 0x6c, 0x68, 0x54, 0xad, 0xb5, 0xc4, 0x60, 0x54, 0x08, 0x3c, 0x57, 0x61,
+ 0xa1, 0x11, 0x21, 0x7f, 0xca, 0x48, 0x59, 0xb4, 0x1c, 0x39, 0x0d, 0xf2,
+ 0xdc, 0x62, 0xf0, 0xbb, 0x95, 0x39, 0x51, 0xe9, 0xdb, 0xf1, 0x5d, 0xd1,
+ 0x43, 0x83, 0x8a, 0xb1, 0x8d, 0x36, 0x39, 0x83, 0xc6, 0x94, 0x30, 0xbe,
+ 0xb6, 0x2f, 0x39, 0x05, 0xad, 0xcd, 0xf9, 0x4c, 0xc2, 0x34, 0xc7, 0x81,
+ 0x68, 0xb1, 0x20, 0x1d, 0xea, 0xd3, 0x8c, 0xca, 0xff, 0x4d, 0x94, 0xe1,
+ 0x3e, 0xc2, 0x74, 0x90, 0xed, 0x56, 0x3c, 0x1b, 0x5b, 0xf6, 0x40, 0xf9,
+ 0x3b, 0x94, 0x94, 0x23, 0xc6, 0x48, 0x6a, 0x59, 0xef, 0x04, 0xb7, 0x9f,
+ 0x55, 0x9c, 0x6f, 0x81, 0x73, 0xec, 0x27, 0x49, 0x0e, 0xd5, 0x63, 0xc1,
+ 0xb5, 0xbf, 0x64, 0x0f, 0x93, 0xb9, 0x49, 0x42, 0x3c, 0x64, 0x86, 0xa5,
+ 0x9e, 0xf0, 0x4b, 0x79, 0xf5, 0x59, 0xc7, 0xc5, 0x01, 0x6f, 0xbd, 0x6a,
+ 0x66, 0x93, 0x99, 0x47, 0xb6, 0xf7, 0xfa, 0x21, 0x72, 0x81, 0x71, 0x40,
+ 0x36, 0x81, 0xde, 0x5d, 0xdf, 0xdf, 0x30, 0x53, 0x03, 0x70, 0xfb, 0xb2,
+ 0x2d, 0x37, 0xeb, 0x19, 0xbc, 0xd2, 0x90, 0x44, 0x25, 0x42, 0x06, 0x30,
+ 0xc8, 0xcf, 0x4b, 0x0a, 0x01, 0x13, 0x5e, 0x17, 0x91, 0xc7, 0xcb, 0x79,
+ 0xed, 0x06, 0x39, 0xc1, 0x2e, 0x92, 0x29, 0xf5, 0xff, 0x24, 0xe7, 0x2b,
+ 0x3f, 0x19, 0x35, 0x6b, 0x3d, 0x69, 0xa2, 0x19, 0x20, 0x53, 0xd4, 0xca,
+ 0x08, 0x35, 0x6e, 0xe0, 0x5a, 0x9a, 0x9d, 0x48, 0xf5, 0x20, 0x24, 0x20,
+ 0x33, 0x94, 0x6b, 0x33, 0xdd, 0x78, 0xbf, 0x62, 0xf1, 0x43, 0x08, 0x97,
+ 0x53, 0x98, 0xe4, 0x17, 0x27, 0xfc, 0xe8, 0xf1, 0xb8, 0x4c, 0xb3, 0x79,
+ 0xc8, 0x05, 0x21, 0x1b, 0xe8, 0x56, 0xd2, 0x5f, 0xb6, 0x90, 0x14, 0x0c,
+ 0x96, 0x38, 0xc6, 0xc3, 0x6d, 0x10, 0xbf, 0xc6, 0x28, 0xfe, 0x1f, 0x13,
+ 0x81, 0x04, 0xeb, 0x37, 0x9c, 0x80, 0x52, 0x47, 0x0f, 0xa0, 0x6e, 0xcd,
+ 0x9c, 0x44, 0xdd, 0x61, 0x9c, 0x8f, 0xb2, 0xf5, 0xe0, 0xa0, 0x2b, 0x2f,
+ 0xe7, 0x67, 0xd0, 0xd7, 0x29, 0x08, 0x72, 0xee, 0xd5, 0x60, 0xb9, 0xbb,
+ 0x1b, 0x12, 0xce, 0x60, 0x98, 0xb9, 0x40, 0xd3, 0xd9, 0x77, 0x5d, 0x6b,
+ 0x78, 0xaa, 0x9a, 0x47, 0x2a, 0xf5, 0x38, 0xbb, 0xbe, 0x3a, 0x82, 0x6a,
+ 0xbf, 0x8b, 0x67, 0x7e, 0xa4, 0x78, 0xbf, 0xcf, 0x58, 0xce, 0x86, 0x2e,
+ 0x34, 0xb7, 0x76, 0x99, 0xa5, 0xf1, 0x0c, 0xa9, 0x1c, 0x9f, 0xad, 0xcb,
+ 0xac, 0xf4, 0x03, 0x60, 0xe0, 0x22, 0xfe, 0x02, 0x34, 0x9a, 0x14, 0xb9,
+ 0x11, 0xea, 0x4c, 0x3a, 0x59, 0xaa, 0xec, 0x8f, 0x82, 0x49, 0x23, 0xa2,
+ 0xd0, 0xf7, 0xc3, 0xf0, 0xaa, 0x2d, 0xb2, 0xb8, 0xce, 0x02, 0x2f, 0xe0,
+ 0x23, 0x49, 0xa1, 0x38, 0x12, 0xba, 0xab, 0x9f, 0x60, 0xe4, 0x0d, 0xfa,
+ 0x2b, 0xcc, 0xad, 0x6a, 0x06, 0xca, 0x38, 0x82, 0xc5, 0x88, 0x10, 0xb6,
+ 0xf5, 0xf6, 0x06, 0x7b, 0x03, 0x9c, 0xe4, 0x89, 0xaf, 0xdb, 0x66, 0x45,
+ 0xeb, 0x2c, 0x28, 0xe2, 0x40, 0x08, 0x44, 0xe2, 0x8a, 0x91, 0x19, 0x04,
+ 0x29, 0x46, 0xa7, 0xb5, 0x78, 0xae, 0x05, 0xcc, 0x38, 0x9f, 0xd8, 0x58,
+ 0xc9, 0x79, 0xf9, 0xad, 0x77, 0x66, 0x49, 0x62, 0xef, 0x13, 0x72, 0xee,
+ 0xda, 0x37, 0xb5, 0xd7, 0xf1, 0x51, 0x5d, 0x16, 0x11, 0xf3, 0x91, 0xf2,
+ 0x13, 0x49, 0x09, 0x50, 0x15, 0xc6, 0x48, 0xe6, 0xe9, 0x4c, 0xf0, 0x06,
+ 0x14, 0x3f, 0xef, 0x46, 0x15, 0xaf, 0x96, 0x0d, 0x17, 0x51, 0x08, 0xf2,
+ 0xe1, 0xc9, 0xb9, 0x1d, 0x8d, 0x8f, 0x74, 0x25, 0x04, 0x1f, 0x2c, 0x62,
+ 0x67, 0xe4, 0x4b, 0xdc, 0x67, 0x39, 0x2c, 0x7d, 0x3a, 0x1e, 0x6f, 0x5b,
+ 0x0b, 0xab, 0x0b, 0x1f, 0x64, 0x37, 0x19, 0x4f, 0x6b, 0x07, 0x05, 0xff,
+ 0x6e, 0x89, 0x8f, 0x22, 0x7d, 0x28, 0xd9, 0x3b, 0x9a, 0xe2, 0x3f, 0xff,
+ 0xc2, 0xb1, 0xca, 0x05, 0xbc, 0x05, 0xa5, 0xe7, 0x2d, 0x66, 0xf7, 0x37,
+ 0x92, 0xd2, 0xb4, 0x35, 0x26, 0x3f, 0x8c, 0x0c, 0x22, 0xa5, 0x5f, 0x5e,
+ 0x9c, 0x01, 0x46, 0x91, 0xe7, 0xa2, 0x92, 0x97, 0x0a, 0x19, 0x85, 0x2f,
+ 0x54, 0xe3, 0xa8, 0x26, 0xab, 0xe6, 0xb5, 0xd9, 0x71, 0x19, 0xb7, 0x41,
+ 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+ 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+ 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+ 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+ 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+ 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+ 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+ 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+ 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+ 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+ 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+ 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+ 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+ 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+ 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+ 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+ 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+ 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+ 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+ 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+ 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+ 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+ 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+ 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+ 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+ 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+ 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+ 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+ 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+ 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+ 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+ 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+ 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+ 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+ 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+ 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+ 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+ 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+ 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+ 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+ 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+ 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+ 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+ 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+ 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+ 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+ 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+ 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+ 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+ 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+ 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+ 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+ 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+ 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+ 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+ 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+ 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+ 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+ 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+ 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+ 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+ 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+ 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+ 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+ 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+ 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+ 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+ 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+ 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+ 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+ 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+ 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+ 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+ 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+ 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+ 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+ 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+ 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+ 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+ 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+ 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+ 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+ 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+ 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+ 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+ 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+ 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+ 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+ 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+ 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+ 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+ 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+ 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+ 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+ 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+ 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+ 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+ 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+ 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+ 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+ 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+ 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+ 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+ 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+ 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+ 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+ 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+ 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+ 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+ 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+ 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+ 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+ 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+ 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+ 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+ 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+ 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+ 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+ 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+ 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+ 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+ 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+ 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+ 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+ 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+ 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+ 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+ 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+ 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+ 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+ 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+ 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+ 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+ 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+ 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+ 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+ 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+ 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+ 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+ 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+ 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+ 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+ 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+ 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+ 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+ 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+ 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+ 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+ 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+ 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+ 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+ 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+ 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+ 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+ 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+ 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+ 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+ 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+ 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+ 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+ 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+ 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+ 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+ 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+ 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+ 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+ 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+ 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+ 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+ 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+ 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+ 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+ 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+ 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+ 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+ 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+ 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+ 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+ 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+ 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+ 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+ 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+ 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+ 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+ 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+ 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+ 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+ 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+ 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+ 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+ 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+ 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+ 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+ 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+ 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+ 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+ 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+ 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+ 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+ 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd,
+ 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f,
+ 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e,
+ 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe,
+ 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69,
+ 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83,
+ 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e,
+ 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7,
+ 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e,
+ 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41,
+ 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb,
+ 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09,
+ 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8,
+ 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03,
+ 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b,
+ 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d,
+ 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b,
+ 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8,
+ 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a,
+ 0xc0, 0x98,
+};
+static_assert(sizeof(kBytesTestReadSymbol9) == kNumBytesTestReadSymbol9, "");
+
+// The kBytesTestReadSymbol10[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][11] = {
+// // pmf: 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10
+// { 32768 - 3277, 32768 - 6554, 32768 - 9830, 32768 - 13107, 32768 - 16384,
+// 32768 - 19661, 32768 - 22938, 32768 - 26214, 32768 - 29491, 0, 0 },
+// // pmf: 3/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 1/20
+// { 32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 14746, 32768 - 18022,
+// 32768 - 21299, 32768 - 24576, 32768 - 27853, 32768 - 31130, 0, 0 },
+// // pmf: 1/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 3/20
+// { 32768 - 1638, 32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 14746,
+// 32768 - 18022, 32768 - 21299, 32768 - 24576, 32768 - 27853, 0, 0 },
+// // pmf: 1/20, 2/20, 2/20, 2/20, 3/20, 3/20, 2/20, 2/20, 2/20, 1/20
+// { 32768 - 1638, 32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 16384,
+// 32768 - 21299, 32768 - 24576, 32768 - 27853, 32768 - 31130, 0, 0 },
+// };
+// constexpr int kSymbols[20][4] = { { 0, 5, 9, 4 }, //
+// { 1, 6, 8, 3 }, //
+// { 2, 7, 7, 2 }, //
+// { 3, 8, 6, 1 }, //
+// { 4, 9, 5, 0 }, //
+// { 5, 0, 4, 9 }, //
+// { 6, 1, 3, 8 }, //
+// { 7, 2, 2, 7 }, //
+// { 8, 3, 1, 6 }, //
+// { 9, 4, 0, 5 }, //
+// { 0, 0, 9, 7 }, //
+// { 2, 1, 8, 5 }, //
+// { 4, 3, 6, 3 }, //
+// { 6, 5, 4, 1 }, //
+// { 8, 7, 2, 8 }, //
+// { 1, 0, 9, 6 }, //
+// { 3, 2, 7, 4 }, //
+// { 5, 4, 5, 2 }, //
+// { 7, 6, 3, 5 }, //
+// { 9, 8, 1, 4 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 96; ++i) {
+// for (int j = 0; j < 20; ++j) {
+// for (int k = 0; k < 4; ++k) {
+// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 10);
+// }
+// }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+// if (count++ % 12 == 0) {
+// printf("\n ");
+// } else {
+// printf(" ");
+// }
+// printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
+constexpr size_t kNumBytesTestReadSymbol10 = 3204;
+constexpr uint8_t kBytesTestReadSymbol10[] = {
+ 0x10, 0x84, 0xe2, 0xe0, 0x0f, 0x08, 0xd6, 0x01, 0xd0, 0xaa, 0xd8, 0xb5,
+ 0x60, 0x4f, 0xb9, 0xb3, 0x73, 0x01, 0x8c, 0x92, 0xe6, 0xa0, 0xab, 0xe8,
+ 0xe4, 0x95, 0x85, 0x03, 0x5f, 0xbb, 0x3b, 0x1f, 0x27, 0xb1, 0x44, 0x95,
+ 0x50, 0x1f, 0xad, 0xc8, 0x35, 0xde, 0x44, 0xf3, 0xb6, 0x8d, 0xa2, 0x39,
+ 0xc3, 0xb6, 0xee, 0x3c, 0x10, 0x33, 0x27, 0x7a, 0x29, 0xcc, 0x7c, 0x08,
+ 0xcb, 0x94, 0xbe, 0xef, 0x96, 0x47, 0x30, 0x49, 0x47, 0x9c, 0xb7, 0x7e,
+ 0x23, 0x0c, 0x27, 0x8e, 0x1b, 0xdc, 0x6c, 0x92, 0x40, 0x98, 0xbf, 0x20,
+ 0xd4, 0x01, 0x72, 0x55, 0x8c, 0x3f, 0x3c, 0x76, 0x24, 0xd2, 0x2d, 0xba,
+ 0xa4, 0x54, 0x29, 0x80, 0xe9, 0x06, 0x2c, 0x68, 0xbd, 0xa7, 0xc5, 0xf7,
+ 0x44, 0xdf, 0x7e, 0x94, 0x90, 0x3f, 0x94, 0x7d, 0x9e, 0x36, 0xb8, 0x82,
+ 0x1d, 0x4a, 0x47, 0x1f, 0x6c, 0x29, 0x51, 0xd2, 0x84, 0xa8, 0xcd, 0x98,
+ 0xc0, 0xd2, 0xea, 0x4a, 0x25, 0x3c, 0xd7, 0x34, 0x64, 0x96, 0xd4, 0x06,
+ 0xed, 0x00, 0x98, 0xc3, 0x65, 0x10, 0xd4, 0xac, 0x6b, 0xab, 0xd7, 0x35,
+ 0x04, 0x89, 0xbf, 0x24, 0xcc, 0xfc, 0xc9, 0xe8, 0x87, 0x3d, 0xdb, 0x55,
+ 0xf0, 0xc9, 0x97, 0x71, 0x99, 0x00, 0x54, 0x50, 0x24, 0x66, 0xca, 0x24,
+ 0xfd, 0x1c, 0xb1, 0x71, 0x0e, 0xb5, 0x9c, 0x27, 0xfc, 0x7f, 0x95, 0x98,
+ 0xc8, 0x99, 0x9f, 0x9b, 0xc7, 0xf6, 0x69, 0xfa, 0xb2, 0x11, 0x77, 0x8d,
+ 0x02, 0x53, 0x32, 0x4e, 0x20, 0x2c, 0x21, 0x2b, 0x99, 0x9a, 0xec, 0x63,
+ 0x0b, 0xe2, 0x8f, 0x30, 0xf8, 0x3c, 0xd1, 0xb1, 0xbc, 0x52, 0x73, 0xce,
+ 0x85, 0x54, 0xdd, 0xe6, 0xf6, 0x9c, 0x2d, 0xca, 0x3d, 0xa8, 0x09, 0x34,
+ 0xa8, 0x41, 0x9c, 0x03, 0x78, 0xbc, 0x67, 0x11, 0x9f, 0xbe, 0xde, 0x9a,
+ 0x98, 0x8a, 0x8d, 0x0b, 0x88, 0x7f, 0xea, 0x82, 0x77, 0x61, 0x7a, 0xde,
+ 0xb0, 0xb1, 0x46, 0x8d, 0x23, 0x69, 0x2f, 0x17, 0x05, 0xff, 0x4a, 0x9e,
+ 0xf9, 0xb3, 0x9a, 0xd0, 0xc4, 0x81, 0xcf, 0xbc, 0xe6, 0x26, 0x2c, 0x37,
+ 0x55, 0xec, 0xdc, 0x23, 0x05, 0xdf, 0x30, 0xcf, 0x5a, 0x4a, 0x0c, 0x08,
+ 0xc0, 0xd7, 0x9d, 0x80, 0xc0, 0xa3, 0x56, 0x49, 0x41, 0xc4, 0xdd, 0xc5,
+ 0x69, 0x5c, 0xe5, 0x6c, 0xc5, 0xae, 0x4c, 0x95, 0x45, 0xf2, 0xf6, 0xd6,
+ 0x12, 0x25, 0xcc, 0x24, 0x56, 0x8c, 0x2b, 0x32, 0x51, 0x18, 0x1a, 0xec,
+ 0xb0, 0x62, 0x40, 0x82, 0x59, 0xb8, 0x38, 0x9f, 0x9f, 0x73, 0xf5, 0xb3,
+ 0xc3, 0x93, 0xa5, 0x4e, 0xab, 0x7f, 0x97, 0x56, 0x51, 0xb0, 0xff, 0x69,
+ 0x73, 0xc2, 0xd0, 0x60, 0x93, 0x59, 0x2f, 0xc7, 0x84, 0x14, 0x7e, 0x68,
+ 0xa7, 0x2b, 0x37, 0xb4, 0x2e, 0x69, 0x58, 0x55, 0x3c, 0xd2, 0xf1, 0xa8,
+ 0x2b, 0x6e, 0xd5, 0x11, 0x1c, 0x1d, 0x17, 0xd5, 0xf1, 0xfa, 0x8b, 0xd1,
+ 0x6c, 0xc2, 0x32, 0x9e, 0x66, 0x3e, 0x6a, 0x4a, 0x0e, 0xb8, 0xf9, 0xa8,
+ 0x1c, 0x23, 0xb1, 0x7e, 0xe7, 0xa0, 0x27, 0x5b, 0x1e, 0x8f, 0x8a, 0xb1,
+ 0x1e, 0x50, 0x99, 0x9c, 0x39, 0x5b, 0xa0, 0x76, 0xa2, 0x90, 0x20, 0xd5,
+ 0x61, 0xf8, 0x96, 0x5a, 0xbc, 0x91, 0x5d, 0xfc, 0x1e, 0xed, 0xea, 0xd8,
+ 0x10, 0x5d, 0x15, 0xfa, 0x2b, 0xa7, 0x77, 0xaf, 0xae, 0x64, 0xef, 0x06,
+ 0xa4, 0xf7, 0x65, 0x58, 0xb8, 0x64, 0x47, 0xcd, 0xfa, 0x12, 0x8e, 0x7d,
+ 0x5b, 0x96, 0x27, 0xda, 0xb9, 0x2a, 0x14, 0xfe, 0x3e, 0x57, 0xd7, 0x4e,
+ 0x86, 0xb3, 0x36, 0xd7, 0x77, 0x2d, 0xf6, 0x1e, 0xf3, 0xfd, 0xdb, 0x9a,
+ 0x92, 0x78, 0x0a, 0xa4, 0x17, 0xf1, 0x78, 0xfc, 0xc3, 0x6d, 0xa0, 0xf8,
+ 0x07, 0x6a, 0x68, 0xb1, 0x1b, 0x00, 0x27, 0x65, 0x68, 0x76, 0x10, 0x39,
+ 0x4b, 0x8a, 0x51, 0x7a, 0x53, 0x69, 0x79, 0xfc, 0xbc, 0xe6, 0xf4, 0x26,
+ 0xc3, 0xbf, 0x3a, 0x64, 0x56, 0x7d, 0x5f, 0x76, 0xa2, 0x42, 0xd1, 0xad,
+ 0x3f, 0xb8, 0xce, 0xfb, 0x79, 0x38, 0xf3, 0x85, 0x2a, 0x67, 0xf4, 0x71,
+ 0xfe, 0x0b, 0x79, 0xee, 0x85, 0xe0, 0x61, 0x9c, 0x9d, 0xd5, 0xe0, 0x0a,
+ 0xd7, 0xa6, 0x21, 0xc3, 0x60, 0xbf, 0xbd, 0x16, 0xca, 0xa0, 0x16, 0x9d,
+ 0xc4, 0x14, 0x99, 0x03, 0x7e, 0xe6, 0x62, 0x6e, 0xbe, 0x18, 0x45, 0x5e,
+ 0x15, 0x42, 0xac, 0x5b, 0x60, 0x9f, 0xbd, 0x1e, 0x8a, 0x58, 0x55, 0x75,
+ 0xcf, 0xbb, 0x12, 0xcb, 0xc2, 0xf4, 0x01, 0xfc, 0x96, 0x8d, 0x97, 0x67,
+ 0x94, 0x65, 0x6b, 0xd0, 0xeb, 0xff, 0x26, 0x30, 0x3a, 0xa0, 0xe9, 0x9b,
+ 0xa7, 0x5e, 0x81, 0x2b, 0x8e, 0xf7, 0xd6, 0xbf, 0x6f, 0xe4, 0x33, 0xd5,
+ 0xaa, 0x5a, 0x27, 0x18, 0x24, 0x76, 0x72, 0x72, 0x50, 0x72, 0x92, 0x88,
+ 0x9f, 0x88, 0x81, 0x0f, 0x33, 0xa7, 0x99, 0x83, 0x53, 0x03, 0x8c, 0x2d,
+ 0x36, 0x43, 0x52, 0x27, 0x27, 0x74, 0xcd, 0xf1, 0x1b, 0x76, 0x95, 0x11,
+ 0xdf, 0x4e, 0xb3, 0xa5, 0x2e, 0xe4, 0xac, 0x3a, 0xfd, 0x9f, 0xab, 0x96,
+ 0x7e, 0xb1, 0xf0, 0x19, 0x22, 0xc4, 0x06, 0x9b, 0xe7, 0xe2, 0xf8, 0xb4,
+ 0x17, 0xbd, 0x9d, 0x14, 0xac, 0x11, 0xc9, 0x79, 0x8e, 0x01, 0x23, 0xc9,
+ 0x6e, 0x5f, 0x96, 0x1e, 0x99, 0xe1, 0x19, 0x2c, 0xb1, 0x1b, 0x54, 0x30,
+ 0x3a, 0xb1, 0xe7, 0xbf, 0xbf, 0x17, 0x3d, 0x9b, 0x86, 0xd7, 0x4b, 0x68,
+ 0x46, 0xa6, 0xb0, 0x05, 0x66, 0x4b, 0x8a, 0xdc, 0x60, 0x60, 0x29, 0x95,
+ 0x35, 0x4b, 0x6f, 0xf5, 0x73, 0x51, 0x52, 0xb6, 0xec, 0xef, 0x74, 0xcb,
+ 0x0b, 0x00, 0x04, 0x15, 0xff, 0xb3, 0x13, 0xdd, 0x70, 0x5e, 0x65, 0xfc,
+ 0xa6, 0xb1, 0x13, 0x59, 0x29, 0xd0, 0x2e, 0xc4, 0x55, 0xcb, 0x99, 0xac,
+ 0xca, 0x48, 0x67, 0x3e, 0xfb, 0xfb, 0x54, 0xb7, 0x53, 0x32, 0xb4, 0x17,
+ 0xf6, 0x78, 0xd1, 0x64, 0x67, 0x76, 0x33, 0x3a, 0xe9, 0x13, 0x8c, 0x9c,
+ 0xf1, 0x74, 0xb7, 0xd1, 0x35, 0x41, 0xf2, 0x4d, 0x68, 0x53, 0x25, 0x57,
+ 0x97, 0x33, 0x18, 0xea, 0x96, 0xea, 0x66, 0x56, 0x82, 0xfe, 0xcf, 0x1a,
+ 0x2c, 0x8c, 0xee, 0xc6, 0x67, 0x5d, 0x22, 0x71, 0x93, 0x9e, 0x2e, 0x96,
+ 0xfa, 0x26, 0xa8, 0x3e, 0x49, 0xad, 0x0a, 0x64, 0xaa, 0xf2, 0xe6, 0x63,
+ 0x1d, 0x52, 0xfb, 0x67, 0x7e, 0x17, 0x91, 0x70, 0xef, 0x48, 0xe1, 0x2e,
+ 0x48, 0xe4, 0x8a, 0xc2, 0x4c, 0x5f, 0x77, 0x7f, 0x03, 0x45, 0xf0, 0x8d,
+ 0x44, 0xad, 0x1e, 0xef, 0xb5, 0x1f, 0x3c, 0x3c, 0x4e, 0x43, 0x87, 0xdd,
+ 0xec, 0xd9, 0x6e, 0xd0, 0xe8, 0x47, 0x75, 0x5b, 0xe5, 0xc0, 0x76, 0xb1,
+ 0x9c, 0x5b, 0x72, 0xeb, 0x15, 0x9c, 0x5a, 0xa1, 0x31, 0xc2, 0x46, 0xb4,
+ 0xe7, 0x9b, 0x5d, 0x86, 0x23, 0x3f, 0x47, 0xd9, 0x9b, 0x31, 0x4e, 0xa6,
+ 0x65, 0xe9, 0x2f, 0xa3, 0xf8, 0x34, 0x68, 0xf7, 0x61, 0xf5, 0x08, 0xc4,
+ 0x8a, 0x10, 0xa1, 0x9b, 0xa9, 0x30, 0x25, 0x8d, 0xaf, 0x67, 0x07, 0x8e,
+ 0x84, 0x62, 0xa5, 0xc3, 0x2f, 0x5d, 0x06, 0xaa, 0xd4, 0x02, 0x04, 0x77,
+ 0xed, 0xf4, 0xe0, 0xa9, 0xca, 0x95, 0xa2, 0x91, 0xe0, 0x56, 0x64, 0xb6,
+ 0xb8, 0x39, 0xda, 0x83, 0xc5, 0x10, 0x7e, 0xa6, 0x08, 0x10, 0x01, 0x15,
+ 0x2b, 0x6e, 0xce, 0xfe, 0x43, 0x01, 0xa9, 0xcb, 0xfd, 0xd9, 0x1b, 0x7e,
+ 0x11, 0x74, 0x96, 0x4a, 0x89, 0x3f, 0x07, 0xac, 0x74, 0xf9, 0x93, 0xb2,
+ 0xf6, 0xed, 0xb3, 0x29, 0xab, 0xc5, 0x0a, 0x90, 0xb3, 0x71, 0x51, 0xa5,
+ 0xba, 0x16, 0x01, 0xd4, 0x35, 0x11, 0xdc, 0xba, 0x27, 0xc3, 0x01, 0x05,
+ 0x65, 0x91, 0x6b, 0xff, 0x33, 0xb9, 0x9d, 0x84, 0xf7, 0xc0, 0x2d, 0x4b,
+ 0xf4, 0xb2, 0x39, 0xe4, 0x7d, 0x0f, 0xf6, 0x8d, 0xa4, 0x2c, 0xa2, 0x4d,
+ 0x4e, 0x8a, 0x2e, 0xff, 0x84, 0x5f, 0x43, 0x93, 0xa3, 0x43, 0xa2, 0xe3,
+ 0x23, 0x92, 0xf3, 0x57, 0xd2, 0x2e, 0x8e, 0xea, 0xff, 0x2c, 0x3d, 0x1f,
+ 0xc6, 0x94, 0x77, 0x19, 0xf6, 0xdb, 0x16, 0x4e, 0xd0, 0x3f, 0x32, 0xf3,
+ 0x7b, 0x89, 0x50, 0xc5, 0x5c, 0xfe, 0x86, 0xcf, 0xf6, 0x89, 0x88, 0xa3,
+ 0xa8, 0xd9, 0x52, 0x23, 0x68, 0x31, 0x90, 0xe2, 0xd4, 0x3a, 0x62, 0xb4,
+ 0xe6, 0x4e, 0xfa, 0x20, 0x21, 0xbf, 0xe5, 0x4e, 0x86, 0x6d, 0xbe, 0xbe,
+ 0xc6, 0x25, 0x4b, 0xf2, 0x20, 0x6c, 0x4e, 0xfc, 0x93, 0x41, 0x3f, 0x8b,
+ 0x29, 0x34, 0xb9, 0xd1, 0x61, 0xe0, 0x34, 0x83, 0x8e, 0x1f, 0x8c, 0x44,
+ 0xe2, 0x95, 0x2e, 0x73, 0x48, 0x8f, 0xeb, 0xd0, 0x6c, 0xec, 0xc4, 0xf6,
+ 0x48, 0x5e, 0xf7, 0x53, 0x3e, 0xa6, 0x77, 0x33, 0xb0, 0x9e, 0xf8, 0x05,
+ 0xa9, 0x7e, 0x96, 0x47, 0x3c, 0x8f, 0xa1, 0xfe, 0xd1, 0xb4, 0x85, 0x94,
+ 0x49, 0xa9, 0xd1, 0x45, 0xdf, 0xf0, 0x8b, 0xe8, 0x72, 0x74, 0x68, 0x74,
+ 0x5c, 0x67, 0xc2, 0xbb, 0xcd, 0x7b, 0x6a, 0x2f, 0x6b, 0x0a, 0x1d, 0xec,
+ 0x03, 0x48, 0xd2, 0x8e, 0xe3, 0x3e, 0xdb, 0x62, 0xc9, 0xda, 0x07, 0xe6,
+ 0x5e, 0x6f, 0x71, 0x2a, 0x18, 0xab, 0x9f, 0xd0, 0xd9, 0xfe, 0xd1, 0xac,
+ 0xf0, 0x21, 0xab, 0xd9, 0x70, 0x1e, 0xb9, 0x99, 0xa0, 0xcc, 0xeb, 0xe7,
+ 0x87, 0xee, 0xd9, 0x8e, 0xd0, 0xe5, 0xc0, 0x58, 0x75, 0x37, 0x3d, 0x03,
+ 0x4e, 0x18, 0x08, 0x27, 0xdd, 0x18, 0x38, 0x1b, 0xad, 0xf1, 0xd3, 0xcc,
+ 0xa1, 0x65, 0x26, 0x97, 0x3a, 0x2c, 0x3c, 0x06, 0x90, 0x71, 0xc3, 0xf1,
+ 0x88, 0x9c, 0x52, 0xa5, 0xce, 0x69, 0x11, 0xfd, 0x7a, 0x0d, 0x9d, 0x98,
+ 0x9e, 0xc9, 0x0b, 0xde, 0xea, 0x67, 0xd4, 0xce, 0xe6, 0x76, 0x13, 0xdf,
+ 0x00, 0xb5, 0x2f, 0xd2, 0xc8, 0xe7, 0x91, 0xf4, 0x3f, 0xda, 0x36, 0x90,
+ 0xb2, 0x89, 0x35, 0x3a, 0x28, 0xbb, 0xfe, 0x11, 0x7d, 0x0e, 0x4e, 0x8d,
+ 0x0e, 0x8b, 0x8c, 0xf8, 0x57, 0x79, 0xaf, 0x6d, 0x45, 0xed, 0x61, 0x43,
+ 0xbd, 0x80, 0x69, 0x1a, 0x51, 0xdc, 0x67, 0xdb, 0x6c, 0x59, 0x3b, 0x40,
+ 0xfc, 0xcb, 0xcd, 0xee, 0x25, 0x43, 0x15, 0x73, 0xfa, 0x1b, 0x3f, 0xda,
+ 0x35, 0x9e, 0x04, 0x35, 0x7b, 0x2e, 0x03, 0xd7, 0x33, 0x34, 0x19, 0x9d,
+ 0x7c, 0xf0, 0xfd, 0xdb, 0x31, 0xda, 0x1c, 0xb8, 0x0b, 0x0e, 0xa6, 0xe7,
+ 0xa0, 0x69, 0xc3, 0x01, 0x04, 0xfb, 0xa3, 0x07, 0x03, 0x75, 0xbe, 0x3a,
+ 0x79, 0x94, 0x2c, 0xa4, 0xd2, 0xe7, 0x45, 0x87, 0x80, 0xd2, 0x0e, 0x38,
+ 0x7e, 0x31, 0x13, 0x8a, 0x54, 0xb9, 0xcd, 0x22, 0x3f, 0xaf, 0x41, 0xb3,
+ 0xb3, 0x13, 0xd9, 0x21, 0x7b, 0xdd, 0x4c, 0xfa, 0x99, 0xdc, 0xce, 0xc2,
+ 0x7b, 0xe0, 0x16, 0xa5, 0xfa, 0x59, 0x1c, 0xf2, 0x3e, 0x87, 0xfb, 0x46,
+ 0xd2, 0x16, 0x51, 0x26, 0xa7, 0x45, 0x17, 0x7f, 0xc2, 0x2f, 0xa1, 0xc9,
+ 0xd1, 0xa1, 0xd1, 0x71, 0x9f, 0x0a, 0xef, 0x35, 0xed, 0xa8, 0xbd, 0xac,
+ 0x28, 0x77, 0xb0, 0x0d, 0x23, 0x4a, 0x3b, 0x8c, 0xfb, 0x6d, 0x8b, 0x27,
+ 0x68, 0x1f, 0x99, 0x79, 0xbd, 0xc4, 0xa8, 0x62, 0xae, 0x7f, 0x43, 0x67,
+ 0xfb, 0x46, 0xb3, 0xc0, 0x86, 0xaf, 0x65, 0xc0, 0x7a, 0xe6, 0x66, 0x83,
+ 0x33, 0xaf, 0x9e, 0x1f, 0xbb, 0x66, 0x3b, 0x43, 0x97, 0x01, 0x61, 0xd4,
+ 0xdc, 0xf4, 0x0d, 0x38, 0x60, 0x20, 0x9f, 0x74, 0x60, 0xe0, 0x6e, 0xb7,
+ 0xc7, 0x4f, 0x32, 0x85, 0x94, 0x9a, 0x5c, 0xe8, 0xb0, 0xf0, 0x1a, 0x41,
+ 0xc7, 0x0f, 0xc6, 0x22, 0x71, 0x4a, 0x97, 0x39, 0xa4, 0x47, 0xf5, 0xe8,
+ 0x36, 0x76, 0x62, 0x7b, 0x24, 0x2f, 0x7b, 0xa9, 0x9f, 0x53, 0x3b, 0x99,
+ 0xd8, 0x4f, 0x7c, 0x02, 0xd4, 0xbf, 0x4b, 0x23, 0x9e, 0x47, 0xd0, 0xff,
+ 0x68, 0xda, 0x42, 0xca, 0x24, 0xd4, 0xe8, 0xa2, 0xef, 0xf8, 0x45, 0xf4,
+ 0x39, 0x3a, 0x34, 0x3a, 0x2e, 0x33, 0xe1, 0x5d, 0xe6, 0xbd, 0xb5, 0x17,
+ 0xb5, 0x85, 0x0e, 0xf6, 0x01, 0xa4, 0x69, 0x47, 0x71, 0x9f, 0x6d, 0xb1,
+ 0x64, 0xed, 0x03, 0xf3, 0x2f, 0x37, 0xb8, 0x95, 0x0c, 0x55, 0xcf, 0xe8,
+ 0x6c, 0xff, 0x68, 0xd6, 0x78, 0x10, 0xd5, 0xec, 0xb8, 0x0f, 0x5c, 0xcc,
+ 0xd0, 0x66, 0x75, 0xf3, 0xc3, 0xf7, 0x6c, 0xc7, 0x68, 0x72, 0xe0, 0x2c,
+ 0x3a, 0x9b, 0x9e, 0x81, 0xa7, 0x0c, 0x04, 0x13, 0xee, 0x8c, 0x1c, 0x0d,
+ 0xd6, 0xf8, 0xe9, 0xe6, 0x50, 0xb2, 0x93, 0x4b, 0x9d, 0x16, 0x1e, 0x03,
+ 0x48, 0x38, 0xe1, 0xf8, 0xc4, 0x4e, 0x29, 0x52, 0xe7, 0x34, 0x88, 0xfe,
+ 0xbd, 0x06, 0xce, 0xcc, 0x4f, 0x64, 0x85, 0xef, 0x75, 0x33, 0xea, 0x67,
+ 0x73, 0x3b, 0x09, 0xef, 0x80, 0x5a, 0x97, 0xe9, 0x64, 0x73, 0xc8, 0xfa,
+ 0x1f, 0xed, 0x1b, 0x48, 0x59, 0x44, 0x9a, 0x9d, 0x14, 0x5d, 0xff, 0x08,
+ 0xbe, 0x87, 0x27, 0x46, 0x87, 0x45, 0xc6, 0x7c, 0x2b, 0xbc, 0xd7, 0xb6,
+ 0xa2, 0xf6, 0xb0, 0xa1, 0xde, 0xc0, 0x34, 0x8d, 0x28, 0xee, 0x33, 0xed,
+ 0xb6, 0x2c, 0x9d, 0xa0, 0x7e, 0x65, 0xe6, 0xf7, 0x12, 0xa1, 0x8a, 0xb9,
+ 0xfd, 0x0d, 0x9f, 0xed, 0x1a, 0xcf, 0x02, 0x1a, 0xbd, 0x97, 0x01, 0xeb,
+ 0x99, 0x9a, 0x0c, 0xce, 0xbe, 0x78, 0x7e, 0xed, 0x98, 0xed, 0x0e, 0x5c,
+ 0x05, 0x87, 0x53, 0x73, 0xd0, 0x34, 0xe1, 0x80, 0x82, 0x7d, 0xd1, 0x83,
+ 0x81, 0xba, 0xdf, 0x1d, 0x3c, 0xca, 0x16, 0x52, 0x69, 0x73, 0xa2, 0xc3,
+ 0xc0, 0x69, 0x07, 0x1c, 0x3f, 0x18, 0x89, 0xc5, 0x2a, 0x5c, 0xe6, 0x91,
+ 0x1f, 0xd7, 0xa0, 0xd9, 0xd9, 0x89, 0xec, 0x90, 0xbd, 0xee, 0xa6, 0x7d,
+ 0x4c, 0xee, 0x67, 0x61, 0x3d, 0xf0, 0x0b, 0x52, 0xfd, 0x2c, 0x8e, 0x79,
+ 0x1f, 0x43, 0xfd, 0xa3, 0x69, 0x0b, 0x28, 0x93, 0x53, 0xa2, 0x8b, 0xbf,
+ 0xe1, 0x17, 0xd0, 0xe4, 0xe8, 0xd0, 0xe8, 0xb8, 0xcf, 0x85, 0x77, 0x9a,
+ 0xf6, 0xd4, 0x5e, 0xd6, 0x14, 0x3b, 0xd8, 0x06, 0x91, 0xa5, 0x1d, 0xc6,
+ 0x7d, 0xb6, 0xc5, 0x93, 0xb4, 0x0f, 0xcc, 0xbc, 0xde, 0xe2, 0x54, 0x31,
+ 0x57, 0x3f, 0xa1, 0xb3, 0xfd, 0xa3, 0x59, 0xe0, 0x43, 0x57, 0xb2, 0xe0,
+ 0x3d, 0x73, 0x33, 0x41, 0x99, 0xd7, 0xcf, 0x0f, 0xdd, 0xb3, 0x1d, 0xa1,
+ 0xcb, 0x80, 0xb0, 0xea, 0x6e, 0x7a, 0x06, 0x9c, 0x30, 0x10, 0x4f, 0xba,
+ 0x30, 0x70, 0x37, 0x5b, 0xe3, 0xa7, 0x99, 0x42, 0xca, 0x4d, 0x2e, 0x74,
+ 0x58, 0x78, 0x0d, 0x20, 0xe3, 0x87, 0xe3, 0x11, 0x38, 0xa5, 0x4b, 0x9c,
+ 0xd2, 0x23, 0xfa, 0xf4, 0x1b, 0x3b, 0x31, 0x3d, 0x92, 0x17, 0xbd, 0xd4,
+ 0xcf, 0xa9, 0x9d, 0xcc, 0xec, 0x27, 0xbe, 0x01, 0x6a, 0x5f, 0xa5, 0x91,
+ 0xcf, 0x23, 0xe8, 0x7f, 0xb4, 0x6d, 0x21, 0x65, 0x12, 0x6a, 0x74, 0x51,
+ 0x77, 0xfc, 0x22, 0xfa, 0x1c, 0x9d, 0x1a, 0x1d, 0x17, 0x19, 0xf0, 0xae,
+ 0xf3, 0x5e, 0xda, 0x8b, 0xda, 0xc2, 0x87, 0x7b, 0x00, 0xd2, 0x34, 0xa3,
+ 0xb8, 0xcf, 0xb6, 0xd8, 0xb2, 0x76, 0x81, 0xf9, 0x97, 0x9b, 0xdc, 0x4a,
+ 0x86, 0x2a, 0xe7, 0xf4, 0x36, 0x7f, 0xb4, 0x6b, 0x3c, 0x08, 0x6a, 0xf6,
+ 0x5c, 0x07, 0xae, 0x66, 0x68, 0x33, 0x3a, 0xf9, 0xe1, 0xfb, 0xb6, 0x63,
+ 0xb4, 0x39, 0x70, 0x16, 0x1d, 0x4d, 0xcf, 0x40, 0xd3, 0x86, 0x02, 0x09,
+ 0xf7, 0x46, 0x0e, 0x06, 0xda, 0x64, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5,
+ 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9,
+ 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7,
+ 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54,
+ 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66,
+ 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75,
+ 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca,
+ 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49,
+ 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09,
+ 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc,
+ 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66,
+ 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8,
+ 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e,
+ 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b,
+ 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b,
+ 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20,
+ 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8,
+ 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01,
+ 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f,
+ 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed,
+ 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0,
+ 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e,
+ 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65,
+ 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe,
+ 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36,
+ 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5,
+ 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9,
+ 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7,
+ 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54,
+ 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66,
+ 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75,
+ 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca,
+ 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49,
+ 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09,
+ 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc,
+ 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66,
+ 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8,
+ 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e,
+ 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b,
+ 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b,
+ 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20,
+ 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8,
+ 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01,
+ 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f,
+ 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed,
+ 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0,
+ 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e,
+ 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65,
+ 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe,
+ 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36,
+ 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5,
+ 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9,
+ 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7,
+ 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54,
+ 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66,
+ 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75,
+ 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca,
+ 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49,
+ 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09,
+ 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc,
+ 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66,
+ 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8,
+ 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e,
+ 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b,
+ 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b,
+ 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20,
+ 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8,
+ 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01,
+ 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f,
+ 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed,
+ 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0,
+ 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e,
+ 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65,
+ 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe,
+ 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36,
+ 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5,
+ 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9,
+ 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7,
+ 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54,
+ 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66,
+ 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75,
+ 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca,
+ 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49,
+ 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09,
+ 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc,
+ 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66,
+ 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8,
+ 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e,
+ 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b,
+ 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b,
+ 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20,
+ 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8,
+ 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01,
+ 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f,
+ 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1d, 0x80,
+};
+static_assert(sizeof(kBytesTestReadSymbol10) == kNumBytesTestReadSymbol10, "");
+
+// The kBytesTestReadSymbol11[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][12] = {
+// // pmf: 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11
+// { 32768 - 2979, 32768 - 5958, 32768 - 8937, 32768 - 11916, 32768 - 14895,
+// 32768 - 17873, 32768 - 20852, 32768 - 23831, 32768 - 26810,
+// 32768 - 29789, 0, 0 },
+// // pmf: 3/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 1/22
+// { 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405, 32768 - 16384,
+// 32768 - 19363, 32768 - 22342, 32768 - 25321, 32768 - 28300,
+// 32768 - 31279, 0, 0 },
+// // pmf: 1/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 3/22
+// { 32768 - 1489, 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405,
+// 32768 - 16384, 32768 - 19363, 32768 - 22342, 32768 - 25321,
+// 32768 - 28300, 0, 0 },
+// // pmf: 1/22, 2/22, 2/22, 2/22, 2/22, 4/22, 2/22, 2/22, 2/22, 2/22, 1/22
+// { 32768 - 1489, 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405,
+// 32768 - 19363, 32768 - 22342, 32768 - 25321, 32768 - 28300,
+// 32768 - 31279, 0, 0 },
+// };
+// constexpr int kSymbols[22][4] = { { 0, 6, 10, 5 }, //
+// { 1, 7, 9, 4 }, //
+// { 2, 8, 8, 3 }, //
+// { 3, 9, 7, 2 }, //
+// { 4, 10, 6, 1 }, //
+// { 5, 0, 5, 0 }, //
+// { 6, 1, 4, 10 }, //
+// { 7, 2, 3, 9 }, //
+// { 8, 3, 2, 8 }, //
+// { 9, 4, 1, 7 }, //
+// { 10, 5, 0, 6 }, //
+// { 0, 0, 10, 9 }, //
+// { 2, 1, 8, 7 }, //
+// { 4, 3, 6, 5 }, //
+// { 6, 5, 4, 3 }, //
+// { 8, 7, 2, 1 }, //
+// { 10, 9, 10, 8 }, //
+// { 1, 0, 9, 6 }, //
+// { 3, 2, 7, 4 }, //
+// { 5, 4, 5, 2 }, //
+// { 7, 6, 3, 5 }, //
+// { 9, 8, 1, 5 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 96; ++i) {
+// for (int j = 0; j < 22; ++j) {
+// for (int k = 0; k < 4; ++k) {
+// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 11);
+// }
+// }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+// if (count++ % 12 == 0) {
+// printf("\n ");
+// } else {
+// printf(" ");
+// }
+// printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
+constexpr size_t kNumBytesTestReadSymbol11 = 3673;
+constexpr uint8_t kBytesTestReadSymbol11[] = {
+ 0x0f, 0xb4, 0x93, 0xdb, 0xbe, 0x10, 0xa5, 0x0b, 0xa6, 0x53, 0x86, 0x25,
+ 0xaf, 0x5e, 0xf9, 0xd6, 0x10, 0xd8, 0x5e, 0x2b, 0x6d, 0xf2, 0xf8, 0x35,
+ 0x97, 0xf6, 0x95, 0xeb, 0x67, 0x20, 0x49, 0x0e, 0x21, 0xb4, 0x73, 0x5e,
+ 0x72, 0x06, 0xdd, 0x76, 0x99, 0x3d, 0x67, 0x37, 0x27, 0xea, 0x21, 0x80,
+ 0xc6, 0xb8, 0xf7, 0x48, 0x5e, 0x11, 0xe2, 0xe7, 0x10, 0xad, 0x0b, 0x12,
+ 0x52, 0xd4, 0xe3, 0x63, 0x2a, 0x1d, 0x41, 0xf4, 0xce, 0x5d, 0x58, 0x5f,
+ 0x79, 0x6d, 0xdd, 0x4b, 0x3d, 0x99, 0xd9, 0x64, 0xdc, 0x08, 0x16, 0x1a,
+ 0xf3, 0x8f, 0x1e, 0x33, 0xfe, 0x7a, 0x49, 0xaa, 0x98, 0xb9, 0xe2, 0xc6,
+ 0x14, 0xb8, 0x51, 0x1f, 0x45, 0xce, 0xea, 0x97, 0xcd, 0xd0, 0x0b, 0x5d,
+ 0x12, 0x31, 0xbe, 0x78, 0x98, 0xa3, 0x77, 0x6a, 0xa0, 0xef, 0x57, 0x3a,
+ 0xc6, 0xe7, 0x52, 0x22, 0x06, 0x44, 0x35, 0x8e, 0xc9, 0xe8, 0x4f, 0x76,
+ 0xd9, 0x77, 0x8c, 0x80, 0xc9, 0xfc, 0x20, 0x0d, 0xc0, 0x67, 0x95, 0x21,
+ 0x93, 0x74, 0x4f, 0xf1, 0xf5, 0xdf, 0x5a, 0x10, 0xde, 0x57, 0xc8, 0x6e,
+ 0x33, 0x40, 0xae, 0x36, 0x4a, 0xc8, 0x49, 0xbf, 0x0d, 0x6d, 0x74, 0x34,
+ 0xff, 0xdc, 0x1b, 0xe3, 0xcf, 0xcf, 0xe6, 0xd1, 0xfb, 0x4d, 0xd5, 0x0e,
+ 0x86, 0x83, 0x21, 0x12, 0xf8, 0x51, 0x2a, 0xc4, 0x87, 0xd8, 0x1b, 0x1d,
+ 0xe7, 0x36, 0xb5, 0xc3, 0xf9, 0xf9, 0x8f, 0x0f, 0xc2, 0x21, 0x83, 0x75,
+ 0x14, 0x81, 0x17, 0xb1, 0x9b, 0x51, 0x56, 0x1d, 0xa1, 0xaa, 0xff, 0xd4,
+ 0x1f, 0xf3, 0x8d, 0xd1, 0x30, 0x53, 0x92, 0x69, 0xce, 0xf0, 0xc5, 0x75,
+ 0xcf, 0xd2, 0x6e, 0x37, 0x74, 0x79, 0xc3, 0x50, 0x52, 0x01, 0xc4, 0x0f,
+ 0x67, 0xe2, 0xb7, 0xe2, 0xf1, 0xcc, 0xd9, 0x49, 0xc4, 0x58, 0xbd, 0x8d,
+ 0x91, 0xb8, 0x35, 0xbd, 0x64, 0x12, 0x24, 0x20, 0x20, 0x29, 0x23, 0x94,
+ 0x85, 0xb6, 0xa8, 0x4e, 0xd4, 0x49, 0x09, 0x25, 0xc4, 0xc5, 0xa5, 0x0c,
+ 0x76, 0xa9, 0x4a, 0x75, 0x0f, 0xb9, 0x57, 0x33, 0xcd, 0xfd, 0xf8, 0x8f,
+ 0xae, 0x43, 0x48, 0xb8, 0xea, 0x87, 0x17, 0x0d, 0x3d, 0x8b, 0x9a, 0x21,
+ 0xe8, 0xbf, 0xc8, 0x5e, 0x18, 0x48, 0xa3, 0xcd, 0x08, 0x59, 0x9b, 0xdb,
+ 0x79, 0x5c, 0xe9, 0xa3, 0xe6, 0xba, 0x58, 0x53, 0x10, 0x9a, 0x2c, 0x2b,
+ 0x10, 0x5b, 0x96, 0x9a, 0x1f, 0x8f, 0xc2, 0x7d, 0xee, 0xe9, 0xc2, 0xbc,
+ 0x8f, 0x8b, 0xa7, 0x41, 0xb1, 0x33, 0x58, 0x6e, 0x25, 0x13, 0x3a, 0xd0,
+ 0x78, 0x53, 0xda, 0xa2, 0x35, 0x23, 0x89, 0x39, 0xa7, 0xef, 0x94, 0xda,
+ 0x2f, 0xc3, 0x17, 0x80, 0x27, 0xc7, 0x0f, 0xda, 0xfb, 0xda, 0x64, 0x3c,
+ 0x94, 0x8c, 0x39, 0xd0, 0x06, 0x62, 0x6c, 0x0d, 0x26, 0xba, 0x4f, 0xcb,
+ 0x8a, 0xa0, 0xbc, 0xeb, 0x3f, 0x65, 0x51, 0x8e, 0x1d, 0x2e, 0x9e, 0x5f,
+ 0xe3, 0x15, 0x0e, 0x58, 0x4f, 0xb7, 0xb6, 0x64, 0x95, 0xe8, 0x0e, 0x00,
+ 0x7c, 0x1e, 0xd9, 0xde, 0x35, 0x5a, 0xff, 0xd5, 0xe5, 0xb3, 0x64, 0xcc,
+ 0x8b, 0x93, 0xbc, 0x2a, 0x25, 0x7d, 0x50, 0x92, 0x3e, 0x23, 0x4c, 0x07,
+ 0x5e, 0xcf, 0xbb, 0x52, 0xd0, 0xc4, 0xd9, 0x77, 0x66, 0x01, 0x57, 0x1f,
+ 0xa0, 0x9d, 0xb2, 0x6d, 0x4e, 0x36, 0xc1, 0x9a, 0x70, 0x4e, 0xa3, 0x5f,
+ 0xf6, 0xf9, 0x50, 0x08, 0xcd, 0xf9, 0xe5, 0x76, 0x81, 0xea, 0x88, 0x2e,
+ 0xf5, 0x2a, 0xd4, 0x31, 0x39, 0x8d, 0xfe, 0x1c, 0x15, 0x1d, 0x41, 0x2b,
+ 0x55, 0xc7, 0xe8, 0x27, 0x6f, 0xc3, 0xf0, 0x23, 0x76, 0x9a, 0xb2, 0x87,
+ 0x0c, 0x71, 0x3c, 0x73, 0xea, 0x20, 0x93, 0xf4, 0x21, 0x56, 0xfb, 0x8e,
+ 0xd7, 0xaf, 0xc3, 0xd4, 0xf4, 0x31, 0x6f, 0xe8, 0x1f, 0x5b, 0x83, 0xa9,
+ 0x2b, 0x83, 0x08, 0x2e, 0xa2, 0xf3, 0x6c, 0x06, 0xe5, 0x89, 0x73, 0x73,
+ 0x98, 0x0e, 0x57, 0x07, 0x49, 0x68, 0xa4, 0xb2, 0x4a, 0x26, 0xd1, 0x91,
+ 0x49, 0x87, 0x05, 0x55, 0xa4, 0x88, 0x7d, 0x3d, 0x57, 0x7c, 0x20, 0x8c,
+ 0x2c, 0xea, 0x30, 0x63, 0x3a, 0xe4, 0xab, 0x27, 0x80, 0xab, 0xfb, 0x22,
+ 0x8a, 0x0f, 0xe0, 0xe9, 0xc5, 0xd5, 0x4f, 0x8a, 0x2c, 0x28, 0x36, 0x63,
+ 0xbd, 0xa3, 0xc4, 0x90, 0xe4, 0x9e, 0x98, 0xca, 0xce, 0xfc, 0x96, 0xb8,
+ 0x22, 0x0d, 0x17, 0xc8, 0xad, 0xc7, 0x01, 0x38, 0x6e, 0x95, 0x30, 0x74,
+ 0xda, 0xb8, 0xa9, 0xa8, 0xe6, 0xf2, 0x03, 0x41, 0xb2, 0x05, 0x37, 0x04,
+ 0x8b, 0x51, 0xf9, 0xeb, 0x97, 0xdf, 0xe9, 0xa8, 0x5f, 0x11, 0x2f, 0x9f,
+ 0x4f, 0xbe, 0xc1, 0x53, 0x2c, 0x75, 0x90, 0xca, 0xa3, 0x9b, 0xc1, 0x36,
+ 0xa3, 0x03, 0x65, 0xab, 0x57, 0xc4, 0x0e, 0x8a, 0x41, 0xfc, 0x60, 0x65,
+ 0x13, 0x87, 0x6d, 0xda, 0x00, 0xad, 0x56, 0x1c, 0x28, 0x7c, 0x4c, 0xa2,
+ 0x92, 0xda, 0x23, 0x00, 0xe8, 0x60, 0x20, 0x59, 0x45, 0x4a, 0x26, 0xae,
+ 0x22, 0x37, 0x7c, 0x14, 0xce, 0xff, 0x0d, 0xa9, 0xef, 0xfc, 0x93, 0xbd,
+ 0xde, 0x2b, 0x0f, 0xc7, 0xc0, 0x8a, 0x90, 0x06, 0xec, 0x53, 0x9f, 0xc8,
+ 0x5b, 0x7b, 0xe8, 0x38, 0x22, 0x75, 0xe9, 0x40, 0xbc, 0x62, 0xe9, 0x9d,
+ 0x49, 0xab, 0x88, 0x8d, 0xdf, 0x05, 0x33, 0xbf, 0xc3, 0x69, 0x6c, 0x36,
+ 0x71, 0x17, 0x70, 0xc1, 0xe0, 0xd1, 0x71, 0xcf, 0xd5, 0x48, 0x83, 0x50,
+ 0x74, 0x07, 0xc4, 0xca, 0x29, 0x2d, 0xa2, 0x30, 0x0e, 0x86, 0x02, 0x05,
+ 0x94, 0x54, 0xa2, 0x6a, 0xe2, 0x23, 0x77, 0xc1, 0x4c, 0xef, 0xa4, 0x8c,
+ 0xbe, 0x6b, 0x0f, 0x7c, 0x05, 0x30, 0x78, 0x34, 0x5c, 0x73, 0xf5, 0x52,
+ 0x20, 0xd4, 0x1d, 0x01, 0xca, 0x9f, 0x89, 0x3b, 0x91, 0x1d, 0x1f, 0x27,
+ 0xe1, 0xf9, 0xe8, 0xd0, 0xb2, 0x56, 0x32, 0x15, 0x37, 0xa3, 0x08, 0x38,
+ 0xb7, 0x57, 0xb4, 0x09, 0xfe, 0xf4, 0x72, 0xe1, 0x8f, 0x4b, 0x6b, 0x00,
+ 0x8c, 0xc5, 0x39, 0xd5, 0x45, 0x45, 0xbb, 0xf6, 0xb7, 0x01, 0xde, 0xef,
+ 0x8b, 0xaf, 0x85, 0x73, 0xc4, 0x93, 0x3f, 0xbe, 0xf8, 0x69, 0xbd, 0x71,
+ 0xa9, 0x65, 0x6f, 0x22, 0xa6, 0xca, 0x36, 0xf0, 0x34, 0x1b, 0x20, 0x24,
+ 0x6c, 0xd2, 0xe3, 0xbb, 0xb5, 0x80, 0xfc, 0xc4, 0x90, 0x54, 0x70, 0xab,
+ 0xb7, 0xb9, 0xdb, 0xeb, 0x3b, 0x1d, 0x75, 0xc8, 0x82, 0x9a, 0x15, 0x8a,
+ 0x88, 0xb0, 0x7a, 0x77, 0xcf, 0xdc, 0x96, 0x22, 0x4d, 0x08, 0x47, 0x9a,
+ 0x06, 0x3e, 0x47, 0xb1, 0x54, 0xdf, 0x22, 0x9d, 0x75, 0x8f, 0xdb, 0xc4,
+ 0x5a, 0xd0, 0xfe, 0x44, 0xc4, 0xce, 0x9a, 0x57, 0x0b, 0x20, 0x36, 0x07,
+ 0xb1, 0xcf, 0xfe, 0xb4, 0x3e, 0x03, 0x1b, 0x5d, 0xac, 0x40, 0x54, 0x88,
+ 0x52, 0x2e, 0x81, 0x8f, 0x3c, 0x52, 0x87, 0x68, 0x00, 0xa5, 0x95, 0xbc,
+ 0xd9, 0x67, 0x87, 0xa0, 0x75, 0x78, 0xb6, 0xa9, 0xda, 0x76, 0x9d, 0xe4,
+ 0x5a, 0x6d, 0xd5, 0x78, 0xcd, 0x7b, 0x26, 0x5f, 0xc0, 0x09, 0xab, 0x25,
+ 0x16, 0x38, 0xa1, 0x86, 0xa7, 0x5e, 0x5e, 0x2d, 0x3e, 0x2f, 0x09, 0xdc,
+ 0x31, 0x4d, 0x71, 0x2e, 0xec, 0x5f, 0xa0, 0xe0, 0x8f, 0x9c, 0xcd, 0x72,
+ 0xc8, 0x05, 0xa3, 0xb0, 0xfc, 0x4c, 0xdb, 0x6b, 0x24, 0xf2, 0x92, 0x6b,
+ 0x13, 0x79, 0x1c, 0x36, 0x90, 0x20, 0x71, 0xaa, 0x8c, 0x1c, 0xe4, 0xbf,
+ 0x54, 0xf8, 0x48, 0x51, 0xd2, 0x9a, 0x23, 0xa0, 0x55, 0x38, 0x24, 0x17,
+ 0x39, 0x89, 0x4f, 0xc9, 0x01, 0x77, 0x05, 0x16, 0x97, 0x3e, 0xac, 0x9f,
+ 0xba, 0x4a, 0xb1, 0x7e, 0x47, 0x0d, 0xa4, 0x08, 0x1c, 0x6a, 0xa3, 0x07,
+ 0x39, 0x2f, 0xd5, 0x3e, 0x12, 0x14, 0x74, 0xa6, 0x88, 0xe8, 0x15, 0x4e,
+ 0x09, 0x05, 0xce, 0x62, 0x53, 0xf2, 0x40, 0x7b, 0x49, 0x58, 0xc8, 0x5d,
+ 0x29, 0x54, 0xb1, 0xfd, 0xb0, 0xb2, 0x75, 0x2c, 0x55, 0x9f, 0xf9, 0x57,
+ 0x58, 0xec, 0xfb, 0xff, 0xa3, 0xa0, 0x27, 0x02, 0x0e, 0xa7, 0x52, 0xe7,
+ 0x9e, 0xbd, 0xb6, 0x1d, 0xe6, 0x7e, 0xa2, 0xc0, 0x95, 0xe1, 0x4d, 0xd5,
+ 0x78, 0xce, 0x08, 0x2d, 0xff, 0x0b, 0xe8, 0x34, 0xa7, 0x53, 0x15, 0x67,
+ 0xfe, 0x55, 0xd6, 0x3b, 0x3e, 0xff, 0xe8, 0xe8, 0x09, 0xc0, 0x83, 0xa9,
+ 0xd4, 0xb9, 0xe7, 0xaf, 0x6d, 0x87, 0x79, 0x9f, 0xa8, 0xb0, 0x25, 0x78,
+ 0x92, 0x0e, 0x9d, 0xf7, 0x55, 0xd9, 0x1a, 0xc5, 0x48, 0x6c, 0xbe, 0x66,
+ 0xb0, 0xf7, 0xbf, 0x95, 0x75, 0x8e, 0xcf, 0xbf, 0xfa, 0x3a, 0x02, 0x70,
+ 0x20, 0xde, 0xb0, 0xe4, 0xe4, 0x0e, 0x59, 0x44, 0x11, 0x28, 0xe1, 0x22,
+ 0xe8, 0x0e, 0x5b, 0x62, 0x69, 0x46, 0xb2, 0x1a, 0x9b, 0x63, 0x75, 0x31,
+ 0xb9, 0x4a, 0x90, 0x8d, 0x2e, 0xf8, 0xa8, 0xdb, 0x5a, 0x31, 0xcf, 0x9c,
+ 0x99, 0xd5, 0x85, 0x99, 0x5e, 0x0a, 0x51, 0x8d, 0x0d, 0x77, 0x3c, 0x51,
+ 0xe1, 0x98, 0x1c, 0x5a, 0xc1, 0xea, 0x38, 0x93, 0x44, 0xd7, 0xb6, 0xbb,
+ 0xa1, 0x0f, 0x38, 0x75, 0x5e, 0xff, 0x2d, 0x93, 0xfa, 0x7d, 0xca, 0xf6,
+ 0xb7, 0x4f, 0x5e, 0xbd, 0x3f, 0xbc, 0xb6, 0xc6, 0x7b, 0xae, 0x23, 0x97,
+ 0xc7, 0xcb, 0xa7, 0x98, 0x37, 0xf4, 0xd6, 0x0c, 0x12, 0xd6, 0xad, 0xc7,
+ 0x51, 0xb3, 0x0e, 0x88, 0x40, 0xfd, 0xf7, 0x1b, 0x29, 0xcf, 0xb8, 0x7c,
+ 0x29, 0xa1, 0xa2, 0x72, 0x05, 0xa1, 0x0f, 0x43, 0xa8, 0xc4, 0x24, 0x49,
+ 0x96, 0xbf, 0x56, 0xe4, 0xbf, 0xc7, 0x71, 0x5a, 0x18, 0x85, 0x65, 0xdd,
+ 0x17, 0x95, 0x30, 0x18, 0x8b, 0x18, 0xd2, 0xb2, 0x3f, 0x2e, 0xe9, 0x69,
+ 0x89, 0x90, 0xe0, 0x24, 0x08, 0x13, 0x23, 0x0a, 0x78, 0x59, 0x1e, 0xe6,
+ 0x33, 0x0f, 0x12, 0x73, 0xba, 0xb3, 0x3c, 0x1d, 0x05, 0x71, 0x7a, 0xd7,
+ 0x87, 0xd3, 0xaa, 0x7c, 0xb9, 0x3f, 0x74, 0x95, 0x62, 0xfc, 0x85, 0xac,
+ 0xe0, 0xe9, 0xaa, 0x6f, 0x48, 0x4b, 0xdf, 0xb6, 0x9a, 0x7c, 0x24, 0x28,
+ 0xe3, 0x6e, 0x40, 0xbd, 0x03, 0xab, 0xc5, 0xb5, 0x4e, 0xd3, 0xb4, 0xef,
+ 0x23, 0x1e, 0x6e, 0xab, 0xc6, 0x70, 0x41, 0x6f, 0xf8, 0x5f, 0x41, 0xa5,
+ 0x3a, 0x98, 0xab, 0x3f, 0xf2, 0xae, 0xb1, 0xd9, 0xf7, 0xff, 0xf0, 0x29,
+ 0xdf, 0x01, 0xed, 0xe9, 0xa3, 0x49, 0xc6, 0x1a, 0xec, 0xa3, 0x4e, 0x59,
+ 0x4b, 0xcd, 0x01, 0xcb, 0x6c, 0x4d, 0x28, 0xd6, 0x43, 0x53, 0x6c, 0x6e,
+ 0xa6, 0x37, 0x29, 0x52, 0x11, 0xa5, 0xdf, 0x15, 0x1b, 0x6b, 0x46, 0x3a,
+ 0x25, 0x93, 0x5c, 0x76, 0xdc, 0x12, 0xb8, 0x3e, 0xe0, 0xc4, 0xb8, 0xf8,
+ 0x96, 0x8e, 0xde, 0x49, 0xff, 0x58, 0x3d, 0x47, 0x12, 0x68, 0x9a, 0xf6,
+ 0xd7, 0x74, 0x21, 0xe7, 0x0e, 0xab, 0xdf, 0xe5, 0xb2, 0x7f, 0x4f, 0xb9,
+ 0x5e, 0xd6, 0xf7, 0x7a, 0xc8, 0x7e, 0xd7, 0xc0, 0x81, 0x63, 0xff, 0x84,
+ 0x30, 0x67, 0x40, 0x95, 0xcb, 0x03, 0x6b, 0xfb, 0x08, 0xd3, 0x09, 0xa8,
+ 0x93, 0x11, 0xf7, 0xf3, 0x68, 0x89, 0x79, 0x0d, 0x74, 0xce, 0xe9, 0xc6,
+ 0x83, 0xcd, 0xe0, 0x54, 0x51, 0xff, 0xe2, 0x3d, 0x76, 0x94, 0x72, 0xed,
+ 0xb3, 0x66, 0x98, 0x97, 0xd9, 0x0b, 0x3b, 0x1d, 0x75, 0xc8, 0xfd, 0x9a,
+ 0x15, 0x8a, 0x7c, 0xe9, 0xb6, 0x8e, 0x59, 0xf1, 0xbe, 0x8f, 0xe4, 0x3d,
+ 0xdd, 0x72, 0x98, 0x71, 0xe5, 0xef, 0xdc, 0x86, 0x2f, 0x9d, 0x75, 0x8c,
+ 0xe9, 0xbf, 0xd1, 0x89, 0xae, 0x44, 0xda, 0xa7, 0x69, 0xda, 0x77, 0x91,
+ 0x8f, 0x37, 0x55, 0xe3, 0x38, 0x20, 0xb7, 0xfc, 0x2f, 0xa0, 0xd2, 0x9d,
+ 0x4c, 0x55, 0x9f, 0xf9, 0x57, 0x58, 0xec, 0xfb, 0xff, 0xf8, 0x14, 0xef,
+ 0x80, 0xf6, 0xf4, 0xd1, 0xa4, 0xe3, 0x0d, 0x76, 0x51, 0xa7, 0x2c, 0xa5,
+ 0xe6, 0x80, 0xe5, 0xb6, 0x26, 0x94, 0x6b, 0x21, 0xa9, 0xb6, 0x37, 0x53,
+ 0x1b, 0x94, 0xa9, 0x08, 0xd2, 0xef, 0x8a, 0x8d, 0xb5, 0xa3, 0x1d, 0x12,
+ 0xc9, 0xae, 0x3b, 0x6e, 0x09, 0x5c, 0x1f, 0x70, 0x62, 0x5c, 0x7c, 0x4b,
+ 0x47, 0x6f, 0x24, 0xff, 0xac, 0x1e, 0xa3, 0x89, 0x34, 0x4d, 0x7b, 0x6b,
+ 0xba, 0x10, 0xf3, 0x87, 0x55, 0xef, 0xf2, 0xd9, 0x3f, 0xa7, 0xdc, 0xaf,
+ 0x6b, 0x7b, 0xbd, 0x64, 0x3f, 0x6b, 0xe0, 0x40, 0xb1, 0xff, 0xc2, 0x18,
+ 0x33, 0xa0, 0x4a, 0xe5, 0x81, 0xb5, 0xfd, 0x84, 0x69, 0x84, 0xd4, 0x49,
+ 0x88, 0xfb, 0xf9, 0xb4, 0x44, 0xbc, 0x86, 0xba, 0x67, 0x74, 0xe3, 0x41,
+ 0xe6, 0xf0, 0x2a, 0x28, 0xff, 0xf1, 0x1e, 0xbb, 0x4a, 0x39, 0x76, 0xd9,
+ 0xb3, 0x4c, 0x4b, 0xec, 0x85, 0x9d, 0x8e, 0xba, 0xe4, 0x7e, 0xcd, 0x0a,
+ 0xc5, 0x3e, 0x74, 0xdb, 0x47, 0x2c, 0xf8, 0xdf, 0x47, 0xf2, 0x1e, 0xee,
+ 0xb9, 0x4c, 0x38, 0xf2, 0xf7, 0xee, 0x43, 0x17, 0xce, 0xba, 0xc6, 0x74,
+ 0xdf, 0xe8, 0xc4, 0xd7, 0x22, 0x6d, 0x53, 0xb4, 0xed, 0x3b, 0xc8, 0xc7,
+ 0x9b, 0xaa, 0xf1, 0x9c, 0x10, 0x5b, 0xfe, 0x17, 0xd0, 0x69, 0x4e, 0xa6,
+ 0x2a, 0xcf, 0xfc, 0xab, 0xac, 0x76, 0x7d, 0xff, 0xfc, 0x0a, 0x77, 0xc0,
+ 0x7b, 0x7a, 0x68, 0xd2, 0x71, 0x86, 0xbb, 0x28, 0xd3, 0x96, 0x52, 0xf3,
+ 0x40, 0x72, 0xdb, 0x13, 0x4a, 0x35, 0x90, 0xd4, 0xdb, 0x1b, 0xa9, 0x8d,
+ 0xca, 0x54, 0x84, 0x69, 0x77, 0xc5, 0x46, 0xda, 0xd1, 0x8e, 0x89, 0x64,
+ 0xd7, 0x1d, 0xb7, 0x04, 0xae, 0x0f, 0xb8, 0x31, 0x2e, 0x3e, 0x25, 0xa3,
+ 0xb7, 0x92, 0x7f, 0xd6, 0x0f, 0x51, 0xc4, 0x9a, 0x26, 0xbd, 0xb5, 0xdd,
+ 0x08, 0x79, 0xc3, 0xaa, 0xf7, 0xf9, 0x6c, 0x9f, 0xd3, 0xee, 0x57, 0xb5,
+ 0xbd, 0xde, 0xb2, 0x1f, 0xb5, 0xf0, 0x20, 0x58, 0xff, 0xe1, 0x0c, 0x19,
+ 0xd0, 0x25, 0x72, 0xc0, 0xda, 0xfe, 0xc2, 0x34, 0xc2, 0x6a, 0x24, 0xc4,
+ 0x7d, 0xfc, 0xda, 0x22, 0x5e, 0x43, 0x5d, 0x33, 0xba, 0x71, 0xa0, 0xf3,
+ 0x78, 0x15, 0x14, 0x7f, 0xf8, 0x8f, 0x5d, 0xa5, 0x1c, 0xbb, 0x6c, 0xd9,
+ 0xa6, 0x25, 0xf6, 0x42, 0xce, 0xc7, 0x5d, 0x72, 0x3f, 0x66, 0x85, 0x62,
+ 0x9f, 0x3a, 0x6d, 0xa3, 0x96, 0x7c, 0x6f, 0xa3, 0xf9, 0x0f, 0x77, 0x5c,
+ 0xa6, 0x1c, 0x79, 0x7b, 0xf7, 0x21, 0x8b, 0xe7, 0x5d, 0x63, 0x3a, 0x6f,
+ 0xf4, 0x62, 0x6b, 0x91, 0x36, 0xa9, 0xda, 0x76, 0x9d, 0xe4, 0x63, 0xcd,
+ 0xd5, 0x78, 0xce, 0x08, 0x2d, 0xff, 0x0b, 0xe8, 0x34, 0xa7, 0x53, 0x15,
+ 0x67, 0xfe, 0x55, 0xd6, 0x3b, 0x3e, 0xff, 0xfe, 0x05, 0x3b, 0xe0, 0x3d,
+ 0xbd, 0x34, 0x69, 0x38, 0xc3, 0x5d, 0x94, 0x69, 0xcb, 0x29, 0x79, 0xa0,
+ 0x39, 0x6d, 0x89, 0xa5, 0x1a, 0xc8, 0x6a, 0x6d, 0x8d, 0xd4, 0xc6, 0xe5,
+ 0x2a, 0x42, 0x34, 0xbb, 0xe2, 0xa3, 0x6d, 0x68, 0xc7, 0x44, 0xb2, 0x6b,
+ 0x8e, 0xdb, 0x82, 0x57, 0x07, 0xdc, 0x18, 0x97, 0x1f, 0x12, 0xd1, 0xdb,
+ 0xc9, 0x3f, 0xeb, 0x07, 0xa8, 0xe2, 0x4d, 0x13, 0x5e, 0xda, 0xee, 0x84,
+ 0x3c, 0xe1, 0xd5, 0x7b, 0xfc, 0xb6, 0x4f, 0xe9, 0xf7, 0x2b, 0xda, 0xde,
+ 0xef, 0x59, 0x0f, 0xda, 0xf8, 0x10, 0x2c, 0x7f, 0xf0, 0x86, 0x0c, 0xe8,
+ 0x12, 0xb9, 0x60, 0x6d, 0x7f, 0x61, 0x1a, 0x61, 0x35, 0x12, 0x62, 0x3e,
+ 0xfe, 0x6d, 0x11, 0x2f, 0x21, 0xae, 0x99, 0xdd, 0x38, 0xd0, 0x79, 0xbc,
+ 0x0a, 0x8a, 0x3f, 0xfc, 0x47, 0xae, 0xd2, 0x8e, 0x5d, 0xb6, 0x6c, 0xd3,
+ 0x12, 0xfb, 0x21, 0x67, 0x63, 0xae, 0xb9, 0x1f, 0xb3, 0x42, 0xb1, 0x4f,
+ 0x9d, 0x36, 0xd1, 0xcb, 0x3e, 0x37, 0xd1, 0xfc, 0x87, 0xbb, 0xae, 0x53,
+ 0x0e, 0x3c, 0xbd, 0xfb, 0x90, 0xc5, 0xf3, 0xae, 0xb1, 0x9d, 0x37, 0xfa,
+ 0x31, 0x35, 0xc8, 0x9b, 0x54, 0xed, 0x3b, 0x4e, 0xf2, 0x31, 0xe6, 0xea,
+ 0xbc, 0x67, 0x04, 0x16, 0xff, 0x85, 0xf4, 0x1a, 0x53, 0xa9, 0x8a, 0xb3,
+ 0xff, 0x2a, 0xeb, 0x1d, 0x9f, 0x7f, 0xff, 0x02, 0x9d, 0xf0, 0x1e, 0xde,
+ 0x9a, 0x34, 0x9c, 0x61, 0xae, 0xca, 0x34, 0xe5, 0x94, 0xbc, 0xd0, 0x1c,
+ 0xb6, 0xc4, 0xd2, 0x8d, 0x64, 0x35, 0x36, 0xc6, 0xea, 0x63, 0x72, 0x95,
+ 0x21, 0x1a, 0x5d, 0xf1, 0x51, 0xb6, 0xb4, 0x63, 0xa2, 0x59, 0x35, 0xc7,
+ 0x6d, 0xc1, 0x2b, 0x83, 0xee, 0x0c, 0x4b, 0x8f, 0x89, 0x68, 0xed, 0xe4,
+ 0x9f, 0xf5, 0x83, 0xd4, 0x71, 0x26, 0x89, 0xaf, 0x6d, 0x77, 0x42, 0x1e,
+ 0x70, 0xea, 0xbd, 0xfe, 0x5b, 0x27, 0xf4, 0xfb, 0x95, 0xed, 0x6f, 0x77,
+ 0xac, 0x87, 0xed, 0x7c, 0x08, 0x16, 0x3f, 0xf8, 0x43, 0x06, 0x74, 0x09,
+ 0x5c, 0xb0, 0x36, 0xbf, 0xb0, 0x8d, 0x30, 0x9a, 0x89, 0x31, 0x1f, 0x7f,
+ 0x36, 0x88, 0x97, 0x90, 0xd7, 0x4c, 0xee, 0x9c, 0x68, 0x3c, 0xde, 0x05,
+ 0x45, 0x1f, 0xfe, 0x23, 0xd7, 0x69, 0x47, 0x2e, 0xdb, 0x36, 0x69, 0x89,
+ 0x7d, 0x90, 0xb3, 0xb1, 0xd7, 0x5c, 0x8f, 0xd9, 0xa1, 0x58, 0xa7, 0xce,
+ 0x9b, 0x68, 0xe5, 0x9f, 0x1b, 0xe8, 0xfe, 0x43, 0xdd, 0xd7, 0x29, 0x87,
+ 0x1e, 0x5e, 0xfd, 0xc8, 0x62, 0xf9, 0xd7, 0x58, 0xce, 0x9b, 0xfd, 0x18,
+ 0x9a, 0xe4, 0x4d, 0xaa, 0x76, 0x9d, 0xa7, 0x79, 0x18, 0xf3, 0x75, 0x5e,
+ 0x33, 0x82, 0x0b, 0x7f, 0xc2, 0xfa, 0x0d, 0x29, 0xd4, 0xc5, 0x59, 0xff,
+ 0x95, 0x75, 0x8e, 0xcf, 0xbf, 0xff, 0x81, 0x4e, 0xf8, 0x0f, 0x6f, 0x4d,
+ 0x1a, 0x4e, 0x30, 0xd7, 0x65, 0x1a, 0x72, 0xca, 0x5e, 0x68, 0x0e, 0x5b,
+ 0x62, 0x69, 0x46, 0xb2, 0x1a, 0x9b, 0x63, 0x75, 0x31, 0xb9, 0x4a, 0x90,
+ 0x8d, 0x2e, 0xf8, 0xa8, 0xdb, 0x5a, 0x31, 0xd1, 0x2c, 0x9a, 0xe3, 0xb6,
+ 0xe0, 0x95, 0xc1, 0xf7, 0x06, 0x25, 0xc7, 0xc4, 0xb4, 0x76, 0xf2, 0x4f,
+ 0xfa, 0xc1, 0xea, 0x38, 0x93, 0x44, 0xd7, 0xb6, 0xbb, 0xa1, 0x0f, 0x38,
+ 0x75, 0x5e, 0xff, 0x2d, 0x93, 0xfa, 0x7d, 0xca, 0xf6, 0xb7, 0xbb, 0xd6,
+ 0x43, 0xf6, 0xbe, 0x04, 0x0b, 0x1f, 0xfc, 0x21, 0x83, 0x3a, 0x04, 0xae,
+ 0x58, 0x1b, 0x5f, 0xd8, 0x46, 0x98, 0x4d, 0x44, 0x98, 0x8f, 0xbf, 0x9b,
+ 0x44, 0x4b, 0xc8, 0x6b, 0xa6, 0x77, 0x4e, 0x34, 0x1e, 0x6f, 0x02, 0xa2,
+ 0x8f, 0xff, 0x11, 0xeb, 0xb4, 0xa3, 0x97, 0x6d, 0x9b, 0x34, 0xc4, 0xbe,
+ 0xc8, 0x59, 0xd8, 0xeb, 0xae, 0x47, 0xec, 0xd0, 0xac, 0x53, 0xe7, 0x4d,
+ 0xb4, 0x72, 0xcf, 0x8d, 0xf4, 0x7f, 0x21, 0xee, 0xeb, 0x94, 0xc3, 0x8f,
+ 0x2f, 0x7e, 0xe4, 0x31, 0x7c, 0xeb, 0xac, 0x67, 0x4d, 0xfe, 0x8c, 0x4d,
+ 0x72, 0x26, 0xd5, 0x3b, 0x4e, 0xd3, 0xbc, 0x8c, 0x79, 0xba, 0xaf, 0x19,
+ 0xc1, 0x05, 0xbf, 0xe1, 0x7d, 0x06, 0x94, 0xea, 0x62, 0xac, 0xff, 0xca,
+ 0xba, 0xc7, 0x67, 0xdf, 0xff, 0xc0, 0xa7, 0x7c, 0x07, 0xb7, 0xa6, 0x8d,
+ 0x27, 0x18, 0x6b, 0xb2, 0x8d, 0x39, 0x65, 0x2f, 0x34, 0x07, 0x2d, 0xb1,
+ 0x34, 0xa3, 0x59, 0x0d, 0x4d, 0xb1, 0xba, 0x98, 0xdc, 0xa5, 0x48, 0x46,
+ 0x97, 0x7c, 0x54, 0x6d, 0xad, 0x18, 0xe8, 0x96, 0x4d, 0x71, 0xdb, 0x70,
+ 0x4a, 0xe0, 0xfb, 0x83, 0x12, 0xe3, 0xe2, 0x5a, 0x3b, 0x79, 0x27, 0xfd,
+ 0x60, 0xf5, 0x1c, 0x49, 0xa2, 0x6b, 0xdb, 0x5d, 0xd0, 0x87, 0x9c, 0x3a,
+ 0xaf, 0x7f, 0x96, 0xc9, 0xfd, 0x3e, 0xe5, 0x7b, 0x5b, 0xdd, 0xeb, 0x21,
+ 0xfb, 0x5f, 0x02, 0x05, 0x8f, 0xfe, 0x10, 0xc1, 0x9d, 0x02, 0x57, 0x2c,
+ 0x0d, 0xaf, 0xec, 0x23, 0x4c, 0x26, 0xa2, 0x4c, 0x47, 0xdf, 0xcd, 0xa2,
+ 0x25, 0xe4, 0x35, 0xd3, 0x3b, 0xa7, 0x1a, 0x0f, 0x37, 0x81, 0x51, 0x47,
+ 0xff, 0x88, 0xf5, 0xda, 0x51, 0xcb, 0xb6, 0xcd, 0x9a, 0x62, 0x5f, 0x64,
+ 0x2c, 0xec, 0x75, 0xd7, 0x23, 0xf6, 0x68, 0x56, 0x29, 0xf3, 0xa6, 0xda,
+ 0x39, 0x67, 0xc6, 0xfa, 0x3f, 0x90, 0xf7, 0x75, 0xca, 0x61, 0xc7, 0x97,
+ 0xbf, 0x72, 0x18, 0xbe, 0x75, 0xd6, 0x33, 0xa6, 0xff, 0x46, 0x26, 0xb9,
+ 0x13, 0x6a, 0x9d, 0xa7, 0x69, 0xde, 0x46, 0x3c, 0xdd, 0x57, 0x8c, 0xe0,
+ 0x82, 0xdf, 0xf0, 0xbe, 0x83, 0x4a, 0x75, 0x31, 0x56, 0x7f, 0xe5, 0x5d,
+ 0x63, 0xb3, 0xef, 0xff, 0xe0, 0x53, 0xbe, 0x03, 0xdb, 0xd3, 0x46, 0x93,
+ 0x8c, 0x35, 0xd9, 0x46, 0x9c, 0xb2, 0x97, 0x9a, 0x03, 0x96, 0xd8, 0x9a,
+ 0x51, 0xac, 0x86, 0xa6, 0xd8, 0xdd, 0x4c, 0x6e, 0x52, 0xa4, 0x23, 0x4b,
+ 0xbe, 0x2a, 0x36, 0xd6, 0x8c, 0x74, 0x4b, 0x26, 0xb8, 0xed, 0xb8, 0x25,
+ 0x70, 0x7d, 0xc1, 0x89, 0x71, 0xf1, 0x2d, 0x1d, 0xbc, 0x93, 0xfe, 0xb0,
+ 0x7a, 0x8e, 0x24, 0xd1, 0x35, 0xed, 0xae, 0xe8, 0x43, 0xce, 0x1d, 0x57,
+ 0xbf, 0xcb, 0x64, 0xfe, 0x9f, 0x72, 0xbd, 0xad, 0xee, 0xf5, 0x90, 0xfd,
+ 0xaf, 0x81, 0x02, 0xc7, 0xff, 0x08, 0x60, 0xce, 0x81, 0x2b, 0x96, 0x06,
+ 0xd7, 0xf6, 0x11, 0xa6, 0x13, 0x51, 0x26, 0x23, 0xef, 0xe6, 0xd1, 0x12,
+ 0xf2, 0x1a, 0xe9, 0x9d, 0xd3, 0x8d, 0x07, 0x9b, 0xc0, 0xa8, 0xa3, 0xff,
+ 0xc4, 0x7a, 0xed, 0x28, 0xe5, 0xdb, 0x66, 0xcd, 0x31, 0x2f, 0xb2, 0x16,
+ 0x76, 0x3a, 0xeb, 0x91, 0xfb, 0x34, 0x2b, 0x14, 0xf9, 0xd3, 0x6d, 0x1c,
+ 0xb3, 0xe3, 0x7d, 0x1f, 0xc8, 0x7b, 0xba, 0xe5, 0x30, 0xe3, 0xcb, 0xdf,
+ 0xb9, 0x0c, 0x5f, 0x3a, 0xeb, 0x19, 0xd3, 0x7f, 0xa3, 0x13, 0x5c, 0x89,
+ 0xb5, 0x4e, 0xd3, 0xb4, 0xef, 0x23, 0x1e, 0x6e, 0xab, 0xc6, 0x70, 0x41,
+ 0x6f, 0xf8, 0x5f, 0x41, 0xa5, 0x3a, 0x98, 0xab, 0x3f, 0xf2, 0xae, 0xb1,
+ 0xd9, 0xf7, 0xff, 0xf0, 0x29, 0xdf, 0x01, 0xed, 0xe9, 0xa3, 0x49, 0xc6,
+ 0x1a, 0xec, 0xa3, 0x4e, 0x59, 0x4b, 0xcd, 0x01, 0xcb, 0x6c, 0x4d, 0x28,
+ 0xd6, 0x43, 0x53, 0x6c, 0x6e, 0xa6, 0x37, 0x29, 0x52, 0x11, 0xa5, 0xdf,
+ 0x15, 0x1b, 0x6b, 0x46, 0x3a, 0x25, 0x93, 0x5c, 0x76, 0xdc, 0x12, 0xb8,
+ 0x3e, 0xe0, 0xc4, 0xb8, 0xf8, 0x96, 0x8e, 0xde, 0x49, 0xff, 0x58, 0x3d,
+ 0x47, 0x12, 0x68, 0x9a, 0xf6, 0xd7, 0x74, 0x21, 0xe7, 0x0e, 0xab, 0xdf,
+ 0xe5, 0xb2, 0x7f, 0x4f, 0xb9, 0x5e, 0xd6, 0xf7, 0x7a, 0xc8, 0x7e, 0xd7,
+ 0xc0, 0x81, 0x63, 0xff, 0x84, 0x30, 0x67, 0x40, 0x95, 0xcb, 0x03, 0x6b,
+ 0xfb, 0x08, 0xd3, 0x09, 0xa8, 0x93, 0x11, 0xf7, 0xf3, 0x68, 0x89, 0x79,
+ 0x0d, 0x74, 0xce, 0xe9, 0xc6, 0x83, 0xcd, 0xe0, 0x54, 0x51, 0xff, 0xe2,
+ 0x3d, 0x76, 0x94, 0x72, 0xed, 0xb3, 0x66, 0x98, 0x97, 0xd9, 0x0b, 0x3b,
+ 0x1d, 0x75, 0xc8, 0xfd, 0x9a, 0x15, 0x8a, 0x7c, 0xe9, 0xb6, 0x8e, 0x59,
+ 0xf1, 0xbe, 0x8f, 0xe4, 0x3d, 0xdd, 0x72, 0x98, 0x71, 0xe5, 0xef, 0xdc,
+ 0x86, 0x2f, 0x9d, 0x75, 0x8c, 0xe9, 0xbf, 0xd1, 0x89, 0xae, 0x44, 0xda,
+ 0xa7, 0x69, 0xda, 0x77, 0x91, 0x8f, 0x37, 0x55, 0xe3, 0x38, 0x20, 0xb7,
+ 0xfc, 0x2f, 0xa0, 0xd2, 0x9d, 0x4c, 0x55, 0x9f, 0xf9, 0x57, 0x58, 0xec,
+ 0xfb, 0xff, 0xf8, 0x14, 0xef, 0x80, 0xf6, 0xf4, 0xd1, 0xa4, 0xe3, 0x0d,
+ 0x76, 0x51, 0xa7, 0x2c, 0xa5, 0xe6, 0x80, 0xe5, 0xb6, 0x26, 0x94, 0x6b,
+ 0x21, 0xa9, 0xb6, 0x37, 0x53, 0x1b, 0x94, 0xa9, 0x08, 0xd2, 0xef, 0x8a,
+ 0x8d, 0xb5, 0xa3, 0x1d, 0x12, 0xc9, 0xae, 0x3b, 0x6e, 0x09, 0x5c, 0x1f,
+ 0x70, 0x62, 0x5c, 0x7c, 0x4b, 0x47, 0x6f, 0x24, 0xff, 0xac, 0x1e, 0xa3,
+ 0x89, 0x34, 0x4d, 0x7b, 0x6b, 0xba, 0x10, 0xf3, 0x87, 0x55, 0xef, 0xf2,
+ 0xd9, 0x3f, 0xa7, 0xdc, 0xaf, 0x6b, 0x7b, 0xbd, 0x64, 0x3f, 0x6b, 0xe0,
+ 0x40, 0xb1, 0xff, 0xc2, 0x18, 0x33, 0xa0, 0x4a, 0xe5, 0x81, 0xb5, 0xfd,
+ 0x84, 0x69, 0x84, 0xd4, 0x49, 0x88, 0xfb, 0xf9, 0xb4, 0x44, 0xbc, 0x86,
+ 0xba, 0x67, 0x74, 0xe3, 0x41, 0xe6, 0xf0, 0x2a, 0x28, 0xff, 0xf1, 0x1e,
+ 0xbb, 0x4a, 0x39, 0x76, 0xd9, 0xb3, 0x4c, 0x4b, 0xec, 0x85, 0x9d, 0x8e,
+ 0xba, 0xe4, 0x7e, 0xcd, 0x0a, 0xc5, 0x3e, 0x74, 0xdb, 0x47, 0x2c, 0xf8,
+ 0xdf, 0x47, 0xf2, 0x1e, 0xee, 0xb9, 0x4c, 0x38, 0xf2, 0xf7, 0xee, 0x43,
+ 0x17, 0xce, 0xba, 0xc6, 0x74, 0xdf, 0xe8, 0xc4, 0xd7, 0x22, 0x6d, 0x53,
+ 0xb4, 0xed, 0x3b, 0xc8, 0xc7, 0x9b, 0xaa, 0xf1, 0x9c, 0x10, 0x5b, 0xfe,
+ 0x17, 0xd0, 0x69, 0x4e, 0xa6, 0x2a, 0xcf, 0xfc, 0xab, 0xac, 0x76, 0x7d,
+ 0xff, 0xfc, 0x0a, 0x77, 0xc0, 0x7b, 0x7a, 0x68, 0xd2, 0x71, 0x86, 0xbb,
+ 0x28, 0xd3, 0x96, 0x52, 0xf3, 0x40, 0x72, 0xdb, 0x13, 0x4a, 0x35, 0x90,
+ 0xd4, 0xdb, 0x1b, 0xa9, 0x8d, 0xca, 0x54, 0x84, 0x69, 0x77, 0xc5, 0x46,
+ 0xda, 0xd1, 0x8e, 0x89, 0x64, 0xd7, 0x1d, 0xb7, 0x04, 0xae, 0x0f, 0xb8,
+ 0x31, 0x2e, 0x3e, 0x25, 0xa3, 0xb7, 0x92, 0x7f, 0xd6, 0x0f, 0x51, 0xc4,
+ 0x9a, 0x26, 0xbd, 0xb5, 0xdd, 0x08, 0x79, 0xc3, 0xaa, 0xf7, 0xf9, 0x6c,
+ 0x9f, 0xd3, 0xee, 0x57, 0xb5, 0xbd, 0xde, 0xb2, 0x1f, 0xb5, 0xf0, 0x20,
+ 0x58, 0xff, 0xe1, 0x0c, 0x19, 0xd0, 0x25, 0x72, 0xc0, 0xda, 0xfe, 0xc2,
+ 0x34, 0xc2, 0x6a, 0x24, 0xc4, 0x7d, 0xfc, 0xda, 0x22, 0x5e, 0x43, 0x5d,
+ 0x33, 0xba, 0x71, 0xa0, 0xf3, 0x78, 0x15, 0x14, 0x7f, 0xf8, 0x8f, 0x5d,
+ 0xa5, 0x1c, 0xbb, 0x6c, 0xd9, 0xa6, 0x25, 0xf6, 0x42, 0xce, 0xc7, 0x5d,
+ 0x72, 0x3f, 0x66, 0x85, 0x62, 0x9f, 0x3a, 0x6d, 0xa3, 0x96, 0x7c, 0x6f,
+ 0xa3, 0xf9, 0x0f, 0x77, 0x5c, 0xa6, 0x1c, 0x79, 0x7b, 0xf7, 0x21, 0x8b,
+ 0xe7, 0x5d, 0x63, 0x3a, 0x6f, 0xf4, 0x62, 0x6b, 0x91, 0x36, 0xa9, 0xda,
+ 0x76, 0x9d, 0xe4, 0x63, 0xcd, 0xd5, 0x78, 0xce, 0x08, 0x2d, 0xff, 0x0b,
+ 0xe8, 0x34, 0xa7, 0x53, 0x15, 0x67, 0xfe, 0x55, 0xd6, 0x3b, 0x3e, 0xff,
+ 0xfe, 0x05, 0x3b, 0xe0, 0x3d, 0xbd, 0x34, 0x69, 0x38, 0xc3, 0x5d, 0x94,
+ 0x69, 0xcb, 0x29, 0x79, 0xa0, 0x39, 0x6d, 0x89, 0xa5, 0x1a, 0xc8, 0x6a,
+ 0x6d, 0x8d, 0xd4, 0xc6, 0xe5, 0x2a, 0x42, 0x34, 0xbb, 0xe2, 0xa3, 0x6d,
+ 0x68, 0xc7, 0x44, 0xb2, 0x6b, 0x8e, 0xdb, 0x82, 0x57, 0x07, 0xdc, 0x18,
+ 0x97, 0x1f, 0x12, 0xd1, 0xdb, 0xc9, 0x3f, 0xeb, 0x07, 0xa8, 0xe2, 0x4d,
+ 0x13, 0x5e, 0xda, 0xee, 0x84, 0x3c, 0xe1, 0xd5, 0x7b, 0xfc, 0xb6, 0x4f,
+ 0xe9, 0xf7, 0x2b, 0xda, 0xde, 0xef, 0x59, 0x0f, 0xda, 0xf8, 0x10, 0x2c,
+ 0x7f, 0xf0, 0x86, 0x0c, 0xe8, 0x12, 0xb9, 0x60, 0x6d, 0x7f, 0x61, 0x1a,
+ 0x61, 0x35, 0x12, 0x62, 0x3e, 0xfe, 0x6d, 0x11, 0x2f, 0x21, 0xae, 0x99,
+ 0xdd, 0x38, 0xd0, 0x79, 0xbc, 0x0a, 0x8a, 0x3f, 0xfc, 0x47, 0xae, 0xd2,
+ 0x8e, 0x5d, 0xb6, 0x6c, 0xd3, 0x12, 0xfb, 0x21, 0x67, 0x63, 0xae, 0xb9,
+ 0x1f, 0xb3, 0x42, 0xb1, 0x4f, 0x9d, 0x36, 0xd1, 0xcb, 0x3e, 0x37, 0xd1,
+ 0xfc, 0x87, 0xbb, 0xae, 0x53, 0x0e, 0x3c, 0xbd, 0xfb, 0x90, 0xc5, 0xf3,
+ 0xae, 0xb1, 0x9d, 0x37, 0xfa, 0x31, 0x35, 0xc8, 0x9b, 0x54, 0xed, 0x3b,
+ 0x4e, 0xf2, 0x31, 0xe6, 0xea, 0xbc, 0x67, 0x04, 0x16, 0xff, 0x85, 0xf4,
+ 0x1a, 0x53, 0xa9, 0x8a, 0xb3, 0xff, 0x2a, 0xeb, 0x1d, 0x9f, 0x7f, 0xff,
+ 0x08,
+};
+static_assert(sizeof(kBytesTestReadSymbol11) == kNumBytesTestReadSymbol11, "");
+
+// The kBytesTestReadSymbol12[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][13] = {
+// // pmf: 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12,
+// // 1/12
+// { 32768 - 2731, 32768 - 5461, 32768 - 8192, 32768 - 10923, 32768 - 13653,
+// 32768 - 16384, 32768 - 19115, 32768 - 21845, 32768 - 24576,
+// 32768 - 27307, 32768 - 30037, 0, 0 },
+// // pmf: 3/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24,
+// // 1/24
+// { 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288, 32768 - 15019,
+// 32768 - 17749, 32768 - 20480, 32768 - 23211, 32768 - 25941,
+// 32768 - 28672, 32768 - 31403, 0, 0 },
+// // pmf: 1/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24,
+// // 3/24
+// { 32768 - 1365, 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288,
+// 32768 - 15019, 32768 - 17749, 32768 - 20480, 32768 - 23211,
+// 32768 - 25941, 32768 - 28672, 0, 0 },
+// // pmf: 1/24, 2/24, 2/24, 2/24, 2/24, 3/24, 3/24, 2/24, 2/24, 2/24, 2/24,
+// // 1/24
+// { 32768 - 1365, 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288,
+// 32768 - 16384, 32768 - 20480, 32768 - 23211, 32768 - 25941,
+// 32768 - 28672, 32768 - 31403, 0, 0 },
+// };
+// constexpr int kSymbols[24][4] = { { 0, 6, 11, 5 }, //
+// { 1, 7, 10, 4 }, //
+// { 2, 8, 9, 3 }, //
+// { 3, 9, 8, 2 }, //
+// { 4, 10, 7, 1 }, //
+// { 5, 11, 6, 0 }, //
+// { 6, 0, 5, 11 }, //
+// { 7, 1, 4, 10 }, //
+// { 8, 2, 3, 9 }, //
+// { 9, 3, 2, 8 }, //
+// { 10, 4, 1, 7 }, //
+// { 11, 5, 0, 6 }, //
+// { 0, 0, 11, 9 }, //
+// { 2, 1, 10, 7 }, //
+// { 4, 3, 8, 5 }, //
+// { 6, 5, 6, 3 }, //
+// { 8, 7, 4, 1 }, //
+// { 10, 9, 2, 10 }, //
+// { 1, 0, 11, 8 }, //
+// { 3, 2, 9, 6 }, //
+// { 5, 4, 7, 4 }, //
+// { 7, 6, 5, 2 }, //
+// { 9, 8, 3, 6 }, //
+// { 11, 10, 1, 5 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 80; ++i) {
+// for (int j = 0; j < 24; ++j) {
+// for (int k = 0; k < 4; ++k) {
+// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 12);
+// }
+// }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+// if (count++ % 12 == 0) {
+// printf("\n ");
+// } else {
+// printf(" ");
+// }
+// printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
+constexpr size_t kNumBytesTestReadSymbol12 = 3473;
+constexpr uint8_t kBytesTestReadSymbol12[] = {
+ 0x0d, 0x17, 0xf5, 0xbd, 0x05, 0xd0, 0x9c, 0x5d, 0x10, 0xc5, 0x9e, 0xc4,
+ 0x9f, 0xc6, 0xf4, 0x7d, 0xce, 0x67, 0x97, 0x49, 0xd1, 0x05, 0x54, 0xab,
+ 0xda, 0x22, 0x5b, 0xbc, 0x9c, 0x11, 0xc8, 0x0b, 0xe9, 0x6d, 0xb1, 0x8a,
+ 0x17, 0x06, 0x92, 0xed, 0xd4, 0x61, 0x48, 0x01, 0x64, 0x43, 0x65, 0x65,
+ 0xfc, 0x35, 0x9d, 0xbb, 0x68, 0x3f, 0x77, 0xbc, 0x8d, 0xd9, 0x3b, 0x48,
+ 0x77, 0x58, 0x2f, 0x19, 0xfa, 0x73, 0xa6, 0xc3, 0x65, 0x96, 0x6c, 0x9d,
+ 0x99, 0xb8, 0x65, 0x2b, 0x94, 0x11, 0x21, 0xf4, 0x95, 0xa4, 0xcd, 0xf2,
+ 0xbf, 0x65, 0x79, 0x34, 0x4b, 0xf6, 0x5c, 0xeb, 0xca, 0x07, 0x65, 0x4f,
+ 0xae, 0x67, 0xd8, 0xdf, 0xec, 0xc9, 0xd2, 0x26, 0x2e, 0xac, 0xea, 0xa2,
+ 0xbd, 0x0d, 0x79, 0x27, 0x91, 0xf5, 0x84, 0x89, 0xf9, 0x2a, 0xb3, 0x5e,
+ 0x48, 0x4b, 0x2b, 0x89, 0xc0, 0xa5, 0x9f, 0x94, 0x07, 0x82, 0x36, 0x11,
+ 0x65, 0x4d, 0xb0, 0xde, 0xac, 0xde, 0xac, 0xc0, 0x35, 0x7f, 0xf3, 0x9b,
+ 0x01, 0x0c, 0x35, 0x8b, 0xb5, 0x22, 0xb8, 0xea, 0x1c, 0xab, 0xbe, 0x08,
+ 0xd9, 0x23, 0x0a, 0x37, 0x95, 0x36, 0x3d, 0x28, 0xb3, 0x19, 0x34, 0x3a,
+ 0x47, 0xf8, 0x45, 0x33, 0x7a, 0x65, 0xae, 0x80, 0x48, 0x01, 0x20, 0xe8,
+ 0xcd, 0xb7, 0xce, 0xf7, 0xee, 0xd1, 0x50, 0x39, 0xec, 0xa6, 0x8b, 0xa0,
+ 0xb5, 0x56, 0x76, 0x1a, 0xb4, 0x6b, 0x31, 0xcf, 0x32, 0x0f, 0xb1, 0xba,
+ 0xb3, 0xa4, 0xb7, 0x34, 0xfe, 0x86, 0x87, 0xa7, 0x44, 0x70, 0x3b, 0x9e,
+ 0x94, 0xc5, 0x43, 0x82, 0xf1, 0x1a, 0xa1, 0x10, 0x05, 0x7c, 0x04, 0x63,
+ 0x5a, 0xfe, 0xc2, 0xb6, 0x15, 0x07, 0x3f, 0xb0, 0x3c, 0x43, 0x74, 0x33,
+ 0xec, 0xb8, 0xe0, 0xf5, 0x79, 0x48, 0x7c, 0x50, 0x4f, 0x4b, 0xb9, 0x08,
+ 0x33, 0xfd, 0x54, 0xd5, 0x6f, 0xdf, 0xca, 0xfe, 0x38, 0xa1, 0xeb, 0xa9,
+ 0xaf, 0xa5, 0x8f, 0xcf, 0xb3, 0xda, 0x77, 0x3f, 0x63, 0xcb, 0x98, 0x2b,
+ 0x71, 0x56, 0x60, 0xb4, 0x5c, 0x7d, 0x81, 0x85, 0xf3, 0x64, 0x9f, 0xf3,
+ 0xc2, 0xec, 0x2a, 0x27, 0x9b, 0x5e, 0x39, 0x30, 0x10, 0x0d, 0x43, 0xdb,
+ 0x9f, 0x7b, 0x8f, 0xb8, 0x09, 0xe2, 0x55, 0xb3, 0xc4, 0xb1, 0xeb, 0x23,
+ 0xcd, 0x32, 0xde, 0x58, 0xc2, 0x35, 0xda, 0x5c, 0x9a, 0xf8, 0x2d, 0xc6,
+ 0x19, 0x46, 0x64, 0x66, 0x5a, 0xdb, 0x53, 0xc8, 0x14, 0x41, 0xcc, 0x0c,
+ 0x3f, 0xff, 0x3e, 0xbe, 0x29, 0xba, 0x5f, 0x68, 0xa9, 0x31, 0x39, 0x79,
+ 0x2a, 0xfe, 0x14, 0x92, 0x8f, 0x2b, 0x31, 0xf1, 0x0a, 0x25, 0xd8, 0x22,
+ 0xe1, 0xc7, 0xcd, 0xda, 0xea, 0x88, 0xfa, 0x6a, 0xb0, 0x69, 0x77, 0xf6,
+ 0xd6, 0x46, 0xb9, 0xe6, 0x53, 0x09, 0x48, 0x65, 0xbd, 0xe6, 0xf8, 0xc0,
+ 0x04, 0x71, 0x26, 0x21, 0xe8, 0xf9, 0xc1, 0x71, 0x73, 0x6b, 0x3d, 0x73,
+ 0x16, 0x66, 0x38, 0xae, 0x59, 0xb9, 0xe3, 0x34, 0x8f, 0x17, 0x3c, 0x16,
+ 0xaa, 0x3f, 0x61, 0x49, 0xb3, 0x06, 0xcc, 0xb3, 0xcb, 0x7e, 0x42, 0xf1,
+ 0x2a, 0x0e, 0xb2, 0xcb, 0x1d, 0xf0, 0x0f, 0xc9, 0x20, 0xb1, 0x80, 0xce,
+ 0x08, 0xb9, 0xfa, 0xca, 0x3c, 0xd5, 0x67, 0x47, 0x36, 0x17, 0xc1, 0xf7,
+ 0x9d, 0x97, 0x79, 0x75, 0xee, 0xb0, 0xed, 0xfc, 0xd0, 0xdf, 0xc8, 0xa2,
+ 0xc1, 0xae, 0x51, 0x53, 0x88, 0x05, 0x95, 0x73, 0x7e, 0xd9, 0x3b, 0x9d,
+ 0xb0, 0x08, 0x37, 0xff, 0x51, 0x6f, 0xf9, 0xad, 0x60, 0xa5, 0x3a, 0xd6,
+ 0xba, 0xea, 0xf6, 0xea, 0x91, 0x2e, 0x5a, 0xa9, 0xbf, 0xe2, 0x52, 0x46,
+ 0x0c, 0xbd, 0x28, 0x2d, 0xa8, 0x5f, 0xc8, 0x41, 0x31, 0x53, 0x7a, 0x9f,
+ 0xfa, 0x73, 0x06, 0xc5, 0xae, 0x59, 0x8d, 0xe3, 0x0d, 0xfa, 0x99, 0x7f,
+ 0xee, 0xe4, 0x82, 0xd4, 0x36, 0x68, 0x09, 0x92, 0x09, 0xef, 0x70, 0x89,
+ 0xc6, 0xfa, 0xc7, 0x7e, 0x0f, 0x24, 0x8e, 0xad, 0x4e, 0xd9, 0x4c, 0x11,
+ 0xe7, 0x7d, 0x98, 0xf0, 0x80, 0x42, 0x0b, 0x86, 0x8d, 0x8e, 0x85, 0x97,
+ 0xd2, 0x11, 0x0f, 0x04, 0x59, 0xaf, 0xa5, 0xec, 0xda, 0x75, 0x64, 0x51,
+ 0x22, 0x7e, 0x38, 0x4b, 0xca, 0x9e, 0x82, 0x71, 0x72, 0x8d, 0x4c, 0xca,
+ 0xe1, 0x77, 0xe5, 0xe0, 0x9d, 0x64, 0x01, 0x48, 0x49, 0xcd, 0x3b, 0x90,
+ 0xd8, 0x9e, 0x15, 0x22, 0x76, 0xe0, 0x57, 0x06, 0x06, 0xaf, 0x2c, 0x09,
+ 0xce, 0x4c, 0xfa, 0x8b, 0xbf, 0xa1, 0x1b, 0xe3, 0xe7, 0xa5, 0xa0, 0xc0,
+ 0xc8, 0x4c, 0x79, 0x1b, 0xeb, 0x5d, 0xb8, 0x3b, 0x1c, 0x3f, 0xbc, 0x11,
+ 0x8f, 0xa0, 0x08, 0x2b, 0xd3, 0xe3, 0xca, 0xbc, 0x41, 0xc2, 0xa4, 0x4e,
+ 0xdc, 0x0a, 0xe1, 0x06, 0xef, 0x55, 0x13, 0xb3, 0xdd, 0xfd, 0xe2, 0x89,
+ 0x5f, 0xb5, 0xf6, 0xa9, 0xd7, 0xae, 0xc1, 0x14, 0xb6, 0x19, 0xd8, 0x5b,
+ 0x0f, 0x9a, 0xb0, 0xed, 0xc5, 0xc7, 0xa8, 0xa6, 0x08, 0x5a, 0x00, 0xad,
+ 0xf5, 0x9c, 0xb9, 0xd9, 0x45, 0x46, 0xf0, 0x9e, 0x2d, 0x55, 0xc6, 0x08,
+ 0x60, 0x0d, 0x9e, 0xa7, 0x68, 0xb6, 0xf7, 0xf3, 0xa9, 0x84, 0x7e, 0x63,
+ 0xe8, 0x48, 0x03, 0x1c, 0x15, 0x97, 0x94, 0xda, 0x04, 0xb2, 0xd0, 0x09,
+ 0xa5, 0x62, 0x21, 0x70, 0x88, 0x9f, 0xf5, 0x0c, 0x91, 0x0d, 0xbf, 0x69,
+ 0xe1, 0x6b, 0x4f, 0xc2, 0xf2, 0x32, 0xe1, 0x4b, 0xad, 0x58, 0xea, 0x0c,
+ 0x07, 0x13, 0x4a, 0x1b, 0x87, 0x6d, 0x6e, 0x2f, 0xb6, 0xc6, 0x30, 0x1e,
+ 0x2d, 0x1d, 0x5c, 0xdf, 0xd2, 0x5a, 0x88, 0xc8, 0x1c, 0xd9, 0xc3, 0x91,
+ 0x04, 0x45, 0x63, 0x11, 0x44, 0x35, 0x7f, 0x46, 0xf4, 0xd0, 0xd1, 0x73,
+ 0x9c, 0xae, 0x85, 0x5e, 0xda, 0xc7, 0xce, 0xb5, 0xbb, 0x3a, 0xb4, 0x67,
+ 0xa5, 0xad, 0xc6, 0x5e, 0x12, 0xc7, 0xc5, 0x72, 0xfc, 0x35, 0x2e, 0xae,
+ 0x46, 0x81, 0x22, 0x56, 0x6d, 0xc9, 0x36, 0x43, 0x17, 0x6b, 0x4d, 0x81,
+ 0xd6, 0x59, 0x35, 0x90, 0x3a, 0xd2, 0xde, 0x79, 0xbd, 0x21, 0xc4, 0x56,
+ 0xcb, 0x59, 0x3b, 0xe7, 0xb3, 0xab, 0x92, 0xce, 0x65, 0xc7, 0x20, 0xde,
+ 0xde, 0xb1, 0x94, 0xac, 0x1a, 0x23, 0xa4, 0x14, 0x56, 0x32, 0xc0, 0x9f,
+ 0x48, 0x31, 0xa6, 0x95, 0xc4, 0xb8, 0xf3, 0x9c, 0x8d, 0x34, 0x03, 0xc3,
+ 0x62, 0x63, 0x38, 0x15, 0x71, 0x08, 0x5e, 0x1b, 0xc0, 0xf2, 0x54, 0x13,
+ 0x66, 0x01, 0xf1, 0x38, 0xd9, 0x61, 0xf3, 0xdb, 0xd4, 0x83, 0x98, 0x3e,
+ 0xaa, 0xe1, 0xca, 0x2d, 0xfb, 0x6d, 0x02, 0xac, 0xf2, 0xa6, 0x04, 0x09,
+ 0xeb, 0xcb, 0xaf, 0xd5, 0x9d, 0x3d, 0xd7, 0xc2, 0xc1, 0x6f, 0xec, 0x53,
+ 0x65, 0x0e, 0x40, 0x77, 0x03, 0xcd, 0x79, 0x0a, 0x94, 0x27, 0x6b, 0x6f,
+ 0x32, 0xb3, 0xdb, 0x3e, 0x38, 0xe2, 0xd2, 0xca, 0x9b, 0x9e, 0x24, 0xc7,
+ 0x35, 0xfd, 0xc1, 0x86, 0x78, 0xd9, 0xc3, 0xfe, 0x03, 0xb3, 0x3f, 0xc1,
+ 0xf8, 0x09, 0x89, 0xdc, 0x3b, 0x08, 0xae, 0x85, 0xfa, 0x8e, 0x51, 0xbb,
+ 0x6f, 0xf4, 0x73, 0x43, 0xd2, 0xed, 0x6d, 0xfd, 0x2b, 0x23, 0xc3, 0x4f,
+ 0xc4, 0x1d, 0x25, 0xb9, 0x36, 0xc4, 0x98, 0xe6, 0xbf, 0xb8, 0x30, 0xcf,
+ 0x1b, 0x38, 0x7f, 0xc0, 0x76, 0x67, 0xf8, 0x3f, 0x01, 0x31, 0x3b, 0x87,
+ 0x60, 0xf9, 0x90, 0x01, 0x2c, 0x2f, 0xff, 0x6d, 0xfc, 0x8c, 0x3e, 0xeb,
+ 0x7f, 0x96, 0x41, 0x82, 0xfd, 0xc6, 0x93, 0x8d, 0xfa, 0x4e, 0x48, 0x49,
+ 0x33, 0x3a, 0xa3, 0x5e, 0x61, 0xdf, 0x88, 0x73, 0x66, 0x04, 0xf5, 0xe5,
+ 0xd7, 0xea, 0xce, 0x9e, 0xeb, 0xe1, 0x60, 0xb7, 0xf1, 0xcc, 0x0d, 0xc1,
+ 0xc4, 0xa0, 0x22, 0x0d, 0xe5, 0x8c, 0x8e, 0x26, 0xf9, 0x89, 0xa5, 0x02,
+ 0xf6, 0x4c, 0x3f, 0x10, 0x74, 0x96, 0xe4, 0xdb, 0x12, 0x63, 0x9a, 0xfe,
+ 0x70, 0x4e, 0x9a, 0x97, 0xc8, 0xad, 0x5f, 0x39, 0xa0, 0x81, 0x6a, 0xc4,
+ 0x93, 0x50, 0x94, 0x1e, 0x17, 0xe3, 0x3f, 0x6d, 0x91, 0x01, 0xed, 0x49,
+ 0x96, 0xed, 0x01, 0xc2, 0x2a, 0xe1, 0xc9, 0x39, 0x76, 0x1f, 0x87, 0xb6,
+ 0xe3, 0x76, 0xa1, 0xc6, 0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xbf, 0x82, 0xa3,
+ 0x6d, 0x87, 0x72, 0x2c, 0x7c, 0xdc, 0x3f, 0x2b, 0x6a, 0xf1, 0x9a, 0xe0,
+ 0x0e, 0xc3, 0xdc, 0x18, 0x3f, 0xc4, 0xbe, 0x11, 0x76, 0x54, 0xab, 0xe3,
+ 0xd6, 0x47, 0x90, 0x61, 0x87, 0x66, 0x08, 0x63, 0x95, 0x25, 0x20, 0x43,
+ 0x6e, 0x05, 0x80, 0xad, 0x01, 0x10, 0xc7, 0x6c, 0x04, 0xbe, 0xaf, 0xc5,
+ 0x50, 0xa7, 0x48, 0x4a, 0x47, 0x44, 0x71, 0xc9, 0xa5, 0xdb, 0xa2, 0x2b,
+ 0x12, 0xbc, 0x40, 0x39, 0x31, 0x69, 0x83, 0x03, 0xb9, 0xa0, 0x46, 0xf0,
+ 0xb4, 0x4b, 0x1b, 0x8d, 0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7e,
+ 0xfe, 0x0a, 0x8d, 0xb6, 0x1d, 0xc8, 0xb1, 0xf3, 0x70, 0xfc, 0xad, 0xab,
+ 0xc6, 0x6b, 0x80, 0xc8, 0xbb, 0x74, 0x45, 0x62, 0x57, 0x88, 0x07, 0x26,
+ 0x2d, 0x30, 0x60, 0x77, 0x34, 0x08, 0xde, 0x16, 0x89, 0x63, 0x71, 0xbb,
+ 0x50, 0xe3, 0x2c, 0x0f, 0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52,
+ 0x53, 0x83, 0x53, 0xf6, 0x9e, 0x15, 0xb5, 0x78, 0xcd, 0x70, 0x19, 0x17,
+ 0x6e, 0x88, 0xac, 0x4a, 0xf1, 0x00, 0xe4, 0xc5, 0xa6, 0x0c, 0x0e, 0xe6,
+ 0x81, 0x1b, 0xc2, 0xd1, 0x2c, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed,
+ 0xd2, 0xad, 0xfc, 0x15, 0xaa, 0x42, 0xea, 0x4a, 0x70, 0x6a, 0x7e, 0xd3,
+ 0xc2, 0xb6, 0xaf, 0x19, 0xae, 0x03, 0x22, 0xed, 0xd1, 0x15, 0x89, 0x5e,
+ 0x20, 0x1c, 0x98, 0xb4, 0xc1, 0x81, 0xdc, 0xd0, 0x23, 0x78, 0x5a, 0x25,
+ 0x8d, 0xc6, 0xed, 0x43, 0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5,
+ 0x48, 0x5d, 0x49, 0x4e, 0x0d, 0x4f, 0xda, 0x78, 0x56, 0xd5, 0xe3, 0x35,
+ 0xc0, 0x64, 0x5d, 0xba, 0x22, 0xb1, 0x2b, 0xc4, 0x03, 0x93, 0x16, 0x98,
+ 0x30, 0x3b, 0x9a, 0x04, 0x6f, 0x0b, 0x44, 0xb1, 0xb8, 0xdd, 0xa8, 0x71,
+ 0x96, 0x07, 0xb7, 0x4a, 0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x29, 0xc1,
+ 0xa9, 0xfb, 0x4f, 0x0a, 0xda, 0xbc, 0x66, 0xb8, 0x0c, 0x8b, 0xb7, 0x44,
+ 0x56, 0x25, 0x78, 0x80, 0x72, 0x62, 0xd3, 0x06, 0x07, 0x73, 0x40, 0x8d,
+ 0xe1, 0x68, 0x96, 0x37, 0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56,
+ 0xfe, 0x0a, 0xd5, 0x21, 0x75, 0x25, 0x38, 0x35, 0x3f, 0x69, 0xe1, 0x5b,
+ 0x57, 0x8c, 0xd7, 0x01, 0x91, 0x76, 0xe8, 0x8a, 0xc4, 0xaf, 0x10, 0x0e,
+ 0x4c, 0x5a, 0x60, 0xc0, 0xee, 0x68, 0x11, 0xbc, 0x2d, 0x12, 0xc6, 0xe3,
+ 0x76, 0xa1, 0xc6, 0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e,
+ 0xa4, 0xa7, 0x06, 0xa7, 0xed, 0x3c, 0x2b, 0x6a, 0xf1, 0x9a, 0xe0, 0x32,
+ 0x2e, 0xdd, 0x11, 0x58, 0x95, 0xe2, 0x01, 0xc9, 0x8b, 0x4c, 0x18, 0x1d,
+ 0xcd, 0x02, 0x37, 0x85, 0xa2, 0x58, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03,
+ 0xdb, 0xa5, 0x5b, 0xf8, 0x2b, 0x54, 0x85, 0xd4, 0x94, 0xe0, 0xd4, 0xfd,
+ 0xa7, 0x85, 0x6d, 0x5e, 0x33, 0x5c, 0x06, 0x45, 0xdb, 0xa2, 0x2b, 0x12,
+ 0xbc, 0x40, 0x39, 0x31, 0x69, 0x83, 0x03, 0xb9, 0xa0, 0x46, 0xf0, 0xb4,
+ 0x4b, 0x1b, 0x8d, 0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05,
+ 0x6a, 0x90, 0xba, 0x92, 0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45,
+ 0xee, 0xc5, 0xfd, 0xe1, 0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47,
+ 0x09, 0x7b, 0x5c, 0x67, 0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50,
+ 0xe3, 0x2c, 0x0f, 0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d,
+ 0x50, 0x6d, 0x07, 0xa4, 0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36,
+ 0x0c, 0x1f, 0xe2, 0x5f, 0x08, 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec,
+ 0xc6, 0xc1, 0xf8, 0x7b, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2,
+ 0xad, 0xfc, 0x15, 0xaa, 0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f,
+ 0x4d, 0x0d, 0x17, 0xbb, 0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1,
+ 0x17, 0x65, 0x1c, 0x25, 0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d,
+ 0xc6, 0xed, 0x43, 0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48,
+ 0x5d, 0x49, 0x35, 0x41, 0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62,
+ 0xfe, 0xf0, 0xd8, 0x30, 0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd,
+ 0xae, 0x33, 0xb3, 0x1b, 0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96,
+ 0x07, 0xb7, 0x4a, 0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36,
+ 0x83, 0xd2, 0x7d, 0x34, 0x34, 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f,
+ 0xf1, 0x2f, 0x84, 0x5d, 0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60,
+ 0xfc, 0x3d, 0xb7, 0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe,
+ 0x0a, 0xd5, 0x21, 0x75, 0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86,
+ 0x8b, 0xdd, 0x8b, 0xfb, 0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2,
+ 0x8e, 0x12, 0xf6, 0xb8, 0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76,
+ 0xa1, 0xc6, 0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4,
+ 0x9a, 0xa0, 0xda, 0x0f, 0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78,
+ 0x6c, 0x18, 0x3f, 0xc4, 0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19,
+ 0xd9, 0x8d, 0x83, 0xf0, 0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb,
+ 0xa5, 0x5b, 0xf8, 0x2b, 0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9,
+ 0x3e, 0x9a, 0x1a, 0x2f, 0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97,
+ 0xc2, 0x2e, 0xca, 0x38, 0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e,
+ 0xdb, 0x8d, 0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a,
+ 0x90, 0xba, 0x92, 0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee,
+ 0xc5, 0xfd, 0xe1, 0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09,
+ 0x7b, 0x5c, 0x67, 0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3,
+ 0x2c, 0x0f, 0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50,
+ 0x6d, 0x07, 0xa4, 0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c,
+ 0x1f, 0xe2, 0x5f, 0x08, 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6,
+ 0xc1, 0xf8, 0x7b, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad,
+ 0xfc, 0x15, 0xaa, 0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d,
+ 0x0d, 0x17, 0xbb, 0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17,
+ 0x65, 0x1c, 0x25, 0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6,
+ 0xed, 0x43, 0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d,
+ 0x49, 0x35, 0x41, 0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe,
+ 0xf0, 0xd8, 0x30, 0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae,
+ 0x33, 0xb3, 0x1b, 0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07,
+ 0xb7, 0x4a, 0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83,
+ 0xd2, 0x7d, 0x34, 0x34, 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1,
+ 0x2f, 0x84, 0x5d, 0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60, 0xfc,
+ 0x3d, 0xb7, 0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe, 0x0a,
+ 0xd5, 0x21, 0x75, 0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86, 0x8b,
+ 0xdd, 0x8b, 0xfb, 0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2, 0x8e,
+ 0x12, 0xf6, 0xb8, 0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76, 0xa1,
+ 0xc6, 0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4, 0x9a,
+ 0xa0, 0xda, 0x0f, 0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78, 0x6c,
+ 0x18, 0x3f, 0xc4, 0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19, 0xd9,
+ 0x8d, 0x83, 0xf0, 0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb, 0xa5,
+ 0x5b, 0xf8, 0x2b, 0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9, 0x3e,
+ 0x9a, 0x1a, 0x2f, 0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97, 0xc2,
+ 0x2e, 0xca, 0x38, 0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e, 0xdb,
+ 0x8d, 0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a, 0x90,
+ 0xba, 0x92, 0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee, 0xc5,
+ 0xfd, 0xe1, 0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09, 0x7b,
+ 0x5c, 0x67, 0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3, 0x2c,
+ 0x0f, 0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50, 0x6d,
+ 0x07, 0xa4, 0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c, 0x1f,
+ 0xe2, 0x5f, 0x08, 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6, 0xc1,
+ 0xf8, 0x7b, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad, 0xfc,
+ 0x15, 0xaa, 0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d, 0x0d,
+ 0x17, 0xbb, 0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17, 0x65,
+ 0x1c, 0x25, 0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6, 0xed,
+ 0x43, 0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d, 0x49,
+ 0x35, 0x41, 0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe, 0xf0,
+ 0xd8, 0x30, 0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae, 0x33,
+ 0xb3, 0x1b, 0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07, 0xb7,
+ 0x4a, 0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83, 0xd2,
+ 0x7d, 0x34, 0x34, 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1, 0x2f,
+ 0x84, 0x5d, 0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60, 0xfc, 0x3d,
+ 0xb7, 0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe, 0x0a, 0xd5,
+ 0x21, 0x75, 0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86, 0x8b, 0xdd,
+ 0x8b, 0xfb, 0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2, 0x8e, 0x12,
+ 0xf6, 0xb8, 0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76, 0xa1, 0xc6,
+ 0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4, 0x9a, 0xa0,
+ 0xda, 0x0f, 0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78, 0x6c, 0x18,
+ 0x3f, 0xc4, 0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19, 0xd9, 0x8d,
+ 0x83, 0xf0, 0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb, 0xa5, 0x5b,
+ 0xf8, 0x2b, 0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9, 0x3e, 0x9a,
+ 0x1a, 0x2f, 0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97, 0xc2, 0x2e,
+ 0xca, 0x38, 0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e, 0xdb, 0x8d,
+ 0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a, 0x90, 0xba,
+ 0x92, 0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee, 0xc5, 0xfd,
+ 0xe1, 0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09, 0x7b, 0x5c,
+ 0x67, 0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3, 0x2c, 0x0f,
+ 0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50, 0x6d, 0x07,
+ 0xa4, 0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c, 0x1f, 0xe2,
+ 0x5f, 0x08, 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6, 0xc1, 0xf8,
+ 0x7b, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad, 0xfc, 0x15,
+ 0xaa, 0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d, 0x0d, 0x17,
+ 0xbb, 0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17, 0x65, 0x1c,
+ 0x25, 0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6, 0xed, 0x43,
+ 0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d, 0x49, 0x35,
+ 0x41, 0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe, 0xf0, 0xd8,
+ 0x30, 0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae, 0x33, 0xb3,
+ 0x1b, 0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07, 0xb7, 0x4a,
+ 0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83, 0xd2, 0x7d,
+ 0x34, 0x34, 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1, 0x2f, 0x84,
+ 0x5d, 0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60, 0xfc, 0x3d, 0xb7,
+ 0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe, 0x0a, 0xd5, 0x21,
+ 0x75, 0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86, 0x8b, 0xdd, 0x8b,
+ 0xfb, 0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2, 0x8e, 0x12, 0xf6,
+ 0xb8, 0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76, 0xa1, 0xc6, 0x58,
+ 0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4, 0x9a, 0xa0, 0xda,
+ 0x0f, 0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78, 0x6c, 0x18, 0x3f,
+ 0xc4, 0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19, 0xd9, 0x8d, 0x83,
+ 0xf0, 0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb, 0xa5, 0x5b, 0xf8,
+ 0x2b, 0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9, 0x3e, 0x9a, 0x1a,
+ 0x2f, 0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97, 0xc2, 0x2e, 0xca,
+ 0x38, 0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e, 0xdb, 0x8d, 0xda,
+ 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a, 0x90, 0xba, 0x92,
+ 0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee, 0xc5, 0xfd, 0xe1,
+ 0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09, 0x7b, 0x5c, 0x67,
+ 0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3, 0x2c, 0x0f, 0x6e,
+ 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50, 0x6d, 0x07, 0xa4,
+ 0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c, 0x1f, 0xe2, 0x5f,
+ 0x08, 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6, 0xc1, 0xf8, 0x7b,
+ 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad, 0xfc, 0x15, 0xaa,
+ 0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d, 0x0d, 0x17, 0xbb,
+ 0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17, 0x65, 0x1c, 0x25,
+ 0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6, 0xed, 0x43, 0x8c,
+ 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d, 0x49, 0x35, 0x41,
+ 0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe, 0xf0, 0xd8, 0x30,
+ 0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae, 0x33, 0xb3, 0x1b,
+ 0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07, 0xb7, 0x4a, 0xb7,
+ 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83, 0xd2, 0x7d, 0x34,
+ 0x34, 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1, 0x2f, 0x84, 0x5d,
+ 0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60, 0xfc, 0x3d, 0xb7, 0x1b,
+ 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe, 0x0a, 0xd5, 0x21, 0x75,
+ 0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86, 0x8b, 0xdd, 0x8b, 0xfb,
+ 0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2, 0x8e, 0x12, 0xf6, 0xb8,
+ 0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76, 0xa1, 0xc6, 0x58, 0x1e,
+ 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4, 0x9a, 0xa0, 0xda, 0x0f,
+ 0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78, 0x6c, 0x18, 0x3f, 0xc4,
+ 0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19, 0xd9, 0x8d, 0x83, 0xf0,
+ 0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb, 0xa5, 0x5b, 0xf8, 0x2b,
+ 0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9, 0x3e, 0x9a, 0x1a, 0x2f,
+ 0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97, 0xc2, 0x2e, 0xca, 0x38,
+ 0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e, 0xdb, 0x8d, 0xda, 0x87,
+ 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a, 0x90, 0xba, 0x92, 0x6a,
+ 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee, 0xc5, 0xfd, 0xe1, 0xb0,
+ 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09, 0x7b, 0x5c, 0x67, 0x66,
+ 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3, 0x2c, 0x0f, 0x6e, 0x95,
+ 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50, 0x6d, 0x07, 0xa4, 0xfa,
+ 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c, 0x1f, 0xe2, 0x5f, 0x08,
+ 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6, 0xc1, 0xf8, 0x7b, 0x6e,
+ 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad, 0xfc, 0x15, 0xaa, 0x42,
+ 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d, 0x0d, 0x17, 0xbb, 0x17,
+ 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17, 0x65, 0x1c, 0x25, 0xed,
+ 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6, 0xed, 0x43, 0x8c, 0xb0,
+ 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d, 0x49, 0x35, 0x41, 0xb4,
+ 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe, 0xf0, 0xd8, 0x30, 0x7f,
+ 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae, 0x33, 0xb3, 0x1b, 0x07,
+ 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07, 0xb7, 0x4a, 0xb7, 0xf0,
+ 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83, 0xd2, 0x7d, 0x34, 0x34,
+ 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1, 0x2f, 0x84, 0x5d, 0x94,
+ 0x70, 0x97, 0xb5, 0xc6, 0x7c,
+};
+static_assert(sizeof(kBytesTestReadSymbol12) == kNumBytesTestReadSymbol12, "");
+
+// The kBytesTestReadSymbol13[] array was encoded using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][14] = {
+// // pmf: 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13,
+// // 1/13, 1/13
+// { 32768 - 2521, 32768 - 5041, 32768 - 7562, 32768 - 10082, 32768 - 12603,
+// 32768 - 15124, 32768 - 17644, 32768 - 20165, 32768 - 22686,
+// 32768 - 25206, 32768 - 27727, 32768 - 30247, 0, 0 },
+// // pmf: 3/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26,
+// // 2/26, 1/26
+// { 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343, 32768 - 13863,
+// 32768 - 16384, 32768 - 18905, 32768 - 21425, 32768 - 23946,
+// 32768 - 26466, 32768 - 28987, 32768 - 31508, 0, 0 },
+// // pmf: 1/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26,
+// // 2/26, 3/26
+// { 32768 - 1260, 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343,
+// 32768 - 13863, 32768 - 16384, 32768 - 18905, 32768 - 21425,
+// 32768 - 23946, 32768 - 26466, 32768 - 28987, 0, 0 },
+// // pmf: 1/26, 2/26, 2/26, 2/26, 2/26, 2/26, 4/26, 2/26, 2/26, 2/26, 2/26,
+// // 2/26, 1/26
+// { 32768 - 1260, 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343,
+// 32768 - 13863, 32768 - 18905, 32768 - 21425, 32768 - 23946,
+// 32768 - 26466, 32768 - 28987, 32768 - 31508, 0, 0 },
+// };
+// constexpr int kSymbols[26][4] = { { 0, 6, 12, 5 }, //
+// { 1, 7, 11, 4 }, //
+// { 2, 8, 10, 3 }, //
+// { 3, 9, 9, 2 }, //
+// { 4, 10, 8, 1 }, //
+// { 5, 11, 7, 0 }, //
+// { 6, 12, 6, 12 }, //
+// { 7, 0, 5, 11 }, //
+// { 8, 1, 4, 10 }, //
+// { 9, 2, 3, 9 }, //
+// { 10, 3, 2, 8 }, //
+// { 11, 4, 1, 7 }, //
+// { 12, 5, 0, 6 }, //
+// { 0, 0, 12, 11 }, //
+// { 2, 1, 10, 9 }, //
+// { 4, 3, 8, 7 }, //
+// { 6, 5, 6, 5 }, //
+// { 8, 7, 4, 3 }, //
+// { 10, 9, 2, 1 }, //
+// { 12, 11, 12, 10 }, //
+// { 1, 0, 11, 8 }, //
+// { 3, 2, 9, 6 }, //
+// { 5, 4, 7, 4 }, //
+// { 7, 6, 5, 2 }, //
+// { 9, 8, 3, 6 }, //
+// { 11, 10, 1, 6 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 64; ++i) {
+// for (int j = 0; j < 26; ++j) {
+// for (int k = 0; k < 4; ++k) {
+// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 13);
+// }
+// }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+// if (count++ % 12 == 0) {
+// printf("\n ");
+// } else {
+// printf(" ");
+// }
+// printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
+constexpr size_t kNumBytesTestReadSymbol13 = 3110;
+constexpr uint8_t kBytesTestReadSymbol13[] = {
+ 0x0b, 0x38, 0xa7, 0x3e, 0xde, 0x47, 0x2e, 0xe6, 0x9e, 0xe0, 0xa8, 0xc4,
+ 0x77, 0xda, 0x41, 0x64, 0x49, 0x60, 0xc4, 0x26, 0x68, 0xac, 0xf4, 0xa6,
+ 0x8c, 0x6e, 0xa6, 0xd3, 0xd9, 0x4b, 0xb9, 0x35, 0xb6, 0x53, 0x6c, 0x73,
+ 0x13, 0xd7, 0xfb, 0xbf, 0x96, 0xac, 0xea, 0x86, 0xb5, 0x24, 0x14, 0x2a,
+ 0x5a, 0x41, 0x38, 0xab, 0xfb, 0x92, 0x74, 0xf4, 0x0f, 0x24, 0xde, 0x2d,
+ 0x2d, 0x12, 0xd7, 0xb8, 0x2f, 0x4a, 0x4c, 0xd6, 0xc0, 0x4b, 0x01, 0x98,
+ 0xca, 0x7e, 0xde, 0x03, 0x75, 0x27, 0x59, 0x4f, 0x32, 0x54, 0xa5, 0xb5,
+ 0x79, 0xc3, 0xc4, 0x3c, 0x76, 0xa3, 0x2f, 0xaf, 0x2f, 0x0a, 0x84, 0xb5,
+ 0x60, 0xf5, 0x73, 0x88, 0xc0, 0x24, 0x1c, 0xfb, 0xff, 0x90, 0xb6, 0x05,
+ 0xe9, 0x43, 0x90, 0xc8, 0xd3, 0xfd, 0x3f, 0xc2, 0x0b, 0xb5, 0xfe, 0x12,
+ 0x55, 0x23, 0xa1, 0xf4, 0xba, 0xc7, 0x1f, 0xc3, 0xe5, 0xe3, 0x76, 0x68,
+ 0x3c, 0x57, 0xb9, 0x92, 0xea, 0x25, 0x93, 0x4e, 0x72, 0xff, 0x63, 0x28,
+ 0x0c, 0x90, 0x1d, 0xb6, 0x42, 0xb2, 0x25, 0x79, 0x8e, 0xee, 0x0c, 0x56,
+ 0x3d, 0x94, 0x3d, 0x80, 0xf2, 0x25, 0x6f, 0xd4, 0x93, 0x31, 0x18, 0x80,
+ 0x5a, 0x3a, 0xbb, 0x4d, 0xbb, 0x77, 0xc3, 0xb0, 0x20, 0x0e, 0xd3, 0xd8,
+ 0x10, 0x05, 0xb2, 0x81, 0x57, 0xf5, 0x8c, 0xe5, 0xac, 0x46, 0xc0, 0xae,
+ 0x9c, 0x08, 0x9d, 0x51, 0xf3, 0x16, 0xb9, 0xd7, 0x90, 0xa7, 0x9f, 0x40,
+ 0x5d, 0x14, 0xd1, 0xbd, 0xa2, 0x0b, 0xf3, 0xae, 0x3b, 0xfb, 0x0f, 0xe1,
+ 0x1a, 0x6e, 0x63, 0x3b, 0xdb, 0x41, 0x8e, 0xe8, 0x1f, 0x20, 0x18, 0xbe,
+ 0x69, 0x10, 0x86, 0x06, 0x06, 0x23, 0x3a, 0x40, 0xc1, 0x7f, 0x2e, 0x32,
+ 0xb4, 0x23, 0xac, 0x4b, 0x25, 0x6b, 0xef, 0xaf, 0xec, 0x5c, 0xf2, 0xd0,
+ 0x61, 0xb2, 0x3a, 0xa5, 0x3d, 0xcd, 0xf7, 0x99, 0x6b, 0x4e, 0xbb, 0x58,
+ 0x6a, 0x4c, 0xd7, 0xc0, 0x77, 0xd9, 0xae, 0x15, 0x7e, 0xde, 0xc9, 0xd8,
+ 0x24, 0x39, 0x3f, 0xa4, 0xf3, 0x24, 0x7e, 0xe0, 0x22, 0x19, 0x40, 0x3d,
+ 0x0c, 0xb0, 0xb7, 0xe3, 0x4b, 0x82, 0x6f, 0x82, 0x0e, 0xb1, 0x91, 0xef,
+ 0x84, 0x98, 0x69, 0x66, 0x24, 0xe7, 0x90, 0x13, 0x0d, 0xbd, 0x6b, 0x92,
+ 0xee, 0x1c, 0x0f, 0xe7, 0xfa, 0xb9, 0xb4, 0x6c, 0x68, 0x98, 0x4c, 0x27,
+ 0x42, 0xad, 0x5f, 0x8f, 0xe5, 0x25, 0xf9, 0x67, 0x84, 0x86, 0x2e, 0xf6,
+ 0x51, 0x71, 0x0d, 0x6c, 0x45, 0x8f, 0x96, 0x15, 0x73, 0xab, 0xff, 0xc0,
+ 0x87, 0x14, 0xba, 0x00, 0x67, 0x2c, 0x27, 0x03, 0xff, 0xa6, 0xe3, 0x09,
+ 0xae, 0xbb, 0xa5, 0x49, 0xee, 0x5f, 0x47, 0xc0, 0x30, 0x4a, 0x93, 0x28,
+ 0x48, 0x4d, 0x30, 0x49, 0xe7, 0xe6, 0x79, 0x96, 0x75, 0x6c, 0x62, 0xbc,
+ 0x9f, 0xaa, 0x39, 0x63, 0x1d, 0x33, 0xce, 0xd2, 0xa3, 0xd1, 0x93, 0xed,
+ 0x8d, 0xa6, 0xbd, 0x02, 0xf0, 0x44, 0xd5, 0x9e, 0x29, 0x02, 0x46, 0x87,
+ 0xaf, 0xdb, 0xfb, 0x20, 0x29, 0x26, 0xb7, 0x8c, 0x75, 0xee, 0xe9, 0x29,
+ 0x53, 0x01, 0x4a, 0xaa, 0xc2, 0x9f, 0x6c, 0x30, 0x21, 0x83, 0xa6, 0x09,
+ 0x32, 0x1d, 0xaa, 0x00, 0x6c, 0xea, 0x9c, 0x84, 0x16, 0x16, 0x0c, 0x06,
+ 0xcc, 0xf0, 0x19, 0xce, 0x57, 0xb3, 0x9f, 0x57, 0xf0, 0xdc, 0xda, 0x86,
+ 0x85, 0x2f, 0x09, 0x33, 0x8d, 0x59, 0xb8, 0xc1, 0x08, 0x4c, 0xee, 0xf8,
+ 0x33, 0x3d, 0x23, 0x13, 0x78, 0xa3, 0x98, 0xbf, 0xab, 0xef, 0x15, 0xe2,
+ 0x8d, 0xdb, 0xb4, 0xd0, 0x4b, 0x2f, 0x04, 0x3f, 0x6b, 0x11, 0xf0, 0x05,
+ 0xc7, 0x53, 0x1e, 0xc9, 0x73, 0x11, 0x81, 0xd3, 0xde, 0x21, 0xd8, 0x14,
+ 0x10, 0xbe, 0x30, 0xb2, 0x48, 0x55, 0x9b, 0x8c, 0x10, 0x84, 0xce, 0xef,
+ 0x83, 0x2f, 0x03, 0x10, 0x09, 0x0f, 0x70, 0xa8, 0x84, 0xea, 0x15, 0xdb,
+ 0xc7, 0xdf, 0x6f, 0x67, 0x5d, 0x1c, 0xc7, 0x1a, 0x1c, 0x15, 0xa6, 0x92,
+ 0xed, 0x63, 0xf0, 0xed, 0x77, 0x5d, 0x12, 0x1b, 0x8c, 0xab, 0x3e, 0xfa,
+ 0x12, 0xf6, 0x83, 0xda, 0x41, 0xbc, 0x97, 0x76, 0xb9, 0x1f, 0xc9, 0x36,
+ 0xc7, 0xe3, 0x9f, 0x93, 0x2e, 0x27, 0xdc, 0x90, 0x84, 0x6d, 0x81, 0x04,
+ 0x09, 0x4f, 0x10, 0xb9, 0x53, 0xd9, 0x8f, 0x99, 0x2b, 0x8b, 0x53, 0x4f,
+ 0xe8, 0x3e, 0x82, 0x1b, 0x0c, 0x3d, 0xbc, 0xe5, 0x5c, 0x13, 0xed, 0x4b,
+ 0x0b, 0x05, 0x72, 0xaa, 0xd2, 0xcf, 0xfc, 0x9f, 0xd0, 0xfd, 0xc7, 0xc6,
+ 0xc0, 0xa3, 0xa7, 0x05, 0xbb, 0x9e, 0xae, 0x63, 0xc0, 0x3d, 0x73, 0x92,
+ 0xe1, 0x98, 0xe4, 0xa5, 0xb3, 0xc4, 0x36, 0x90, 0x35, 0x6b, 0xab, 0x35,
+ 0x06, 0x98, 0xca, 0x35, 0x20, 0x5a, 0x6a, 0x84, 0x5c, 0x88, 0xca, 0x64,
+ 0x43, 0x87, 0xf2, 0x3c, 0x13, 0x58, 0x1c, 0x35, 0x2c, 0xf2, 0x1d, 0x5e,
+ 0xe0, 0x1b, 0x2c, 0x59, 0xc2, 0xcd, 0xf2, 0x96, 0x1a, 0x75, 0x3c, 0x10,
+ 0xe7, 0xe3, 0xa1, 0xbc, 0xec, 0x03, 0x79, 0x58, 0x26, 0x4d, 0xcf, 0xb4,
+ 0x00, 0xd3, 0x46, 0xee, 0x99, 0x52, 0x2f, 0x54, 0xcb, 0xa1, 0x75, 0xa1,
+ 0xa0, 0xf4, 0xaa, 0xe9, 0x4a, 0xe1, 0x74, 0xcc, 0xd1, 0x47, 0xda, 0x48,
+ 0x8b, 0x2e, 0xf9, 0x54, 0x98, 0x4e, 0x4f, 0x5a, 0x1b, 0xf5, 0x66, 0x62,
+ 0xa0, 0xc2, 0x0e, 0x1a, 0x91, 0xbd, 0x7a, 0x33, 0xfd, 0x7c, 0xfc, 0x8b,
+ 0xc0, 0x92, 0xd8, 0x97, 0x48, 0x6f, 0xf4, 0xe0, 0x6c, 0xcf, 0x17, 0xc9,
+ 0x44, 0x04, 0xcf, 0x50, 0x0d, 0x8f, 0xbc, 0x4f, 0x4e, 0x1d, 0x38, 0x38,
+ 0x5c, 0xb7, 0x8e, 0xe7, 0x52, 0xbe, 0x04, 0x68, 0x79, 0x9e, 0x68, 0x32,
+ 0x3b, 0xe4, 0xee, 0x65, 0x76, 0xf6, 0xb4, 0x47, 0x1c, 0xa5, 0xd0, 0x20,
+ 0x0f, 0x94, 0xe1, 0x2f, 0xa8, 0x87, 0xeb, 0xda, 0x2c, 0x54, 0xc4, 0x07,
+ 0x08, 0x89, 0xdc, 0xcf, 0x73, 0x0c, 0x1f, 0xea, 0xb4, 0x6d, 0xea, 0x17,
+ 0x70, 0x82, 0xb5, 0x18, 0x2f, 0x38, 0xc5, 0x47, 0x47, 0xd6, 0x37, 0x20,
+ 0x8d, 0x71, 0xd6, 0x16, 0x4d, 0x16, 0xd5, 0x77, 0x36, 0xb5, 0xd0, 0x20,
+ 0x5f, 0x4d, 0x89, 0x6c, 0x49, 0xc4, 0x13, 0x6c, 0x26, 0x8c, 0x8f, 0x6f,
+ 0x17, 0xab, 0xdf, 0x57, 0xa8, 0xab, 0xed, 0x8d, 0xa9, 0x00, 0x6b, 0xfc,
+ 0xf6, 0x72, 0xaf, 0x32, 0xc2, 0x0b, 0xb6, 0x6b, 0x7a, 0xac, 0xa9, 0x77,
+ 0x52, 0x87, 0x98, 0x43, 0x21, 0x72, 0x35, 0x6c, 0x27, 0x12, 0xbe, 0xf0,
+ 0x62, 0x16, 0x2a, 0xc6, 0xf7, 0x48, 0xd2, 0xc3, 0x25, 0xb4, 0x6a, 0x57,
+ 0x65, 0xd6, 0x07, 0xa0, 0xde, 0x9f, 0x3b, 0x3d, 0xdd, 0x27, 0x0e, 0x4c,
+ 0xe8, 0x4b, 0xe1, 0xd6, 0x33, 0xa7, 0x85, 0x75, 0x44, 0x7e, 0xf9, 0xfd,
+ 0xb9, 0x98, 0xa8, 0x30, 0x82, 0xdf, 0xd9, 0x97, 0x5c, 0x3f, 0x52, 0x20,
+ 0xd4, 0x38, 0x88, 0xc1, 0x53, 0x11, 0x14, 0x25, 0x6f, 0xeb, 0x4e, 0xf5,
+ 0xed, 0xf4, 0xba, 0x34, 0x23, 0x74, 0xbc, 0x46, 0x51, 0x96, 0x1b, 0x50,
+ 0x32, 0x03, 0xe5, 0x6d, 0xd7, 0xcf, 0xca, 0x60, 0xb2, 0xbc, 0xb6, 0x4b,
+ 0xc0, 0xee, 0x8b, 0x96, 0xa9, 0x4c, 0x1d, 0x9b, 0x2d, 0x11, 0xc7, 0x29,
+ 0x74, 0x08, 0x03, 0xe5, 0x1c, 0xe2, 0x6c, 0x21, 0x1e, 0x02, 0x4d, 0xb1,
+ 0x4e, 0x70, 0xb3, 0xfc, 0x06, 0xa5, 0xf9, 0xfb, 0x35, 0x1c, 0x89, 0xe3,
+ 0x1e, 0x27, 0xe0, 0x93, 0xd6, 0xd5, 0x15, 0x94, 0x40, 0x88, 0x71, 0xfd,
+ 0xaa, 0xbd, 0xf6, 0xae, 0x61, 0x52, 0x49, 0x33, 0x99, 0x85, 0xcd, 0x13,
+ 0x70, 0x7e, 0x1b, 0x76, 0x3a, 0x69, 0x9e, 0xfe, 0x3c, 0x65, 0x22, 0xf0,
+ 0x1f, 0x91, 0x57, 0x00, 0x5b, 0x28, 0xac, 0x1e, 0x1e, 0x24, 0xc7, 0xd8,
+ 0xdb, 0x3a, 0xd0, 0x85, 0x04, 0x4d, 0xf7, 0xe8, 0x3b, 0xdc, 0xa1, 0x5b,
+ 0x5e, 0xe3, 0x7a, 0xae, 0x72, 0x70, 0x7c, 0x52, 0x07, 0xf5, 0x1c, 0xda,
+ 0xd7, 0x40, 0x81, 0x7d, 0x36, 0x0a, 0x97, 0x8e, 0x0c, 0x25, 0xe7, 0xd3,
+ 0x81, 0xb0, 0xe2, 0xd0, 0x56, 0x16, 0x9c, 0x9d, 0x0e, 0xc7, 0x97, 0x8f,
+ 0xff, 0x68, 0xd4, 0x4f, 0x1a, 0x4c, 0x58, 0x6f, 0xe4, 0xd5, 0xc1, 0x07,
+ 0x7f, 0x31, 0x8c, 0x59, 0x02, 0x6f, 0xa7, 0x54, 0x1b, 0x02, 0x35, 0xe5,
+ 0x14, 0xec, 0x35, 0x3d, 0x17, 0x72, 0x11, 0x0c, 0x38, 0x62, 0x99, 0x4a,
+ 0x6a, 0x46, 0xcb, 0x36, 0x1b, 0x4b, 0x38, 0xff, 0x1d, 0xa4, 0xf7, 0x21,
+ 0xda, 0x73, 0x42, 0xc4, 0x2b, 0xf8, 0xd8, 0x43, 0x73, 0x60, 0x11, 0x22,
+ 0xc9, 0xe6, 0x07, 0xca, 0xa0, 0x29, 0x2a, 0x20, 0xd9, 0xdd, 0x7d, 0xed,
+ 0x28, 0x10, 0xde, 0xbe, 0x5e, 0xfd, 0x0c, 0x06, 0x4b, 0x1c, 0xc4, 0x56,
+ 0xc4, 0x12, 0x25, 0x5a, 0xd1, 0xfe, 0x03, 0x5e, 0x5e, 0xe0, 0x42, 0x8e,
+ 0x44, 0xf1, 0x8f, 0x13, 0xf0, 0x49, 0xeb, 0x59, 0xf3, 0x5b, 0x61, 0xd9,
+ 0xa4, 0xdf, 0x2e, 0x2a, 0x70, 0xc2, 0xf0, 0xef, 0x16, 0xf4, 0x1b, 0x5c,
+ 0xbd, 0x77, 0x42, 0xb9, 0x4c, 0x56, 0x8d, 0xc8, 0xf8, 0x05, 0xbd, 0x52,
+ 0xba, 0x6e, 0xe1, 0x89, 0xe1, 0xf2, 0xdb, 0xa7, 0xdf, 0xe0, 0xee, 0xc1,
+ 0x5c, 0x9e, 0x90, 0x11, 0x17, 0xd5, 0xc1, 0xb9, 0x2c, 0x08, 0x62, 0x0d,
+ 0x75, 0x05, 0xb2, 0xad, 0x22, 0xd6, 0x5c, 0x6e, 0xed, 0xa4, 0x06, 0x5a,
+ 0x42, 0x4f, 0xbf, 0x84, 0x53, 0xfa, 0x0b, 0xb7, 0x47, 0x6c, 0xba, 0x07,
+ 0xc9, 0xe4, 0x8c, 0xe4, 0xa3, 0x40, 0xdc, 0xcb, 0x58, 0xeb, 0xba, 0xc5,
+ 0xcc, 0x56, 0x74, 0x1e, 0x7b, 0x0f, 0x2a, 0xce, 0x35, 0x46, 0x39, 0x6d,
+ 0x81, 0x91, 0xb2, 0x05, 0x76, 0xfa, 0x8f, 0x43, 0x46, 0x25, 0xb7, 0x98,
+ 0x4e, 0x5f, 0x63, 0xf4, 0x0e, 0x4f, 0x5d, 0x85, 0x29, 0x9d, 0xdb, 0xa8,
+ 0xeb, 0x0a, 0xbb, 0xc4, 0xf8, 0x5a, 0xda, 0xe1, 0x9b, 0x1f, 0x9b, 0x4d,
+ 0x62, 0x65, 0x41, 0x34, 0x5b, 0x6c, 0x19, 0xa5, 0x3c, 0x35, 0x8e, 0x14,
+ 0x02, 0xcd, 0x1d, 0xf3, 0xfb, 0x70, 0x93, 0x46, 0xe2, 0x49, 0xc8, 0x31,
+ 0xfd, 0x47, 0x35, 0xfc, 0x7d, 0xb9, 0x79, 0xf7, 0x0d, 0xed, 0x98, 0x47,
+ 0xd2, 0xcf, 0x26, 0x8b, 0x10, 0x6f, 0x86, 0xca, 0xda, 0xb8, 0x41, 0xdb,
+ 0x0c, 0xc7, 0xc3, 0x56, 0xc5, 0x0f, 0xc7, 0xf2, 0xda, 0x45, 0xdf, 0x94,
+ 0xc1, 0x65, 0x79, 0x6c, 0x97, 0x81, 0xbd, 0xf1, 0x1e, 0x26, 0x6e, 0xfc,
+ 0x4f, 0x2e, 0x1e, 0x9c, 0xa2, 0x69, 0x54, 0x7a, 0xc3, 0x15, 0x44, 0x64,
+ 0x73, 0x11, 0x5b, 0x10, 0x48, 0x95, 0x6b, 0x49, 0x4e, 0xcb, 0x2b, 0x12,
+ 0x90, 0xaf, 0xf5, 0x5a, 0xfa, 0xf5, 0x0b, 0xb8, 0x49, 0x0a, 0x7d, 0xc4,
+ 0x6b, 0x0a, 0xa5, 0x6d, 0x32, 0xb2, 0x33, 0x3c, 0xb3, 0x65, 0x9c, 0x1f,
+ 0x7e, 0x50, 0xd3, 0x6a, 0xa2, 0xc1, 0xb9, 0xd9, 0xfa, 0x25, 0xfe, 0x1c,
+ 0x3f, 0x88, 0x47, 0x0a, 0x7e, 0x62, 0xa2, 0xf3, 0x3e, 0xae, 0x9f, 0x7f,
+ 0x83, 0xbb, 0x05, 0x72, 0x7a, 0x40, 0x44, 0x5f, 0x57, 0x06, 0xe4, 0xb0,
+ 0x21, 0x88, 0x35, 0xd4, 0x16, 0xca, 0xb4, 0x8b, 0x59, 0x71, 0xbb, 0xb6,
+ 0x90, 0x19, 0x69, 0x09, 0x3e, 0xfe, 0x11, 0x4f, 0xe8, 0x2e, 0xdd, 0x1d,
+ 0xb2, 0xe8, 0x1f, 0x27, 0x92, 0x33, 0x92, 0x8d, 0x04, 0x2e, 0x19, 0x16,
+ 0xb4, 0xb5, 0xcf, 0x52, 0x98, 0xcc, 0x2b, 0x85, 0x0c, 0x2d, 0x88, 0x38,
+ 0x24, 0x06, 0xf2, 0x47, 0xec, 0xce, 0xc6, 0xf7, 0x4e, 0xe4, 0x8b, 0xb5,
+ 0x4f, 0xbe, 0xae, 0x13, 0xd5, 0x0c, 0xe6, 0x13, 0x44, 0xa4, 0x76, 0x19,
+ 0x8c, 0x25, 0x28, 0x0f, 0x15, 0x8e, 0xa6, 0x9c, 0xee, 0x6e, 0xf0, 0x55,
+ 0x9d, 0x5a, 0x8f, 0xf6, 0x08, 0x27, 0x92, 0x1f, 0xcb, 0x4c, 0x8c, 0x2c,
+ 0xeb, 0x44, 0x26, 0x48, 0xec, 0x2e, 0x9b, 0xb3, 0xd9, 0x17, 0xee, 0x52,
+ 0x7d, 0x32, 0x47, 0x88, 0x4d, 0xf9, 0x11, 0xfc, 0xac, 0xa3, 0xb0, 0xc9,
+ 0x5e, 0x38, 0xa3, 0x8d, 0x56, 0xc8, 0x83, 0x7c, 0x53, 0x38, 0xe1, 0xd0,
+ 0x28, 0x7d, 0xc1, 0x65, 0x99, 0x39, 0x58, 0x36, 0xa3, 0x66, 0x71, 0x4c,
+ 0x28, 0xcb, 0x9f, 0xb5, 0x58, 0x4b, 0xa3, 0x5c, 0x4e, 0xf9, 0x8d, 0x5b,
+ 0x0c, 0xf1, 0x32, 0xbb, 0xe3, 0xb4, 0x47, 0xe8, 0x1c, 0x9e, 0xbb, 0x0a,
+ 0x53, 0x3b, 0xb7, 0x51, 0xd6, 0x15, 0x77, 0x89, 0xf0, 0xb5, 0xba, 0x71,
+ 0x84, 0x16, 0x81, 0xb0, 0xdf, 0x67, 0x12, 0x9f, 0xe7, 0x43, 0x70, 0x3a,
+ 0xb1, 0xdc, 0x40, 0x31, 0xe7, 0xdd, 0x6b, 0x74, 0xfc, 0x18, 0x7d, 0x0d,
+ 0xba, 0xda, 0x67, 0x66, 0x56, 0x43, 0x42, 0x80, 0xc6, 0x7c, 0xb3, 0x6c,
+ 0x89, 0x2e, 0xc7, 0x0d, 0x97, 0x8a, 0xbe, 0x1a, 0x36, 0x05, 0x10, 0x85,
+ 0x96, 0xa8, 0xbd, 0x29, 0x85, 0x52, 0xdc, 0xa3, 0x92, 0x20, 0xa1, 0xb0,
+ 0x45, 0x5a, 0x7e, 0xc3, 0x4c, 0x0b, 0x6f, 0x3a, 0xe4, 0xfe, 0x55, 0x01,
+ 0x49, 0x51, 0x06, 0xe7, 0xbb, 0x91, 0xd2, 0x77, 0x80, 0x1e, 0x07, 0xc7,
+ 0xe8, 0x60, 0x32, 0x58, 0xe6, 0x22, 0xb6, 0x20, 0x91, 0x2a, 0xd6, 0x92,
+ 0x9d, 0x96, 0x56, 0x25, 0x21, 0x5f, 0xea, 0xb5, 0xf5, 0xea, 0x17, 0x70,
+ 0x92, 0x14, 0xfb, 0x88, 0xd6, 0x15, 0x4a, 0xda, 0x65, 0x64, 0x66, 0x79,
+ 0x66, 0xcb, 0x38, 0x3e, 0xfc, 0xa1, 0xa0, 0x96, 0xf7, 0xb0, 0x4d, 0x87,
+ 0x80, 0x05, 0x1e, 0x85, 0xd8, 0xb8, 0xf8, 0x50, 0x3e, 0x9d, 0xc1, 0x83,
+ 0x81, 0x15, 0x59, 0x5d, 0x49, 0xd0, 0xed, 0x25, 0x2a, 0xf3, 0x59, 0xe4,
+ 0xc6, 0x4b, 0xc2, 0x0f, 0x19, 0x92, 0x2f, 0x7f, 0x96, 0xd0, 0x90, 0x08,
+ 0xef, 0x4f, 0x57, 0xa5, 0x3e, 0xec, 0xbe, 0xa5, 0x31, 0xd5, 0xcb, 0xbb,
+ 0xab, 0xde, 0x3b, 0xc8, 0x62, 0x8e, 0x35, 0x5b, 0x22, 0x0d, 0xf1, 0x4c,
+ 0xe3, 0x87, 0x40, 0xa1, 0xf7, 0x05, 0x96, 0x64, 0xe5, 0x60, 0xda, 0x8d,
+ 0x99, 0xc5, 0x30, 0xa3, 0x2e, 0x7e, 0xd5, 0x61, 0x2e, 0x8d, 0x71, 0x3b,
+ 0xe6, 0x35, 0x6c, 0x33, 0xc4, 0xca, 0xef, 0x8e, 0xd1, 0x1f, 0xa0, 0x72,
+ 0x7a, 0xec, 0x29, 0x4c, 0xee, 0xdd, 0x47, 0x58, 0x55, 0xde, 0x27, 0xc2,
+ 0xd6, 0xe9, 0xc6, 0x10, 0x5a, 0x06, 0xc3, 0x7d, 0x9c, 0x4a, 0x7f, 0x9d,
+ 0x0d, 0xc0, 0xea, 0xc7, 0x71, 0x00, 0xc7, 0x9f, 0x75, 0xad, 0xd3, 0xf0,
+ 0x61, 0xf4, 0x36, 0xeb, 0x69, 0x9d, 0x99, 0x59, 0x0d, 0x0a, 0x03, 0x19,
+ 0xf2, 0xcd, 0xb2, 0x24, 0xbb, 0x1c, 0x36, 0x5e, 0x2a, 0xf8, 0x68, 0xd8,
+ 0x14, 0x42, 0x16, 0x5a, 0xa2, 0xf4, 0xa6, 0x15, 0x4b, 0x72, 0x8e, 0x48,
+ 0x82, 0x86, 0xc1, 0x15, 0x69, 0xfb, 0x0d, 0x30, 0x2d, 0xbc, 0xeb, 0x93,
+ 0xf9, 0x54, 0x05, 0x25, 0x44, 0x1b, 0x9e, 0xee, 0x47, 0x49, 0xde, 0x00,
+ 0x78, 0x1f, 0x1f, 0xa1, 0x80, 0xc9, 0x63, 0x98, 0x8a, 0xd8, 0x82, 0x44,
+ 0xab, 0x5a, 0x4a, 0x76, 0x59, 0x58, 0x94, 0x85, 0x7f, 0xaa, 0xd7, 0xd7,
+ 0xa8, 0x5d, 0xc2, 0x48, 0x53, 0xee, 0x23, 0x58, 0x55, 0x2b, 0x69, 0x95,
+ 0x91, 0x99, 0xe5, 0x9b, 0x2c, 0xe0, 0xfb, 0xf2, 0x86, 0x82, 0x5b, 0xde,
+ 0xc1, 0x36, 0x1e, 0x00, 0x14, 0x7a, 0x17, 0x62, 0xe3, 0xe1, 0x40, 0xfa,
+ 0x77, 0x06, 0x0e, 0x04, 0x55, 0x65, 0x75, 0x27, 0x43, 0xb4, 0x94, 0xab,
+ 0xcd, 0x67, 0x93, 0x19, 0x2f, 0x08, 0x3c, 0x66, 0x48, 0xbd, 0xfe, 0x5b,
+ 0x42, 0x40, 0x23, 0xbd, 0x3d, 0x5e, 0x94, 0xfb, 0xb2, 0xfa, 0x94, 0xc7,
+ 0x57, 0x2e, 0xee, 0xaf, 0x78, 0xef, 0x21, 0x8a, 0x38, 0xd5, 0x6c, 0x88,
+ 0x37, 0xc5, 0x33, 0x8e, 0x1d, 0x02, 0x87, 0xdc, 0x16, 0x59, 0x93, 0x95,
+ 0x83, 0x6a, 0x36, 0x67, 0x14, 0xc2, 0x8c, 0xb9, 0xfb, 0x55, 0x84, 0xba,
+ 0x35, 0xc4, 0xef, 0x98, 0xd5, 0xb0, 0xcf, 0x13, 0x2b, 0xbe, 0x3b, 0x44,
+ 0x7e, 0x81, 0xc9, 0xeb, 0xb0, 0xa5, 0x33, 0xbb, 0x75, 0x1d, 0x61, 0x57,
+ 0x78, 0x9f, 0x0b, 0x5b, 0xa7, 0x18, 0x41, 0x68, 0x1b, 0x0d, 0xf6, 0x71,
+ 0x29, 0xfe, 0x74, 0x37, 0x03, 0xab, 0x1d, 0xc4, 0x03, 0x1e, 0x7d, 0xd6,
+ 0xb7, 0x4f, 0xc1, 0x87, 0xd0, 0xdb, 0xad, 0xa6, 0x76, 0x65, 0x64, 0x34,
+ 0x28, 0x0c, 0x67, 0xcb, 0x36, 0xc8, 0x92, 0xec, 0x70, 0xd9, 0x78, 0xab,
+ 0xe1, 0xa3, 0x60, 0x51, 0x08, 0x59, 0x6a, 0x8b, 0xd2, 0x98, 0x55, 0x2d,
+ 0xca, 0x39, 0x22, 0x0a, 0x1b, 0x04, 0x55, 0xa7, 0xec, 0x34, 0xc0, 0xb6,
+ 0xf3, 0xae, 0x4f, 0xe5, 0x50, 0x14, 0x95, 0x10, 0x6e, 0x7b, 0xb9, 0x1d,
+ 0x27, 0x78, 0x01, 0xe0, 0x7c, 0x7e, 0x86, 0x03, 0x25, 0x8e, 0x62, 0x2b,
+ 0x62, 0x09, 0x12, 0xad, 0x69, 0x29, 0xd9, 0x65, 0x62, 0x52, 0x15, 0xfe,
+ 0xab, 0x5f, 0x5e, 0xa1, 0x77, 0x09, 0x21, 0x4f, 0xb8, 0x8d, 0x61, 0x54,
+ 0xad, 0xa6, 0x56, 0x46, 0x67, 0x96, 0x6c, 0xb3, 0x83, 0xef, 0xca, 0x1a,
+ 0x09, 0x6f, 0x7b, 0x04, 0xd8, 0x78, 0x00, 0x51, 0xe8, 0x5d, 0x8b, 0x8f,
+ 0x85, 0x03, 0xe9, 0xdc, 0x18, 0x38, 0x11, 0x55, 0x95, 0xd4, 0x9d, 0x0e,
+ 0xd2, 0x52, 0xaf, 0x35, 0x9e, 0x4c, 0x64, 0xbc, 0x20, 0xf1, 0x99, 0x22,
+ 0xf7, 0xf9, 0x6d, 0x09, 0x00, 0x8e, 0xf4, 0xf5, 0x7a, 0x53, 0xee, 0xcb,
+ 0xea, 0x53, 0x1d, 0x5c, 0xbb, 0xba, 0xbd, 0xe3, 0xbc, 0x86, 0x28, 0xe3,
+ 0x55, 0xb2, 0x20, 0xdf, 0x14, 0xce, 0x38, 0x74, 0x0a, 0x1f, 0x70, 0x59,
+ 0x66, 0x4e, 0x56, 0x0d, 0xa8, 0xd9, 0x9c, 0x53, 0x0a, 0x32, 0xe7, 0xed,
+ 0x56, 0x12, 0xe8, 0xd7, 0x13, 0xbe, 0x63, 0x56, 0xc3, 0x3c, 0x4c, 0xae,
+ 0xf8, 0xed, 0x11, 0xfa, 0x07, 0x27, 0xae, 0xc2, 0x94, 0xce, 0xed, 0xd4,
+ 0x75, 0x85, 0x5d, 0xe2, 0x7c, 0x2d, 0x6e, 0x9c, 0x61, 0x05, 0xa0, 0x6c,
+ 0x37, 0xd9, 0xc4, 0xa7, 0xf9, 0xd0, 0xdc, 0x0e, 0xac, 0x77, 0x10, 0x0c,
+ 0x79, 0xf7, 0x5a, 0xdd, 0x3f, 0x06, 0x1f, 0x43, 0x6e, 0xb6, 0x99, 0xd9,
+ 0x95, 0x90, 0xd0, 0xa0, 0x31, 0x9f, 0x2c, 0xdb, 0x22, 0x4b, 0xb1, 0xc3,
+ 0x65, 0xe2, 0xaf, 0x86, 0x8d, 0x81, 0x44, 0x21, 0x65, 0xaa, 0x2f, 0x4a,
+ 0x61, 0x54, 0xb7, 0x28, 0xe4, 0x88, 0x28, 0x6c, 0x11, 0x56, 0x9f, 0xb0,
+ 0xd3, 0x02, 0xdb, 0xce, 0xb9, 0x3f, 0x95, 0x40, 0x52, 0x54, 0x41, 0xb9,
+ 0xee, 0xe4, 0x74, 0x9d, 0xe0, 0x07, 0x81, 0xf1, 0xfa, 0x18, 0x0c, 0x96,
+ 0x39, 0x88, 0xad, 0x88, 0x24, 0x4a, 0xb5, 0xa4, 0xa7, 0x65, 0x95, 0x89,
+ 0x48, 0x57, 0xfa, 0xad, 0x7d, 0x7a, 0x85, 0xdc, 0x24, 0x85, 0x3e, 0xe2,
+ 0x35, 0x85, 0x52, 0xb6, 0x99, 0x59, 0x19, 0x9e, 0x59, 0xb2, 0xce, 0x0f,
+ 0xbf, 0x28, 0x68, 0x25, 0xbd, 0xec, 0x13, 0x61, 0xe0, 0x01, 0x47, 0xa1,
+ 0x76, 0x2e, 0x3e, 0x14, 0x0f, 0xa7, 0x70, 0x60, 0xe0, 0x45, 0x56, 0x57,
+ 0x52, 0x74, 0x3b, 0x49, 0x4a, 0xbc, 0xd6, 0x79, 0x31, 0x92, 0xf0, 0x83,
+ 0xc6, 0x64, 0x8b, 0xdf, 0xe5, 0xb4, 0x24, 0x02, 0x3b, 0xd3, 0xd5, 0xe9,
+ 0x4f, 0xbb, 0x2f, 0xa9, 0x4c, 0x75, 0x72, 0xee, 0xea, 0xf7, 0x8e, 0xf2,
+ 0x18, 0xa3, 0x8d, 0x56, 0xc8, 0x83, 0x7c, 0x53, 0x38, 0xe1, 0xd0, 0x28,
+ 0x7d, 0xc1, 0x65, 0x99, 0x39, 0x58, 0x36, 0xa3, 0x66, 0x71, 0x4c, 0x28,
+ 0xcb, 0x9f, 0xb5, 0x58, 0x4b, 0xa3, 0x5c, 0x4e, 0xf9, 0x8d, 0x5b, 0x0c,
+ 0xf1, 0x32, 0xbb, 0xe3, 0xb4, 0x47, 0xe8, 0x1c, 0x9e, 0xbb, 0x0a, 0x53,
+ 0x3b, 0xb7, 0x51, 0xd6, 0x15, 0x77, 0x89, 0xf0, 0xb5, 0xba, 0x71, 0x84,
+ 0x16, 0x81, 0xb0, 0xdf, 0x67, 0x12, 0x9f, 0xe7, 0x43, 0x70, 0x3a, 0xb1,
+ 0xdc, 0x40, 0x31, 0xe7, 0xdd, 0x6b, 0x74, 0xfc, 0x18, 0x7d, 0x0d, 0xba,
+ 0xda, 0x67, 0x66, 0x56, 0x43, 0x42, 0x80, 0xc6, 0x7c, 0xb3, 0x6c, 0x89,
+ 0x2e, 0xc7, 0x0d, 0x97, 0x8a, 0xbe, 0x1a, 0x36, 0x05, 0x10, 0x85, 0x96,
+ 0xa8, 0xbd, 0x29, 0x85, 0x52, 0xdc, 0xa3, 0x92, 0x20, 0xa1, 0xb0, 0x45,
+ 0x5a, 0x7e, 0xc3, 0x4c, 0x0b, 0x6f, 0x3a, 0xe4, 0xfe, 0x55, 0x01, 0x49,
+ 0x51, 0x06, 0xe7, 0xbb, 0x91, 0xd2, 0x77, 0x80, 0x1e, 0x07, 0xc7, 0xe8,
+ 0x60, 0x32, 0x58, 0xe6, 0x22, 0xb6, 0x20, 0x91, 0x2a, 0xd6, 0x92, 0x9d,
+ 0x96, 0x56, 0x25, 0x21, 0x5f, 0xea, 0xb5, 0xf5, 0xea, 0x17, 0x70, 0x92,
+ 0x14, 0xfb, 0x88, 0xd6, 0x15, 0x4a, 0xda, 0x65, 0x64, 0x66, 0x79, 0x66,
+ 0xcb, 0x38, 0x3e, 0xfc, 0xa1, 0xa0, 0x96, 0xf7, 0xb0, 0x4d, 0x87, 0x80,
+ 0x05, 0x1e, 0x85, 0xd8, 0xb8, 0xf8, 0x50, 0x3e, 0x9d, 0xc1, 0x83, 0x81,
+ 0x15, 0x59, 0x5d, 0x49, 0xd0, 0xed, 0x25, 0x2a, 0xf3, 0x59, 0xe4, 0xc6,
+ 0x4b, 0xc2, 0x0f, 0x19, 0x92, 0x2f, 0x7f, 0x96, 0xd0, 0x90, 0x08, 0xef,
+ 0x4f, 0x57, 0xa5, 0x3e, 0xec, 0xbe, 0xa5, 0x31, 0xd5, 0xcb, 0xbb, 0xab,
+ 0xde, 0x3b, 0xc8, 0x62, 0x8e, 0x35, 0x5b, 0x22, 0x0d, 0xf1, 0x4c, 0xe3,
+ 0x87, 0x40, 0xa1, 0xf7, 0x05, 0x96, 0x64, 0xe5, 0x60, 0xda, 0x8d, 0x99,
+ 0xc5, 0x30, 0xa3, 0x2e, 0x7e, 0xd5, 0x61, 0x2e, 0x8d, 0x71, 0x3b, 0xe6,
+ 0x35, 0x6c, 0x33, 0xc4, 0xca, 0xef, 0x8e, 0xd1, 0x1f, 0xa0, 0x72, 0x7a,
+ 0xec, 0x29, 0x4c, 0xee, 0xdd, 0x47, 0x58, 0x55, 0xde, 0x27, 0xc2, 0xd6,
+ 0xe9, 0xc6, 0x10, 0x5a, 0x06, 0xc3, 0x7d, 0x9c, 0x4a, 0x7f, 0x9d, 0x0d,
+ 0xc0, 0xea, 0xc7, 0x71, 0x00, 0xc7, 0x9f, 0x75, 0xad, 0xd3, 0xf0, 0x61,
+ 0xf4, 0x36, 0xeb, 0x69, 0x9d, 0x99, 0x59, 0x0d, 0x0a, 0x03, 0x19, 0xf2,
+ 0xcd, 0xb2, 0x24, 0xbb, 0x1c, 0x36, 0x5e, 0x2a, 0xf8, 0x68, 0xd8, 0x14,
+ 0x42, 0x16, 0x5a, 0xa2, 0xf4, 0xa6, 0x15, 0x4b, 0x72, 0x8e, 0x48, 0x82,
+ 0x86, 0xc1, 0x15, 0x69, 0xfb, 0x0d, 0x30, 0x2d, 0xbc, 0xeb, 0x93, 0xf9,
+ 0x54, 0x05, 0x25, 0x44, 0x1b, 0x9e, 0xee, 0x47, 0x49, 0xde, 0x00, 0x78,
+ 0x1f, 0x1f, 0xa1, 0x80, 0xc9, 0x63, 0x98, 0x8a, 0xd8, 0x82, 0x44, 0xab,
+ 0x5a, 0x4a, 0x76, 0x59, 0x58, 0x94, 0x85, 0x7f, 0xaa, 0xd7, 0xd7, 0xa8,
+ 0x5d, 0xc2, 0x48, 0x53, 0xee, 0x23, 0x58, 0x55, 0x2b, 0x69, 0x95, 0x91,
+ 0x99, 0xe5, 0x9b, 0x2c, 0xe0, 0xfb, 0xf2, 0x86, 0x82, 0x5b, 0xde, 0xc1,
+ 0x36, 0x1e, 0x00, 0x14, 0x7a, 0x17, 0x62, 0xe3, 0xe1, 0x40, 0xfa, 0x77,
+ 0x06, 0x0e, 0x04, 0x55, 0x65, 0x75, 0x27, 0x43, 0xb4, 0x94, 0xab, 0xcd,
+ 0x67, 0x93, 0x19, 0x2f, 0x08, 0x3c, 0x66, 0x48, 0xbd, 0xfe, 0x5b, 0x42,
+ 0x40, 0x23, 0xbd, 0x3d, 0x5e, 0x94, 0xfb, 0xb2, 0xfa, 0x94, 0xc7, 0x57,
+ 0x2e, 0xee, 0xaf, 0x78, 0xef, 0x21, 0x8a, 0x38, 0xd5, 0x6c, 0x88, 0x37,
+ 0xc5, 0x33, 0x8e, 0x1d, 0x02, 0x87, 0xdc, 0x16, 0x59, 0x93, 0x95, 0x83,
+ 0x6a, 0x36, 0x67, 0x14, 0xc2, 0x8c, 0xb9, 0xfb, 0x55, 0x84, 0xba, 0x35,
+ 0xc4, 0xef, 0x98, 0xd5, 0xb0, 0xcf, 0x13, 0x2b, 0xbe, 0x3b, 0x44, 0x7e,
+ 0x81, 0xca,
+};
+static_assert(sizeof(kBytesTestReadSymbol13) == kNumBytesTestReadSymbol13, "");
+
+// The kBytesTestReadSymbol14[] array was encoded by using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][15] = {
+// // pmf: 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14,
+// // 1/14, 1/14, 1/14
+// { 32768 - 2341, 32768 - 4681, 32768 - 7022, 32768 - 9362, 32768 - 11703,
+// 32768 - 14043, 32768 - 16384, 32768 - 18725, 32768 - 21065,
+// 32768 - 23406, 32768 - 25746, 32768 - 28087, 32768 - 30427, 0, 0 },
+// // pmf: 3/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28,
+// // 2/28, 2/28, 1/28
+// { 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533, 32768 - 12873,
+// 32768 - 15214, 32768 - 17554, 32768 - 19895, 32768 - 22235,
+// 32768 - 24576, 32768 - 26917, 32768 - 29257, 32768 - 31598, 0, 0 },
+// // pmf: 1/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28,
+// // 2/28, 2/28, 3/28
+// { 32768 - 1170, 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533,
+// 32768 - 12873, 32768 - 15214, 32768 - 17554, 32768 - 19895,
+// 32768 - 22235, 32768 - 24576, 32768 - 26917, 32768 - 29257, 0, 0 },
+// // pmf: 1/28, 2/28, 2/28, 2/28, 2/28, 2/28, 3/28, 3/28, 2/28, 2/28, 2/28,
+// // 2/28, 2/28, 1/28
+// { 32768 - 1170, 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533,
+// 32768 - 12873, 32768 - 16384, 32768 - 19895, 32768 - 22235,
+// 32768 - 24576, 32768 - 26917, 32768 - 29257, 32768 - 31598, 0, 0 },
+// };
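+// (Each aom_cdf_prob value above uses libaom's inverted-CDF convention: the
+// stored entry is 32768 minus the running cumulative probability, so
+// 32768 - 2341 corresponds to a cumulative mass of about 1/14 of 32768.)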
+// constexpr int kSymbols[28][4] = { { 0, 7, 13, 6 }, //
+// { 1, 8, 12, 5 }, //
+// { 2, 9, 11, 4 }, //
+// { 3, 10, 10, 3 }, //
+// { 4, 11, 9, 2 }, //
+// { 5, 12, 8, 1 }, //
+// { 6, 13, 7, 0 }, //
+// { 7, 0, 6, 13 }, //
+// { 8, 1, 5, 12 }, //
+// { 9, 2, 4, 11 }, //
+// { 10, 3, 3, 10 }, //
+// { 11, 4, 2, 9 }, //
+// { 12, 5, 1, 8 }, //
+// { 13, 6, 0, 7 }, //
+// { 0, 0, 13, 11 }, //
+// { 2, 1, 12, 9 }, //
+// { 4, 3, 10, 7 }, //
+// { 6, 5, 8, 5 }, //
+// { 8, 7, 6, 3 }, //
+// { 10, 9, 4, 1 }, //
+// { 12, 11, 2, 12 }, //
+// { 1, 0, 13, 10 }, //
+// { 3, 2, 11, 8 }, //
+// { 5, 4, 9, 6 }, //
+// { 7, 6, 7, 4 }, //
+// { 9, 8, 5, 2 }, //
+// { 11, 10, 3, 7 }, //
+// { 13, 12, 1, 6 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 64; ++i) {
+// for (int j = 0; j < 28; ++j) {
+// for (int k = 0; k < 4; ++k) {
+// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 14);
+// }
+// }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+// if (count++ % 12 == 0) {
+// printf("\n ");
+// } else {
+// printf(" ");
+// }
+// printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
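+//
+// For reference, a decode-side sketch (an assumption: it uses libgav1's
+// DaalaBitReader interface from src/utils/entropy_decoder.h, with a cdf
+// array reinitialized to the values above, since encoding updated the
+// original) would read the symbols back in the same order:
+//
+//   DaalaBitReader reader(kBytesTestReadSymbol14, kNumBytesTestReadSymbol14,
+//                         /*allow_update_cdf=*/true);
+//   for (int i = 0; i < 64; ++i) {
+//     for (int j = 0; j < 28; ++j) {
+//       for (int k = 0; k < 4; ++k) {
+//         assert(reader.ReadSymbol(cdf[k], 14) == kSymbols[j][k]);
+//       }
+//     }
+//   }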
+
+constexpr size_t kNumBytesTestReadSymbol14 = 3455;
+constexpr uint8_t kBytesTestReadSymbol14[] = {
+ 0x0a, 0xef, 0xeb, 0xb5, 0x78, 0x91, 0x0b, 0x9d, 0xee, 0x99, 0x14, 0x9c,
+ 0xf4, 0x58, 0x86, 0xe8, 0x69, 0x7f, 0x06, 0x07, 0x60, 0xb0, 0x79, 0xbe,
+ 0xea, 0xe5, 0x69, 0x1c, 0x67, 0x7a, 0x75, 0x91, 0x2f, 0x1d, 0x49, 0x4e,
+ 0x15, 0x40, 0x56, 0x15, 0xa1, 0xff, 0x72, 0x2d, 0xa5, 0x40, 0x81, 0x21,
+ 0x3d, 0x06, 0x78, 0xd2, 0x62, 0x8a, 0xf2, 0x63, 0x50, 0x9d, 0xbd, 0xa0,
+ 0xd4, 0x14, 0x42, 0x76, 0x4f, 0x44, 0xbe, 0xb2, 0xa1, 0x0d, 0x4c, 0x75,
+ 0xe4, 0x4a, 0xed, 0xf9, 0x7e, 0xb8, 0x7b, 0x5a, 0x26, 0x78, 0x5f, 0xe3,
+ 0x86, 0x72, 0x64, 0x48, 0x76, 0x51, 0x7a, 0x77, 0x3b, 0xcf, 0xa2, 0x8d,
+ 0x31, 0xec, 0xc1, 0xa7, 0xf9, 0x9a, 0x76, 0x00, 0x7c, 0x17, 0x40, 0x03,
+ 0x12, 0xe8, 0xed, 0xbf, 0x39, 0xe2, 0xdd, 0x6d, 0xdc, 0xe2, 0x34, 0xdf,
+ 0x0d, 0xa6, 0x86, 0x22, 0xca, 0x86, 0x5f, 0x57, 0x25, 0xc6, 0x57, 0x60,
+ 0xc3, 0x06, 0xe9, 0xf0, 0x06, 0xd4, 0xc0, 0xb3, 0xfc, 0x5b, 0xcd, 0xa9,
+ 0xc0, 0x51, 0x6e, 0x10, 0x0a, 0x5a, 0xfd, 0xbf, 0x92, 0xc8, 0x21, 0x0e,
+ 0x83, 0x74, 0xfe, 0x01, 0xec, 0x24, 0x61, 0x9d, 0x9e, 0xb8, 0xb2, 0x04,
+ 0xa7, 0xe9, 0xd6, 0xc7, 0x79, 0x5b, 0xaa, 0xdd, 0x94, 0x5d, 0x26, 0x61,
+ 0x0b, 0xee, 0x66, 0xf4, 0xb2, 0xd1, 0x9b, 0xf0, 0xb4, 0x9b, 0x50, 0x4c,
+ 0x4a, 0x57, 0xbc, 0xfe, 0x7e, 0xca, 0xfe, 0xa8, 0x22, 0x1b, 0x2f, 0x4a,
+ 0x26, 0x32, 0x96, 0xfd, 0x03, 0x02, 0x1b, 0x7c, 0x1d, 0x6d, 0x42, 0x48,
+ 0x2b, 0x11, 0x0d, 0x8f, 0x40, 0xb8, 0x15, 0xf1, 0xdd, 0x06, 0xf7, 0xa0,
+ 0x1f, 0x0f, 0x75, 0xb1, 0x53, 0x73, 0x1f, 0xbf, 0x97, 0xf7, 0xa0, 0xcb,
+ 0x5b, 0x98, 0xb7, 0x50, 0xa7, 0xc5, 0x23, 0x9b, 0x16, 0x0a, 0x2e, 0x03,
+ 0x68, 0x3a, 0x92, 0x75, 0xb8, 0xb0, 0xd8, 0xda, 0x2e, 0x82, 0x61, 0x3f,
+ 0xa0, 0x6e, 0x78, 0xe5, 0x7d, 0x14, 0xe5, 0x1f, 0x7b, 0xec, 0xb5, 0x14,
+ 0xb7, 0xa0, 0x72, 0xdc, 0x1a, 0x23, 0xa4, 0x5b, 0xc5, 0xc2, 0x75, 0x6a,
+ 0x7c, 0x36, 0xef, 0xf0, 0xd1, 0x5a, 0x34, 0x31, 0x0b, 0xae, 0x4c, 0x07,
+ 0xc2, 0xb7, 0xab, 0xd5, 0x67, 0xed, 0x65, 0x5e, 0xa0, 0x7e, 0x16, 0x04,
+ 0xc6, 0x1b, 0x74, 0x0f, 0xa9, 0x35, 0xe8, 0x71, 0x83, 0xca, 0xc3, 0x21,
+ 0x74, 0xf5, 0xee, 0x71, 0xd1, 0x4c, 0xa2, 0x1d, 0xce, 0x16, 0x4b, 0x9b,
+ 0xb0, 0x9f, 0x42, 0x08, 0x49, 0x6a, 0x82, 0x66, 0xe8, 0xb2, 0xce, 0xfd,
+ 0x8e, 0xdb, 0x9e, 0x9e, 0xeb, 0x4b, 0x3d, 0xbb, 0xab, 0x61, 0xe4, 0x0d,
+ 0x87, 0x8e, 0xe9, 0x7b, 0xe8, 0x57, 0x70, 0x8c, 0xab, 0x0c, 0x0f, 0x05,
+ 0x4b, 0xca, 0x6d, 0xe7, 0x94, 0x2b, 0x29, 0x28, 0xfd, 0xfa, 0x11, 0x4c,
+ 0x08, 0x51, 0xce, 0x45, 0x70, 0x87, 0x2b, 0xcf, 0x88, 0x80, 0x87, 0x38,
+ 0x80, 0x5d, 0x2e, 0x8f, 0x47, 0xd8, 0x5e, 0x75, 0x66, 0xa7, 0x86, 0x5e,
+ 0x98, 0xd4, 0x1b, 0x00, 0x11, 0xcf, 0x7b, 0xef, 0x8b, 0x17, 0x93, 0xe0,
+ 0x3a, 0x90, 0x7d, 0x0b, 0x45, 0x34, 0x2a, 0x67, 0xa4, 0x0e, 0xab, 0xc3,
+ 0x3b, 0x27, 0x68, 0x03, 0x4d, 0xcb, 0xd5, 0x87, 0x53, 0x37, 0xe5, 0xcc,
+ 0xc3, 0x73, 0x4a, 0x2c, 0x5f, 0xdc, 0x8d, 0xba, 0x6c, 0x11, 0xa0, 0x35,
+ 0xc6, 0xbe, 0xd9, 0xd6, 0x64, 0x2e, 0x4b, 0x85, 0xbf, 0x50, 0xdd, 0xa6,
+ 0xa0, 0xa4, 0x23, 0xd7, 0x82, 0xb6, 0x65, 0x4e, 0xa8, 0xd4, 0x19, 0xa1,
+ 0xe4, 0xc8, 0x4d, 0x69, 0x2a, 0x41, 0x4f, 0x1e, 0x46, 0xb1, 0xde, 0x64,
+ 0x0b, 0xf8, 0x62, 0xfe, 0x27, 0xc5, 0x2e, 0x31, 0x0f, 0x40, 0xae, 0x64,
+ 0x86, 0x2a, 0x36, 0x7e, 0x03, 0x01, 0x37, 0xf3, 0x36, 0x42, 0x3f, 0xaa,
+ 0x0b, 0xdd, 0xa9, 0x3e, 0x09, 0xe2, 0xe9, 0xea, 0x15, 0x5b, 0x0d, 0x4b,
+ 0xcc, 0x47, 0xa5, 0x24, 0xed, 0x0b, 0x3c, 0xb3, 0x6e, 0xc6, 0x1d, 0x47,
+ 0x39, 0x30, 0xe6, 0xf6, 0xc7, 0xae, 0x6b, 0x25, 0x09, 0xce, 0xf2, 0x2f,
+ 0xaf, 0x4d, 0x32, 0xac, 0x4f, 0xa4, 0xff, 0x39, 0x48, 0xbb, 0xe6, 0xdf,
+ 0x93, 0x41, 0x00, 0x2a, 0x82, 0xd9, 0x81, 0x79, 0xc4, 0x65, 0xf3, 0x62,
+ 0x17, 0x18, 0x37, 0xcf, 0xa0, 0xaa, 0xe5, 0xc6, 0x97, 0x84, 0x14, 0x1c,
+ 0x7e, 0x36, 0x72, 0xe2, 0x35, 0x84, 0x39, 0x43, 0x7b, 0xbf, 0xaf, 0x94,
+ 0x9a, 0xa2, 0xeb, 0xf9, 0xc4, 0x5c, 0x49, 0x5a, 0xef, 0x6b, 0xe6, 0x19,
+ 0x0e, 0xac, 0x08, 0x43, 0x4d, 0x5a, 0x14, 0x7e, 0x27, 0x4a, 0xd1, 0x4a,
+ 0x9b, 0x3f, 0xdc, 0x98, 0x5a, 0xcb, 0x40, 0x90, 0xdf, 0x56, 0xa1, 0x76,
+ 0x12, 0x71, 0xe1, 0x20, 0x5e, 0xf1, 0xaa, 0xd7, 0xba, 0x6c, 0xfb, 0x1d,
+ 0x20, 0xfe, 0xa0, 0x41, 0x65, 0x09, 0x5f, 0x8b, 0xde, 0x20, 0xb7, 0x26,
+ 0xd5, 0xce, 0x83, 0x14, 0x0d, 0x28, 0x36, 0x86, 0xe1, 0x02, 0x86, 0xde,
+ 0xf3, 0xc6, 0x44, 0x10, 0x04, 0x84, 0x9f, 0x18, 0x9b, 0xf1, 0x0a, 0xca,
+ 0x41, 0x53, 0xa9, 0xa9, 0x6b, 0xa5, 0x95, 0x22, 0x1d, 0x17, 0x3b, 0xc0,
+ 0x5f, 0xb7, 0x5e, 0xac, 0x73, 0x4e, 0x76, 0xaf, 0x4c, 0xb4, 0x4f, 0xf6,
+ 0x3f, 0xa1, 0x20, 0x2e, 0xf7, 0xa8, 0x14, 0x0d, 0xc3, 0x50, 0x97, 0x25,
+ 0xe0, 0xc4, 0x5c, 0x3e, 0xe6, 0xbe, 0xe9, 0xa4, 0x1e, 0x1d, 0xdb, 0x06,
+ 0xc1, 0x15, 0xf2, 0x6d, 0xbf, 0x71, 0xf2, 0x0b, 0xd9, 0x75, 0x4b, 0x38,
+ 0xf5, 0xe2, 0x69, 0x0d, 0x93, 0xa5, 0x8e, 0x4c, 0xc5, 0x2a, 0xb6, 0x45,
+ 0x60, 0x77, 0xd6, 0x14, 0x39, 0x5e, 0x70, 0x9e, 0x8d, 0x07, 0x20, 0x1c,
+ 0x05, 0xc9, 0xb0, 0x46, 0xf7, 0x6c, 0x3e, 0xf8, 0xf8, 0x0a, 0xad, 0x0b,
+ 0x22, 0x5e, 0x32, 0xbd, 0x46, 0xbc, 0x06, 0x7b, 0x92, 0x36, 0x5a, 0x2b,
+ 0xac, 0x68, 0x2d, 0x5a, 0xf4, 0xc2, 0x61, 0xe3, 0x9d, 0xf4, 0x5d, 0x59,
+ 0x59, 0x98, 0xb7, 0x5a, 0x73, 0x08, 0xf6, 0x4f, 0x0a, 0x75, 0x04, 0x93,
+ 0xc1, 0xe1, 0x9b, 0xe0, 0xb0, 0x2a, 0xf7, 0xdd, 0x8b, 0xae, 0xf5, 0x55,
+ 0x28, 0x6b, 0x21, 0x9b, 0x02, 0x43, 0xbd, 0x36, 0x4d, 0xa5, 0x17, 0xbb,
+ 0x97, 0xd4, 0x78, 0x1f, 0xe8, 0xd9, 0x98, 0x0e, 0x41, 0x96, 0x52, 0xab,
+ 0xad, 0x91, 0x92, 0xae, 0x62, 0x5c, 0xe7, 0xeb, 0x24, 0x1b, 0xe8, 0x2a,
+ 0xb2, 0xe8, 0xdc, 0x34, 0x7f, 0xe9, 0xa1, 0x4c, 0x4c, 0x13, 0xeb, 0x31,
+ 0x29, 0xc3, 0xc4, 0xf5, 0xb4, 0x50, 0xb1, 0x8b, 0x08, 0xc3, 0x30, 0xf8,
+ 0x40, 0xd8, 0x76, 0xd5, 0x4d, 0xf0, 0xc2, 0xd8, 0x67, 0x75, 0x01, 0x81,
+ 0x2a, 0xe0, 0x6b, 0xc0, 0xf5, 0x30, 0x55, 0xb6, 0xa9, 0x52, 0x19, 0xc4,
+ 0x73, 0x78, 0xc4, 0x9e, 0x13, 0x5f, 0xa7, 0x56, 0xb4, 0x07, 0x2c, 0x92,
+ 0x85, 0x66, 0x5d, 0x00, 0x47, 0x32, 0x3c, 0x8b, 0xbf, 0x86, 0x9e, 0xe2,
+ 0xfd, 0xf1, 0xf0, 0x15, 0x5a, 0x16, 0x44, 0xbc, 0x65, 0x7a, 0x8d, 0x78,
+ 0x0c, 0xf9, 0x94, 0x1d, 0x83, 0x7c, 0xee, 0xc7, 0x71, 0x23, 0x42, 0x2d,
+ 0xb3, 0xe4, 0x68, 0x31, 0xec, 0x17, 0x63, 0x27, 0xe3, 0x52, 0x9d, 0xd0,
+ 0xcd, 0xd8, 0xd8, 0x86, 0xb4, 0x91, 0x8a, 0xa3, 0xcb, 0xa3, 0x76, 0xc7,
+ 0x98, 0xda, 0xd6, 0xb8, 0x34, 0x1c, 0xf6, 0x72, 0x23, 0xd8, 0x1b, 0xbe,
+ 0x2d, 0x05, 0xe1, 0x83, 0x01, 0x74, 0xc7, 0xe3, 0x54, 0x85, 0xec, 0xec,
+ 0xfb, 0x3a, 0xa2, 0xf3, 0x21, 0x7a, 0x0b, 0x68, 0x91, 0x02, 0xd2, 0xa4,
+ 0x40, 0x21, 0xef, 0x4f, 0xe5, 0x3d, 0x6d, 0x6e, 0xfb, 0xba, 0xb1, 0x90,
+ 0x4f, 0x81, 0x07, 0x27, 0x5e, 0xa8, 0xab, 0xa8, 0x87, 0x38, 0x3c, 0xe5,
+ 0x48, 0x29, 0x9e, 0x77, 0x4c, 0xb4, 0x9d, 0x91, 0x2d, 0x8a, 0x0a, 0x84,
+ 0xdd, 0x93, 0x95, 0xdf, 0xd4, 0xa3, 0x8f, 0xb7, 0xaf, 0x07, 0xd3, 0x81,
+ 0xbb, 0x0d, 0x89, 0x42, 0x92, 0x0b, 0x66, 0x39, 0x8b, 0x99, 0x36, 0x61,
+ 0xbb, 0xe1, 0x05, 0xca, 0x68, 0xc8, 0x0f, 0xae, 0x9e, 0x7d, 0x75, 0x7f,
+ 0x24, 0xef, 0xdc, 0x97, 0x8d, 0xb9, 0xa5, 0x7a, 0x3c, 0xc4, 0x49, 0x79,
+ 0x47, 0x47, 0x61, 0x88, 0xaf, 0x96, 0x08, 0x11, 0x22, 0xff, 0xb7, 0x14,
+ 0x12, 0x15, 0x14, 0x26, 0xa3, 0x03, 0x0e, 0xb2, 0xff, 0x57, 0x9e, 0xc0,
+ 0x92, 0x4f, 0x4c, 0x69, 0xd4, 0xfe, 0xc1, 0x46, 0xc4, 0xe8, 0x64, 0x7f,
+ 0x08, 0x38, 0x90, 0x15, 0x8f, 0xc2, 0xc8, 0xa8, 0x50, 0x7f, 0x74, 0x4a,
+ 0xc3, 0x37, 0x52, 0x44, 0x25, 0x78, 0x19, 0x48, 0x00, 0xd1, 0x39, 0x43,
+ 0x3a, 0x14, 0x72, 0x8c, 0x8e, 0xa2, 0xf8, 0x95, 0x1e, 0x56, 0x07, 0xdd,
+ 0xcd, 0x89, 0xde, 0x71, 0xc3, 0x85, 0xc3, 0xcf, 0xe4, 0x6c, 0xf4, 0x43,
+ 0x95, 0x49, 0x27, 0x25, 0x35, 0x1a, 0xb9, 0xf7, 0xc8, 0x20, 0xeb, 0x01,
+ 0xbb, 0x49, 0x8d, 0xf4, 0xc0, 0x32, 0xbe, 0x74, 0x42, 0x07, 0x53, 0xd0,
+ 0xf4, 0x4c, 0x79, 0xa8, 0xb7, 0xf9, 0x09, 0xfd, 0xeb, 0x02, 0x83, 0x26,
+ 0x3b, 0x88, 0x1a, 0x41, 0x70, 0x95, 0x2f, 0x53, 0xc1, 0xc1, 0xa5, 0xbe,
+ 0x23, 0x32, 0x8b, 0x48, 0xb8, 0xff, 0x4c, 0x6b, 0x6e, 0xbf, 0xd7, 0xe0,
+ 0xf1, 0x3a, 0xfd, 0xd2, 0x1e, 0xa2, 0x11, 0x50, 0xa0, 0xfe, 0xd2, 0x3d,
+ 0x20, 0xa6, 0x79, 0xdd, 0x32, 0xd2, 0x76, 0x44, 0xb6, 0x28, 0x2a, 0x13,
+ 0x76, 0x4e, 0x57, 0x92, 0xa5, 0x01, 0x64, 0x30, 0x06, 0xf1, 0xba, 0x62,
+ 0x5a, 0x59, 0xab, 0xf2, 0x15, 0xef, 0x3c, 0x24, 0x96, 0x14, 0x6f, 0xd4,
+ 0x51, 0xee, 0x6d, 0xeb, 0x77, 0xad, 0xba, 0x03, 0xe0, 0xd2, 0x30, 0xbd,
+ 0xbf, 0x06, 0x14, 0xa3, 0xad, 0xd7, 0x97, 0x20, 0x89, 0x63, 0x8f, 0x84,
+ 0x0d, 0x87, 0x6d, 0x5b, 0xdf, 0x0c, 0x2d, 0x86, 0x77, 0x6b, 0x73, 0xd6,
+ 0x34, 0x83, 0xe5, 0x15, 0x88, 0x3e, 0xbc, 0x4d, 0x2c, 0x96, 0xd1, 0x1a,
+ 0x81, 0xf1, 0xb4, 0x6c, 0xaa, 0x52, 0x3a, 0x53, 0x52, 0xc6, 0x73, 0x1b,
+ 0xe6, 0xaa, 0xd5, 0xc8, 0x91, 0xee, 0x72, 0xad, 0x66, 0x25, 0x61, 0xbd,
+ 0xa7, 0x15, 0x46, 0x5d, 0x76, 0x4a, 0x47, 0x9b, 0x03, 0x44, 0xe5, 0x0c,
+ 0xe8, 0x51, 0xca, 0x32, 0x3a, 0x8b, 0xe2, 0x54, 0x79, 0x4d, 0x51, 0x4e,
+ 0xbb, 0x44, 0x2c, 0x30, 0xd1, 0xe6, 0xa1, 0xc9, 0x2c, 0x28, 0xdf, 0xa8,
+ 0xa3, 0xdc, 0xdb, 0xd6, 0xef, 0x5b, 0x74, 0x07, 0xc1, 0xa4, 0x55, 0x37,
+ 0xc6, 0xfc, 0xde, 0xf2, 0x35, 0xb3, 0xf2, 0x3f, 0xe8, 0x0c, 0xbe, 0x60,
+ 0x72, 0x56, 0xde, 0x5f, 0x0d, 0xdd, 0x2e, 0x67, 0x63, 0x31, 0x23, 0xbc,
+ 0xbe, 0x8d, 0x47, 0xdd, 0xa0, 0x38, 0xab, 0x04, 0xd7, 0xb7, 0x07, 0xf9,
+ 0x5d, 0x5e, 0x27, 0xd0, 0x6e, 0xda, 0x01, 0xda, 0x8b, 0x3d, 0xe9, 0x89,
+ 0xe4, 0xbb, 0xeb, 0x3d, 0xd2, 0xb1, 0x16, 0x16, 0xe6, 0x49, 0xb6, 0x28,
+ 0x02, 0xc3, 0xd0, 0x57, 0x17, 0x4f, 0x2a, 0x9b, 0x42, 0x74, 0x1d, 0x38,
+ 0xc4, 0x19, 0xdd, 0xad, 0xcf, 0x58, 0xd2, 0x0f, 0x94, 0x56, 0x20, 0xfa,
+ 0xf1, 0x34, 0xb2, 0x5b, 0x44, 0x6a, 0x07, 0xc6, 0xd1, 0xb2, 0xa9, 0x48,
+ 0xe9, 0x4d, 0x4b, 0x19, 0xcc, 0x6f, 0x9a, 0xab, 0x57, 0x22, 0x47, 0xb9,
+ 0xca, 0xb5, 0x98, 0x88, 0x58, 0x15, 0xe1, 0x37, 0x7b, 0x18, 0xdc, 0xea,
+ 0x45, 0xad, 0xc7, 0xc3, 0xb4, 0xeb, 0xcb, 0x85, 0x2c, 0x31, 0xa6, 0x5e,
+ 0x6a, 0x9d, 0xb6, 0x45, 0x19, 0x42, 0x5a, 0x2d, 0xe7, 0x15, 0x99, 0x8d,
+ 0xe5, 0x5b, 0x09, 0x52, 0x8e, 0x4d, 0xf1, 0xec, 0xb3, 0xb1, 0xf5, 0xfe,
+ 0x79, 0xb0, 0x4a, 0x4f, 0xb6, 0xbe, 0x18, 0x84, 0xe6, 0xaa, 0xb0, 0xe5,
+ 0x76, 0x3c, 0x35, 0x51, 0xd2, 0xa6, 0xf3, 0xfb, 0xe3, 0x1b, 0xf5, 0xc4,
+ 0x4f, 0x56, 0x3a, 0xc7, 0x41, 0x8d, 0xd7, 0x9e, 0x1e, 0xc9, 0x9c, 0xd8,
+ 0xd4, 0xe3, 0x4f, 0xb5, 0xfd, 0x78, 0x5e, 0x60, 0xff, 0xd3, 0xdc, 0x00,
+ 0xd6, 0x02, 0xba, 0x09, 0x8b, 0x93, 0xc9, 0xb4, 0x8e, 0x4e, 0x21, 0x27,
+ 0x5e, 0x89, 0x6c, 0x31, 0x79, 0xfc, 0xf0, 0xd8, 0xac, 0x48, 0x52, 0x7d,
+ 0xae, 0xc8, 0x4b, 0xef, 0x06, 0xde, 0xa4, 0xd3, 0x01, 0x46, 0xb2, 0xd6,
+ 0x28, 0x45, 0xd9, 0xcb, 0x63, 0x32, 0x19, 0x3e, 0xbf, 0x13, 0x99, 0x7f,
+ 0xdd, 0x0b, 0x25, 0x72, 0x57, 0x7a, 0x89, 0x68, 0xa4, 0xde, 0x98, 0xfc,
+ 0xa8, 0xbc, 0xf2, 0xc1, 0x82, 0x28, 0x59, 0xf7, 0x6b, 0x83, 0x60, 0x57,
+ 0x84, 0xdd, 0xec, 0x63, 0x73, 0xa9, 0x16, 0xb7, 0x1f, 0x0e, 0xd3, 0xaf,
+ 0x2e, 0x14, 0xb0, 0xc6, 0x99, 0x79, 0xaa, 0x76, 0xd9, 0x14, 0x65, 0x09,
+ 0x68, 0xb7, 0x9c, 0x56, 0x66, 0x37, 0x95, 0x6c, 0x25, 0x4a, 0x39, 0x37,
+ 0xc7, 0xb2, 0xce, 0xc7, 0xd7, 0xf9, 0xe6, 0xc1, 0x29, 0x3e, 0xda, 0xf8,
+ 0x62, 0x13, 0x9a, 0xaa, 0xc3, 0x95, 0xd8, 0xf0, 0xd5, 0x47, 0x4a, 0x9b,
+ 0xcf, 0xef, 0x8c, 0x6f, 0xd7, 0x11, 0x3d, 0x58, 0xeb, 0x1d, 0x06, 0x37,
+ 0x5e, 0x78, 0x7b, 0x26, 0x73, 0x63, 0x53, 0x8d, 0x3e, 0xd7, 0xf5, 0xe1,
+ 0x79, 0x83, 0xff, 0x4f, 0x70, 0x03, 0x58, 0x0a, 0xe8, 0x26, 0x2e, 0x4f,
+ 0x26, 0xd2, 0x39, 0x38, 0x84, 0x9d, 0x7a, 0x25, 0xb0, 0xc5, 0xe7, 0xf3,
+ 0xc3, 0x62, 0xb1, 0x21, 0x49, 0xf6, 0xbb, 0x21, 0x2f, 0xbc, 0x1b, 0x7a,
+ 0x93, 0x4c, 0x05, 0x1a, 0xcb, 0x58, 0xa1, 0x17, 0x67, 0x2d, 0x8c, 0xc8,
+ 0x64, 0xfa, 0xfc, 0x4e, 0x65, 0xff, 0x74, 0x2c, 0x95, 0xc9, 0x5d, 0xea,
+ 0x25, 0xa2, 0x93, 0x7a, 0x63, 0xf2, 0xa2, 0xf3, 0xcb, 0x06, 0x08, 0xa1,
+ 0x67, 0xdd, 0xae, 0x0d, 0x81, 0x5e, 0x13, 0x77, 0xb1, 0x8d, 0xce, 0xa4,
+ 0x5a, 0xdc, 0x7c, 0x3b, 0x4e, 0xbc, 0xb8, 0x52, 0xc3, 0x1a, 0x65, 0xe6,
+ 0xa9, 0xdb, 0x64, 0x51, 0x94, 0x25, 0xa2, 0xde, 0x71, 0x59, 0x98, 0xde,
+ 0x55, 0xb0, 0x95, 0x28, 0xe4, 0xdf, 0x1e, 0xcb, 0x3b, 0x1f, 0x5f, 0xe7,
+ 0x9b, 0x04, 0xa4, 0xfb, 0x6b, 0xe1, 0x88, 0x4e, 0x6a, 0xab, 0x0e, 0x57,
+ 0x63, 0xc3, 0x55, 0x1d, 0x2a, 0x6f, 0x3f, 0xbe, 0x31, 0xbf, 0x5c, 0x44,
+ 0xf5, 0x63, 0xac, 0x74, 0x18, 0xdd, 0x79, 0xe1, 0xec, 0x99, 0xcd, 0x8d,
+ 0x4e, 0x34, 0xfb, 0x5f, 0xd7, 0x85, 0xe6, 0x0f, 0xfd, 0x3d, 0xc0, 0x0d,
+ 0x60, 0x2b, 0xa0, 0x98, 0xb9, 0x3c, 0x9b, 0x48, 0xe4, 0xe2, 0x12, 0x75,
+ 0xe8, 0x96, 0xc3, 0x17, 0x9f, 0xcf, 0x0d, 0x8a, 0xc4, 0x85, 0x27, 0xda,
+ 0xec, 0x84, 0xbe, 0xf0, 0x6d, 0xea, 0x4d, 0x30, 0x14, 0x6b, 0x2d, 0x62,
+ 0x84, 0x5d, 0x9c, 0xb6, 0x33, 0x21, 0x93, 0xeb, 0xf1, 0x39, 0x97, 0xfd,
+ 0xd0, 0xb2, 0x57, 0x25, 0x77, 0xa8, 0x96, 0x8a, 0x4d, 0xe9, 0x8f, 0xca,
+ 0x8b, 0xcf, 0x2c, 0x18, 0x22, 0x85, 0x9f, 0x76, 0xb8, 0x36, 0x05, 0x78,
+ 0x4d, 0xde, 0xc6, 0x37, 0x3a, 0x91, 0x6b, 0x71, 0xf0, 0xed, 0x3a, 0xf2,
+ 0xe1, 0x4b, 0x0c, 0x69, 0x97, 0x9a, 0xa7, 0x6d, 0x91, 0x46, 0x50, 0x96,
+ 0x8b, 0x79, 0xc5, 0x66, 0x63, 0x79, 0x56, 0xc2, 0x54, 0xa3, 0x93, 0x7c,
+ 0x7b, 0x2c, 0xec, 0x7d, 0x7f, 0x9e, 0x6c, 0x12, 0x93, 0xed, 0xaf, 0x86,
+ 0x21, 0x39, 0xaa, 0xac, 0x39, 0x5d, 0x8f, 0x0d, 0x54, 0x74, 0xa9, 0xbc,
+ 0xfe, 0xf8, 0xc6, 0xfd, 0x71, 0x13, 0xd5, 0x8e, 0xb1, 0xd0, 0x63, 0x75,
+ 0xe7, 0x87, 0xb2, 0x67, 0x36, 0x35, 0x38, 0xd3, 0xed, 0x7f, 0x5e, 0x17,
+ 0x98, 0x3f, 0xf4, 0xf7, 0x00, 0x35, 0x80, 0xae, 0x82, 0x62, 0xe4, 0xf2,
+ 0x6d, 0x23, 0x93, 0x88, 0x49, 0xd7, 0xa2, 0x5b, 0x0c, 0x5e, 0x7f, 0x3c,
+ 0x36, 0x2b, 0x12, 0x14, 0x9f, 0x6b, 0xb2, 0x12, 0xfb, 0xc1, 0xb7, 0xa9,
+ 0x34, 0xc0, 0x51, 0xac, 0xb5, 0x8a, 0x11, 0x76, 0x72, 0xd8, 0xcc, 0x86,
+ 0x4f, 0xaf, 0xc4, 0xe6, 0x5f, 0xf7, 0x42, 0xc9, 0x5c, 0x95, 0xde, 0xa2,
+ 0x5a, 0x29, 0x37, 0xa6, 0x3f, 0x2a, 0x2f, 0x3c, 0xb0, 0x60, 0x8a, 0x16,
+ 0x7d, 0xda, 0xe0, 0xd8, 0x15, 0xe1, 0x37, 0x7b, 0x18, 0xdc, 0xea, 0x45,
+ 0xad, 0xc7, 0xc3, 0xb4, 0xeb, 0xcb, 0x85, 0x2c, 0x31, 0xa6, 0x5e, 0x6a,
+ 0x9d, 0xb6, 0x45, 0x19, 0x42, 0x5a, 0x2d, 0xe7, 0x15, 0x99, 0x8d, 0xe5,
+ 0x5b, 0x09, 0x52, 0x8e, 0x4d, 0xf1, 0xec, 0xb3, 0xb1, 0xf5, 0xfe, 0x79,
+ 0xb0, 0x4a, 0x4f, 0xb6, 0xbe, 0x18, 0x84, 0xe6, 0xaa, 0xb0, 0xe5, 0x76,
+ 0x3c, 0x35, 0x51, 0xd2, 0xa6, 0xf3, 0xfb, 0xe3, 0x1b, 0xf5, 0xc4, 0x4f,
+ 0x56, 0x3a, 0xc7, 0x41, 0x8d, 0xd7, 0x9e, 0x1e, 0xc9, 0x9c, 0xd8, 0xd4,
+ 0xe3, 0x4f, 0xb5, 0xfd, 0x78, 0x5e, 0x60, 0xff, 0xd3, 0xdc, 0x00, 0xd6,
+ 0x02, 0xba, 0x09, 0x8b, 0x93, 0xc9, 0xb4, 0x8e, 0x4e, 0x21, 0x27, 0x5e,
+ 0x89, 0x6c, 0x31, 0x79, 0xfc, 0xf0, 0xd8, 0xac, 0x48, 0x52, 0x7d, 0xae,
+ 0xc8, 0x4b, 0xef, 0x06, 0xde, 0xa4, 0xd3, 0x01, 0x46, 0xb2, 0xd6, 0x28,
+ 0x45, 0xd9, 0xcb, 0x63, 0x32, 0x19, 0x3e, 0xbf, 0x13, 0x99, 0x7f, 0xdd,
+ 0x0b, 0x25, 0x72, 0x57, 0x7a, 0x89, 0x68, 0xa4, 0xde, 0x98, 0xfc, 0xa8,
+ 0xbc, 0xf2, 0xc1, 0x82, 0x28, 0x59, 0xf7, 0x6b, 0x83, 0x60, 0x57, 0x84,
+ 0xdd, 0xec, 0x63, 0x73, 0xa9, 0x16, 0xb7, 0x1f, 0x0e, 0xd3, 0xaf, 0x2e,
+ 0x14, 0xb0, 0xc6, 0x99, 0x79, 0xaa, 0x76, 0xd9, 0x14, 0x65, 0x09, 0x68,
+ 0xb7, 0x9c, 0x56, 0x66, 0x37, 0x95, 0x6c, 0x25, 0x4a, 0x39, 0x37, 0xc7,
+ 0xb2, 0xce, 0xc7, 0xd7, 0xf9, 0xe6, 0xc1, 0x29, 0x3e, 0xda, 0xf8, 0x62,
+ 0x13, 0x9a, 0xaa, 0xc3, 0x95, 0xd8, 0xf0, 0xd5, 0x47, 0x4a, 0x9b, 0xcf,
+ 0xef, 0x8c, 0x6f, 0xd7, 0x11, 0x3d, 0x58, 0xeb, 0x1d, 0x06, 0x37, 0x5e,
+ 0x78, 0x7b, 0x26, 0x73, 0x63, 0x53, 0x8d, 0x3e, 0xd7, 0xf5, 0xe1, 0x79,
+ 0x83, 0xff, 0x4f, 0x70, 0x03, 0x58, 0x0a, 0xe8, 0x26, 0x2e, 0x4f, 0x26,
+ 0xd2, 0x39, 0x38, 0x84, 0x9d, 0x7a, 0x25, 0xb0, 0xc5, 0xe7, 0xf3, 0xc3,
+ 0x62, 0xb1, 0x21, 0x49, 0xf6, 0xbb, 0x21, 0x2f, 0xbc, 0x1b, 0x7a, 0x93,
+ 0x4c, 0x05, 0x1a, 0xcb, 0x58, 0xa1, 0x17, 0x67, 0x2d, 0x8c, 0xc8, 0x64,
+ 0xfa, 0xfc, 0x4e, 0x65, 0xff, 0x74, 0x2c, 0x95, 0xc9, 0x5d, 0xea, 0x25,
+ 0xa2, 0x93, 0x7a, 0x63, 0xf2, 0xa2, 0xf3, 0xcb, 0x06, 0x08, 0xa1, 0x67,
+ 0xdd, 0xae, 0x0d, 0x81, 0x5e, 0x13, 0x77, 0xb1, 0x8d, 0xce, 0xa4, 0x5a,
+ 0xdc, 0x7c, 0x3b, 0x4e, 0xbc, 0xb8, 0x52, 0xc3, 0x1a, 0x65, 0xe6, 0xa9,
+ 0xdb, 0x64, 0x51, 0x94, 0x25, 0xa2, 0xde, 0x71, 0x59, 0x98, 0xde, 0x55,
+ 0xb0, 0x95, 0x28, 0xe4, 0xdf, 0x1e, 0xcb, 0x3b, 0x1f, 0x5f, 0xe7, 0x9b,
+ 0x04, 0xa4, 0xfb, 0x6b, 0xe1, 0x88, 0x4e, 0x6a, 0xab, 0x0e, 0x57, 0x63,
+ 0xc3, 0x55, 0x1d, 0x2a, 0x6f, 0x3f, 0xbe, 0x31, 0xbf, 0x5c, 0x44, 0xf5,
+ 0x63, 0xac, 0x74, 0x18, 0xdd, 0x79, 0xe1, 0xec, 0x99, 0xcd, 0x8d, 0x4e,
+ 0x34, 0xfb, 0x5f, 0xd7, 0x85, 0xe6, 0x0f, 0xfd, 0x3d, 0xc0, 0x0d, 0x60,
+ 0x2b, 0xa0, 0x98, 0xb9, 0x3c, 0x9b, 0x48, 0xe4, 0xe2, 0x12, 0x75, 0xe8,
+ 0x96, 0xc3, 0x17, 0x9f, 0xcf, 0x0d, 0x8a, 0xc4, 0x85, 0x27, 0xda, 0xec,
+ 0x84, 0xbe, 0xf0, 0x6d, 0xea, 0x4d, 0x30, 0x14, 0x6b, 0x2d, 0x62, 0x84,
+ 0x5d, 0x9c, 0xb6, 0x33, 0x21, 0x93, 0xeb, 0xf1, 0x39, 0x97, 0xfd, 0xd0,
+ 0xb2, 0x57, 0x25, 0x77, 0xa8, 0x96, 0x8a, 0x4d, 0xe9, 0x8f, 0xca, 0x8b,
+ 0xcf, 0x2c, 0x18, 0x22, 0x85, 0x9f, 0x76, 0xb8, 0x36, 0x05, 0x78, 0x4d,
+ 0xde, 0xc6, 0x37, 0x3a, 0x91, 0x6b, 0x71, 0xf0, 0xed, 0x3a, 0xf2, 0xe1,
+ 0x4b, 0x0c, 0x69, 0x97, 0x9a, 0xa7, 0x6d, 0x91, 0x46, 0x50, 0x96, 0x8b,
+ 0x79, 0xc5, 0x66, 0x63, 0x79, 0x56, 0xc2, 0x54, 0xa3, 0x93, 0x7c, 0x7b,
+ 0x2c, 0xec, 0x7d, 0x7f, 0x9e, 0x6c, 0x12, 0x93, 0xed, 0xaf, 0x86, 0x21,
+ 0x39, 0xaa, 0xac, 0x39, 0x5d, 0x8f, 0x0d, 0x54, 0x74, 0xa9, 0xbc, 0xfe,
+ 0xf8, 0xc6, 0xfd, 0x71, 0x13, 0xd5, 0x8e, 0xb1, 0xd0, 0x63, 0x75, 0xe7,
+ 0x87, 0xb2, 0x67, 0x36, 0x35, 0x38, 0xd3, 0xed, 0x7f, 0x5e, 0x17, 0x98,
+ 0x3f, 0xf4, 0xf7, 0x00, 0x35, 0x80, 0xae, 0x82, 0x62, 0xe4, 0xf2, 0x6d,
+ 0x23, 0x93, 0x88, 0x49, 0xd7, 0xa2, 0x5b, 0x0c, 0x5e, 0x7f, 0x3c, 0x36,
+ 0x2b, 0x12, 0x14, 0x9f, 0x6b, 0xb2, 0x12, 0xfb, 0xc1, 0xb7, 0xa9, 0x34,
+ 0xc0, 0x51, 0xac, 0xb5, 0x8a, 0x11, 0x76, 0x72, 0xd8, 0xcc, 0x86, 0x4f,
+ 0xaf, 0xc4, 0xe6, 0x5f, 0xf7, 0x42, 0xc9, 0x5c, 0x95, 0xde, 0xa2, 0x5a,
+ 0x29, 0x37, 0xa6, 0x3f, 0x2a, 0x2f, 0x3c, 0xb0, 0x60, 0x8a, 0x16, 0x7d,
+ 0xda, 0xe0, 0xd8, 0x15, 0xe1, 0x37, 0x7b, 0x18, 0xdc, 0xea, 0x45, 0xad,
+ 0xc7, 0xc3, 0xb4, 0xeb, 0xcb, 0x85, 0x2c, 0x31, 0xa6, 0x5e, 0x6a, 0x9d,
+ 0xb6, 0x45, 0x19, 0x42, 0x5a, 0x2d, 0xe7, 0x15, 0x99, 0x8d, 0xe5, 0x5b,
+ 0x09, 0x52, 0x8e, 0x4d, 0xf1, 0xec, 0xb3, 0xb1, 0xf5, 0xfe, 0x79, 0xb0,
+ 0x4a, 0x4f, 0xb6, 0xbe, 0x18, 0x84, 0xe6, 0xaa, 0xb0, 0xe5, 0x76, 0x3c,
+ 0x35, 0x51, 0xd2, 0xa6, 0xf3, 0xfb, 0xe3, 0x1b, 0xf5, 0xc4, 0x4f, 0x56,
+ 0x3a, 0xc7, 0x41, 0x8d, 0xd7, 0x9e, 0x1e, 0xc9, 0x9c, 0xd8, 0xd4, 0xe3,
+ 0x4f, 0xb5, 0xfd, 0x78, 0x5e, 0x60, 0xff, 0xd3, 0xdc, 0x00, 0xd6, 0x02,
+ 0xba, 0x09, 0x8b, 0x93, 0xc9, 0xb4, 0x8e, 0x4e, 0x21, 0x27, 0x5e, 0x89,
+ 0x6c, 0x31, 0x79, 0xfc, 0xf0, 0xd8, 0xac, 0x48, 0x52, 0x7d, 0xae, 0xc8,
+ 0x4b, 0xef, 0x06, 0xde, 0xa4, 0xd3, 0x01, 0x46, 0xb2, 0xd6, 0x28, 0x45,
+ 0xd9, 0xcb, 0x63, 0x32, 0x19, 0x3e, 0xbf, 0x13, 0x99, 0x7f, 0xdd, 0x0b,
+ 0x25, 0x72, 0x57, 0x7a, 0x89, 0x68, 0xa4, 0xde, 0x98, 0xfc, 0xa8, 0xbc,
+ 0xf2, 0xc1, 0x82, 0x28, 0x59, 0xf7, 0x6b, 0x83, 0x60, 0x57, 0x84, 0xdd,
+ 0xec, 0x63, 0x73, 0xa9, 0x16, 0xb7, 0x1f, 0x0e, 0xd3, 0xaf, 0x2e, 0x14,
+ 0xb0, 0xc6, 0x99, 0x79, 0xaa, 0x76, 0xd9, 0x14, 0x65, 0x09, 0x68, 0xb7,
+ 0x9c, 0x56, 0x66, 0x37, 0x95, 0x6c, 0x25, 0x4a, 0x39, 0x37, 0xc7, 0xb2,
+ 0xce, 0xc7, 0xd7, 0xf9, 0xe6, 0xc1, 0x29, 0x3e, 0xda, 0xf8, 0x62, 0x13,
+ 0x9a, 0xaa, 0xc3, 0x95, 0xd8, 0xf0, 0xd5, 0x47, 0x4a, 0x9b, 0xcf, 0xef,
+ 0x8c, 0x6f, 0xd7, 0x11, 0x3d, 0x58, 0xeb, 0x1d, 0x06, 0x37, 0x5e, 0x78,
+ 0x7b, 0x26, 0x73, 0x63, 0x53, 0x8d, 0x3e, 0xd7, 0xf5, 0xe1, 0x79, 0x83,
+ 0xff, 0x4f, 0x70, 0x03, 0x58, 0x0a, 0xe8, 0x26, 0x2e, 0x4f, 0x26, 0xd2,
+ 0x39, 0x38, 0x84, 0x9d, 0x7a, 0x25, 0xb0, 0xc5, 0xe7, 0xf3, 0xc3, 0x62,
+ 0xb1, 0x21, 0x49, 0xf6, 0xbb, 0x21, 0x2f, 0xbc, 0x1b, 0x7a, 0x93, 0x4c,
+ 0x05, 0x1a, 0xcb, 0x58, 0xa1, 0x17, 0x67, 0x2d, 0x8c, 0xc8, 0x64, 0xfa,
+ 0xfc, 0x4e, 0x65, 0xff, 0x74, 0x2c, 0x95, 0xc9, 0x5d, 0xea, 0x25, 0xa2,
+ 0x93, 0x7a, 0x63, 0xf2, 0xa2, 0xf3, 0xcb, 0x06, 0x08, 0xa1, 0x67, 0xdd,
+ 0xae, 0x0d, 0x81, 0x5e, 0x13, 0x77, 0xb1, 0x8d, 0xce, 0xa4, 0x5a, 0xdc,
+ 0x7c, 0x3b, 0x4e, 0xbc, 0xb8, 0x52, 0xc3, 0x1a, 0x65, 0xe6, 0xa9, 0xdb,
+ 0x64, 0x51, 0x94, 0x25, 0xa2, 0xde, 0x71, 0x59, 0x98, 0xde, 0x55, 0xb0,
+ 0x95, 0x28, 0xe4, 0xdf, 0x1e, 0xcb, 0x3b, 0x1f, 0x5f, 0xe7, 0x9b, 0x04,
+ 0xa4, 0xfb, 0x6b, 0xe1, 0x88, 0x4e, 0x6a, 0xab, 0x0e, 0x57, 0x63, 0xc3,
+ 0x55, 0x1d, 0x2a, 0x6f, 0x3f, 0xbe, 0x31, 0xbf, 0x5c, 0x44, 0xf5, 0x63,
+ 0xac, 0x74, 0x18, 0xdd, 0x79, 0xe1, 0xec, 0x99, 0xcd, 0x8d, 0x4e, 0x34,
+ 0xfb, 0x5f, 0xd7, 0x85, 0xe6, 0x0f, 0xfd, 0x3d, 0xc0, 0x0d, 0x60, 0x2b,
+ 0xa0, 0x98, 0xb9, 0x3c, 0x9b, 0x48, 0xe4, 0xe2, 0x12, 0x75, 0xe8, 0x96,
+ 0xc3, 0x17, 0x9f, 0xcf, 0x0d, 0x8a, 0xc4, 0x85, 0x27, 0xda, 0xec, 0x84,
+ 0xbe, 0xf0, 0x6d, 0xea, 0x4d, 0x30, 0x14, 0x6b, 0x2d, 0x62, 0x84, 0x5d,
+ 0x9c, 0xb6, 0x33, 0x21, 0x93, 0xeb, 0xf1, 0x39, 0x97, 0xfd, 0xd0, 0xb2,
+ 0x57, 0x25, 0x77, 0xa8, 0x96, 0x8a, 0x4d, 0xe9, 0x8f, 0xca, 0x8b, 0xcf,
+ 0x2c, 0x18, 0x22, 0x85, 0x9f, 0x76, 0xb8, 0x36, 0x05, 0x78, 0x4d, 0xde,
+ 0xc6, 0x37, 0x3a, 0x91, 0x6b, 0x71, 0xf0, 0xed, 0x3a, 0xf2, 0xe1, 0x4b,
+ 0x0c, 0x69, 0x97, 0x9a, 0xa7, 0x6d, 0x91, 0x46, 0x50, 0x96, 0x8b, 0x79,
+ 0xc5, 0x66, 0x63, 0x79, 0x56, 0xc2, 0x54, 0xa3, 0x93, 0x7c, 0x7b, 0x2c,
+ 0xec, 0x7d, 0x7f, 0x9e, 0x6c, 0x12, 0x93, 0xed, 0xaf, 0x86, 0x21, 0x39,
+ 0xaa, 0xac, 0x39, 0x5d, 0x8f, 0x0d, 0x54, 0x74, 0xa9, 0xbc, 0xfe, 0xf8,
+ 0xc6, 0xfd, 0x71, 0x13, 0xd5, 0x8e, 0xb1, 0xd0, 0x63, 0x75, 0xe7, 0x87,
+ 0xb2, 0x67, 0x36, 0x35, 0x38, 0xd3, 0xed, 0x7f, 0x5e, 0x17, 0x98, 0x3f,
+ 0xf4, 0xf7, 0x00, 0x35, 0x80, 0xae, 0x82, 0x62, 0xe4, 0xf2, 0x6d, 0x23,
+ 0x93, 0x88, 0x49, 0xd7, 0xa2, 0x5b, 0x0c, 0x5e, 0x7f, 0x3c, 0x36, 0x2b,
+ 0x12, 0x14, 0x9f, 0x6b, 0xb2, 0x12, 0xfb, 0xc1, 0xb7, 0xa9, 0x34, 0xc0,
+ 0x51, 0xac, 0xb5, 0x8a, 0x11, 0x76, 0x72, 0xd8, 0xcc, 0x86, 0x4f, 0xaf,
+ 0xc4, 0xe6, 0x5f, 0xf7, 0x42, 0xc9, 0x5c, 0x95, 0xde, 0xa2, 0x70,
+};
+static_assert(sizeof(kBytesTestReadSymbol14) == kNumBytesTestReadSymbol14, "");
+
+// The kBytesTestReadSymbol16[] array was encoded by using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][17] = {
+// // pmf: 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16,
+// // 1/16, 1/16, 1/16, 1/16, 1/16
+// { 32768 - 2048, 32768 - 4096, 32768 - 6144, 32768 - 8192, 32768 - 10240,
+// 32768 - 12288, 32768 - 14336, 32768 - 16384, 32768 - 18432,
+// 32768 - 20480, 32768 - 22528, 32768 - 24576, 32768 - 26624,
+// 32768 - 28672, 32768 - 30720, 0, 0 },
+// // pmf: 3/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32,
+// // 2/32, 2/32, 2/32, 2/32, 1/32
+// { 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216, 32768 - 11264,
+// 32768 - 13312, 32768 - 15360, 32768 - 17408, 32768 - 19456,
+// 32768 - 21504, 32768 - 23552, 32768 - 25600, 32768 - 27648,
+// 32768 - 29696, 32768 - 31744, 0, 0 },
+// // pmf: 1/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32,
+// // 2/32, 2/32, 2/32, 2/32, 3/32
+// { 32768 - 1024, 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216,
+// 32768 - 11264, 32768 - 13312, 32768 - 15360, 32768 - 17408,
+// 32768 - 19456, 32768 - 21504, 32768 - 23552, 32768 - 25600,
+// 32768 - 27648, 32768 - 29696, 0, 0 },
+// // pmf: 1/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 3/32, 3/32, 2/32, 2/32,
+// // 2/32, 2/32, 2/32, 2/32, 1/32
+// { 32768 - 1024, 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216,
+// 32768 - 11264, 32768 - 13312, 32768 - 16384, 32768 - 19456,
+// 32768 - 21504, 32768 - 23552, 32768 - 25600, 32768 - 27648,
+// 32768 - 29696, 32768 - 31744, 0, 0 },
+// };
+// constexpr int kSymbols[32][4] = { { 0, 8, 15, 7 }, //
+// { 1, 9, 14, 6 }, //
+// { 2, 10, 13, 5 }, //
+// { 3, 11, 12, 4 }, //
+// { 4, 12, 11, 3 }, //
+// { 5, 13, 10, 2 }, //
+// { 6, 14, 9, 1 }, //
+// { 7, 15, 8, 0 }, //
+// { 8, 0, 7, 15 }, //
+// { 9, 1, 6, 14 }, //
+// { 10, 2, 5, 13 }, //
+// { 11, 3, 4, 12 }, //
+// { 12, 4, 3, 11 }, //
+// { 13, 5, 2, 10 }, //
+// { 14, 6, 1, 9 }, //
+// { 15, 7, 0, 8 }, //
+// { 0, 0, 15, 13 }, //
+// { 2, 1, 14, 11 }, //
+// { 4, 3, 12, 9 }, //
+// { 6, 5, 10, 7 }, //
+// { 8, 7, 8, 5 }, //
+// { 10, 9, 6, 3 }, //
+// { 12, 11, 4, 1 }, //
+// { 14, 13, 2, 14 }, //
+// { 1, 0, 15, 12 }, //
+// { 3, 2, 13, 10 }, //
+// { 5, 4, 11, 8 }, //
+// { 7, 6, 9, 6 }, //
+// { 9, 8, 7, 4 }, //
+// { 11, 10, 5, 2 }, //
+// { 13, 12, 3, 8 }, //
+// { 15, 14, 1, 7 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 48; ++i) {
+// for (int j = 0; j < 32; ++j) {
+// for (int k = 0; k < 4; ++k) {
+// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 16);
+// }
+// }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+// if (count++ % 12 == 0) {
+// printf("\n ");
+// } else {
+// printf(" ");
+// }
+// printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
+constexpr size_t kNumBytesTestReadSymbol16 = 3120;
+constexpr uint8_t kBytesTestReadSymbol16[] = {
+ 0x09, 0x2c, 0xb8, 0x5a, 0xe4, 0xe6, 0xc6, 0x1f, 0x3e, 0xa7, 0x50, 0xbf,
+ 0x19, 0x26, 0xbf, 0x20, 0xc3, 0xa2, 0x08, 0xdf, 0x44, 0xd9, 0x4d, 0x8c,
+ 0xf7, 0xbf, 0x6b, 0x6d, 0x22, 0x97, 0x8e, 0xd7, 0x93, 0xad, 0x33, 0xe3,
+ 0x7f, 0x5b, 0x71, 0x03, 0x6b, 0x4e, 0xbf, 0xf5, 0x38, 0xbe, 0xba, 0x6c,
+ 0x0d, 0x28, 0xca, 0x74, 0x2d, 0x1d, 0x3f, 0x91, 0xad, 0x7e, 0x98, 0x5c,
+ 0xa7, 0x39, 0x5e, 0x7c, 0x43, 0x2b, 0x88, 0xb2, 0x81, 0x91, 0xad, 0x62,
+ 0x14, 0xc6, 0x0a, 0x81, 0x15, 0x1f, 0x4e, 0xd5, 0xc1, 0x5c, 0x43, 0x35,
+ 0xc3, 0xe6, 0x3d, 0xaa, 0xc3, 0xb5, 0x95, 0x01, 0xbd, 0x2d, 0x21, 0x04,
+ 0x14, 0x79, 0x7a, 0x02, 0x7e, 0xb8, 0x09, 0x20, 0x06, 0x82, 0xc8, 0x6f,
+ 0x29, 0x2c, 0xb2, 0x9b, 0xe2, 0x8d, 0xf5, 0x56, 0xf5, 0x64, 0xf4, 0xd7,
+ 0xfe, 0x24, 0x29, 0xb6, 0x35, 0x16, 0x08, 0x26, 0xc0, 0xf0, 0xfd, 0x33,
+ 0x04, 0x6f, 0x70, 0x85, 0x3a, 0xac, 0x8f, 0xab, 0x48, 0xce, 0x04, 0xc1,
+ 0x0a, 0x4c, 0xb6, 0xaa, 0x83, 0x39, 0xc1, 0xf6, 0x00, 0xb8, 0x56, 0x4e,
+ 0xa2, 0xd1, 0x19, 0x70, 0x6a, 0x2b, 0x86, 0xef, 0xbd, 0x11, 0x27, 0x54,
+ 0x52, 0x01, 0xa2, 0x3f, 0x53, 0x0e, 0x5b, 0x23, 0x3c, 0x90, 0x82, 0xaf,
+ 0x9d, 0x79, 0xb5, 0x5e, 0x7e, 0x2e, 0x6e, 0xad, 0x3d, 0xe9, 0x3a, 0xff,
+ 0xd7, 0x59, 0x40, 0xa3, 0x56, 0xa9, 0x5e, 0x52, 0xda, 0x04, 0x74, 0x09,
+ 0x47, 0x7c, 0x6c, 0x4b, 0xad, 0x00, 0x8b, 0xbc, 0x33, 0x16, 0x49, 0xf6,
+ 0xa5, 0x11, 0x8d, 0xb4, 0xbc, 0x28, 0xea, 0x1b, 0x34, 0x1e, 0xb7, 0x1e,
+ 0xbf, 0x50, 0xe3, 0x60, 0xad, 0x41, 0xe0, 0x19, 0xfa, 0xa4, 0x23, 0x98,
+ 0x48, 0x23, 0xad, 0xfa, 0xdb, 0x3c, 0x0a, 0x15, 0xeb, 0xf5, 0xf1, 0x43,
+ 0xf2, 0xfd, 0x42, 0xf2, 0xd0, 0x3f, 0xa6, 0x3b, 0xc8, 0x81, 0x52, 0xba,
+ 0xcf, 0x2d, 0xff, 0x2c, 0x24, 0x13, 0x62, 0x78, 0x01, 0xd8, 0xcb, 0xfc,
+ 0xda, 0x70, 0x58, 0xad, 0xf1, 0xe6, 0x30, 0x47, 0x39, 0xc6, 0xf0, 0xbc,
+ 0xe4, 0x89, 0x49, 0x46, 0x79, 0xde, 0xac, 0xde, 0xbd, 0x97, 0x18, 0x8f,
+ 0x17, 0x07, 0xc1, 0xaf, 0xf8, 0xc1, 0x45, 0x95, 0x50, 0x36, 0x4d, 0x16,
+ 0x35, 0x92, 0x2b, 0x5a, 0x71, 0x81, 0x59, 0xe5, 0x7f, 0xba, 0x10, 0xc9,
+ 0x49, 0xd4, 0xeb, 0x64, 0x08, 0x54, 0x8b, 0xfa, 0xb3, 0xc8, 0x3a, 0xd7,
+ 0xa6, 0xa9, 0xf2, 0xae, 0x04, 0xf8, 0x55, 0x5c, 0xff, 0x2d, 0x17, 0x53,
+ 0x37, 0xc5, 0x36, 0xd8, 0x42, 0xd7, 0x47, 0xd8, 0x00, 0x99, 0x9c, 0x5d,
+ 0x9f, 0x34, 0xc2, 0x09, 0x6b, 0x1a, 0xf3, 0x2f, 0xb0, 0xf8, 0x49, 0x54,
+ 0x9d, 0x4b, 0xb8, 0xcf, 0xc5, 0x3b, 0x7f, 0x49, 0x9b, 0x40, 0xa9, 0xd3,
+ 0x96, 0xe1, 0x6b, 0x87, 0x2d, 0x50, 0x76, 0x15, 0xd9, 0x9f, 0x87, 0x4f,
+ 0x13, 0x26, 0xf2, 0xf8, 0xae, 0xd4, 0x63, 0x02, 0x0c, 0xcb, 0xe5, 0x63,
+ 0x1c, 0x73, 0xdf, 0x57, 0x55, 0x16, 0x57, 0x3b, 0xfb, 0x9a, 0x06, 0x70,
+ 0xfc, 0x9f, 0x29, 0x16, 0xec, 0x63, 0x34, 0x6f, 0x40, 0x1f, 0x54, 0x2a,
+ 0xe7, 0x4a, 0x6f, 0xde, 0x86, 0xeb, 0x8c, 0x91, 0x3e, 0xfc, 0x6a, 0x48,
+ 0xd1, 0x51, 0x33, 0xd7, 0xe1, 0x9d, 0xf8, 0x71, 0x21, 0x7b, 0x02, 0x38,
+ 0x6a, 0xef, 0x30, 0x70, 0x38, 0x01, 0xc3, 0xef, 0x5d, 0x4f, 0xd3, 0x37,
+ 0x2d, 0xe0, 0x4f, 0x4b, 0x72, 0xbc, 0xde, 0x9f, 0x32, 0x97, 0xe2, 0x55,
+ 0x5e, 0x59, 0x5d, 0xa2, 0x9f, 0x5a, 0x04, 0x7c, 0x13, 0xe1, 0x35, 0x62,
+ 0x4a, 0x10, 0x24, 0x55, 0x63, 0xb8, 0x8f, 0x66, 0xbc, 0x04, 0x08, 0x4e,
+ 0xcc, 0xdc, 0x1f, 0x88, 0xc5, 0xcf, 0x8a, 0x7e, 0x24, 0x3e, 0x6f, 0x58,
+ 0xcb, 0x44, 0x3c, 0x18, 0x64, 0xd9, 0x84, 0xa8, 0x1c, 0x0b, 0x20, 0xf4,
+ 0x8b, 0x8b, 0x4b, 0xf8, 0x39, 0x8b, 0x01, 0x3a, 0x0b, 0x27, 0x67, 0xf8,
+ 0x0f, 0xbd, 0xb3, 0x32, 0xce, 0xef, 0xbc, 0x8c, 0xa3, 0x31, 0xee, 0x0b,
+ 0xdb, 0xc7, 0xc3, 0x43, 0x80, 0xe4, 0x7c, 0x9b, 0x89, 0xa4, 0x6b, 0x23,
+ 0x2f, 0xa8, 0x28, 0xe0, 0x55, 0x30, 0x6e, 0xe7, 0xc9, 0x50, 0x1d, 0xbf,
+ 0x67, 0xc8, 0x74, 0x58, 0x0f, 0xdb, 0xa6, 0x1f, 0xa6, 0xfd, 0xf0, 0x75,
+ 0xea, 0x62, 0xd5, 0x44, 0xa2, 0x7e, 0xed, 0x63, 0xba, 0x7c, 0x5d, 0xb7,
+ 0x16, 0x84, 0x30, 0x5d, 0xc2, 0xd3, 0x39, 0x61, 0x60, 0x0a, 0xb9, 0x34,
+ 0x5e, 0x54, 0xf4, 0x34, 0x77, 0x22, 0x05, 0x41, 0x6b, 0x6a, 0x13, 0xc3,
+ 0x10, 0x03, 0x8a, 0x78, 0xd2, 0x81, 0xac, 0x49, 0x31, 0xc8, 0xee, 0x15,
+ 0xc3, 0x42, 0x3b, 0x00, 0xf6, 0x05, 0x92, 0x82, 0x6e, 0x73, 0xb4, 0xfa,
+ 0xab, 0xe0, 0x2e, 0xe9, 0x5d, 0x89, 0x43, 0x0c, 0x4d, 0x88, 0x0c, 0xf1,
+ 0xa4, 0x19, 0x59, 0xa0, 0x69, 0x0c, 0xfc, 0xf9, 0x9a, 0xbc, 0x3b, 0x2e,
+ 0x3b, 0x29, 0xf8, 0xd7, 0x79, 0x11, 0xb2, 0x66, 0x26, 0x57, 0x34, 0x06,
+ 0xb8, 0x36, 0x41, 0xca, 0x01, 0x10, 0xca, 0x06, 0xee, 0xb6, 0xf7, 0x1d,
+ 0x0d, 0x88, 0xab, 0x07, 0xbe, 0x06, 0x8c, 0x1c, 0xa2, 0x76, 0x5e, 0xdb,
+ 0x60, 0xa4, 0x43, 0x17, 0x31, 0xc3, 0x4b, 0x0a, 0x01, 0x80, 0xa7, 0xf6,
+ 0xe6, 0x78, 0x64, 0x85, 0xb0, 0x8a, 0x28, 0x34, 0x82, 0x98, 0x29, 0x3f,
+ 0xde, 0x07, 0x9a, 0x80, 0xcf, 0xe3, 0x6f, 0x23, 0x57, 0x79, 0x11, 0xb2,
+ 0x61, 0x6d, 0x98, 0x26, 0xeb, 0x3b, 0xbf, 0xaa, 0x98, 0x62, 0xbb, 0xfd,
+ 0x21, 0x76, 0xe5, 0xc5, 0xe0, 0x09, 0x21, 0x65, 0x72, 0x94, 0xd3, 0x8a,
+ 0xcd, 0xfb, 0xec, 0x6e, 0x57, 0xd4, 0x2a, 0x92, 0xd1, 0xe9, 0x16, 0x46,
+ 0xa2, 0x38, 0xae, 0x4b, 0x7e, 0xa7, 0x0c, 0x26, 0x9d, 0x96, 0xd7, 0x49,
+ 0xa7, 0x02, 0x2b, 0x22, 0x9a, 0x39, 0x38, 0x11, 0xb8, 0xb3, 0xd5, 0x09,
+ 0xf9, 0x70, 0xb4, 0x1c, 0x4e, 0xe3, 0xba, 0xa0, 0x78, 0x76, 0x6d, 0xc4,
+ 0xab, 0x96, 0x3e, 0x98, 0x04, 0x4e, 0x50, 0x20, 0xd9, 0xfa, 0xea, 0xe2,
+ 0x99, 0x50, 0x84, 0x20, 0x18, 0x69, 0xbb, 0x6e, 0x41, 0x9d, 0x18, 0x71,
+ 0x15, 0x19, 0xd2, 0xf2, 0xa5, 0x69, 0x54, 0x8e, 0x60, 0x75, 0xd4, 0xe7,
+ 0xdb, 0xe1, 0x43, 0xfd, 0x2e, 0x21, 0x4f, 0xff, 0x98, 0x8b, 0x08, 0x74,
+ 0xca, 0x29, 0x7e, 0x3f, 0x2f, 0x6a, 0xf9, 0xe6, 0x49, 0x1d, 0xc6, 0x0b,
+ 0x76, 0xc9, 0x22, 0xc3, 0x4f, 0xaf, 0xa8, 0xf9, 0xd6, 0x9c, 0x9a, 0x64,
+ 0xec, 0xb3, 0x2c, 0x0f, 0x3e, 0x93, 0xc4, 0xb6, 0xd7, 0x36, 0x28, 0x04,
+ 0xe5, 0x81, 0x48, 0x14, 0x9f, 0x4e, 0xc5, 0x9b, 0xd7, 0xc0, 0x0e, 0x35,
+ 0xab, 0x49, 0xd3, 0x84, 0x9f, 0x5c, 0x93, 0x94, 0xa6, 0xd2, 0xb5, 0x83,
+ 0x9d, 0x38, 0x0f, 0x85, 0x04, 0xa3, 0xb7, 0x23, 0x20, 0x93, 0x85, 0x48,
+ 0x14, 0x0c, 0x22, 0x80, 0x92, 0x6c, 0xca, 0x3c, 0xc7, 0xfc, 0xa9, 0x88,
+ 0x62, 0xbc, 0x2a, 0x91, 0x08, 0x5b, 0xb4, 0x60, 0xd1, 0x0f, 0x3c, 0x33,
+ 0xc6, 0xe1, 0xf7, 0xca, 0xf7, 0xf9, 0xa1, 0x9b, 0xfa, 0xf7, 0x34, 0xe0,
+ 0x54, 0xac, 0x53, 0x42, 0x30, 0x76, 0xc8, 0xc2, 0xcd, 0x61, 0x49, 0x87,
+ 0x9c, 0x47, 0xf5, 0x98, 0xb5, 0x41, 0xf0, 0xad, 0xdb, 0x37, 0x06, 0xb8,
+ 0x54, 0xa5, 0x26, 0x11, 0x4b, 0x18, 0xbb, 0xa4, 0xfb, 0x24, 0xd3, 0x14,
+ 0x31, 0xfb, 0x56, 0x18, 0xd8, 0xc2, 0xd0, 0xd2, 0xab, 0xde, 0xdf, 0xa9,
+ 0xdf, 0x9e, 0xa6, 0x56, 0x0d, 0x9f, 0xe4, 0x19, 0x15, 0x58, 0x18, 0xc6,
+ 0x5e, 0x47, 0x05, 0x3a, 0x0e, 0x73, 0x68, 0x81, 0x39, 0x8c, 0x51, 0x1d,
+ 0x04, 0x4e, 0x18, 0x54, 0xa5, 0x3e, 0x13, 0x4a, 0x15, 0xc2, 0x43, 0x90,
+ 0xc2, 0x71, 0x8d, 0x53, 0x1b, 0xab, 0xe9, 0xbc, 0x69, 0x3e, 0x11, 0x46,
+ 0x9d, 0xa4, 0xd3, 0x15, 0x80, 0xec, 0xe8, 0x31, 0x4f, 0x5a, 0x2a, 0x15,
+ 0x3e, 0x7e, 0x7a, 0x44, 0x0e, 0x4a, 0xac, 0x9b, 0x46, 0x2f, 0x86, 0xf9,
+ 0xea, 0x59, 0x4f, 0x15, 0xa0, 0x4b, 0xd1, 0xaa, 0xd8, 0x3a, 0x83, 0xb6,
+ 0x25, 0x82, 0xb0, 0x44, 0x4a, 0x98, 0xbd, 0x10, 0xa2, 0xb0, 0x95, 0x02,
+ 0xfa, 0x1f, 0xd3, 0x54, 0x1c, 0x0a, 0xb1, 0x31, 0x28, 0xec, 0x4c, 0xd2,
+ 0x0c, 0xb9, 0xb0, 0xf4, 0x7a, 0x89, 0x63, 0x3c, 0x5f, 0xcf, 0x3c, 0xe8,
+ 0xba, 0x21, 0x66, 0x20, 0x01, 0xcb, 0x1b, 0xc6, 0xf9, 0x54, 0x0f, 0xda,
+ 0x4a, 0xcc, 0x81, 0x7b, 0x41, 0x81, 0xc0, 0x1f, 0xea, 0x9a, 0x9b, 0x96,
+ 0x0d, 0x47, 0xdd, 0x16, 0x52, 0x5c, 0xaf, 0xae, 0x82, 0x3d, 0x18, 0x60,
+ 0xfa, 0x34, 0xc2, 0x57, 0x2d, 0xc4, 0x2b, 0x2e, 0x41, 0xfe, 0xe7, 0x95,
+ 0xcd, 0x1f, 0xbe, 0x88, 0x31, 0xc1, 0x07, 0x2c, 0xd3, 0xb1, 0xbb, 0xeb,
+ 0x1d, 0xa3, 0x03, 0x1e, 0x70, 0xcc, 0x84, 0xe0, 0x65, 0x41, 0x0f, 0xf1,
+ 0x7c, 0x95, 0x4b, 0x41, 0x43, 0x62, 0xad, 0x5d, 0xff, 0x4f, 0x92, 0xc8,
+ 0xaa, 0x21, 0x23, 0xba, 0xa9, 0x90, 0xb5, 0xae, 0xc0, 0x1f, 0xae, 0x43,
+ 0xf1, 0x79, 0x14, 0x30, 0x16, 0x1d, 0x2a, 0x6c, 0xd1, 0xd8, 0xb3, 0x38,
+ 0x25, 0xd1, 0x66, 0xa5, 0x89, 0xc0, 0x8d, 0xc5, 0xa0, 0x6a, 0x7c, 0x64,
+ 0xf8, 0x45, 0x1a, 0x76, 0x93, 0x4c, 0x56, 0x03, 0xb3, 0xa0, 0xc5, 0x40,
+ 0xbc, 0x84, 0x98, 0x8d, 0xa4, 0xfe, 0x0b, 0x8c, 0x47, 0xa2, 0x88, 0x85,
+ 0x2a, 0x89, 0xad, 0xd3, 0x16, 0x5b, 0x20, 0x02, 0x70, 0xbf, 0x72, 0x29,
+ 0x0c, 0x0a, 0x9c, 0xac, 0x9c, 0x4d, 0xfa, 0x02, 0x5e, 0xe9, 0xe3, 0x52,
+ 0x84, 0x54, 0x1f, 0xb7, 0xea, 0xb1, 0xc4, 0x2f, 0x69, 0xd1, 0x33, 0xc6,
+ 0xb3, 0xee, 0xb0, 0x35, 0x1f, 0x19, 0x68, 0x2d, 0xef, 0xc1, 0xd3, 0x1c,
+ 0xa8, 0x84, 0x54, 0x3c, 0x21, 0xed, 0x78, 0x35, 0x3f, 0x82, 0xb2, 0xa8,
+ 0xe4, 0x25, 0x71, 0xfc, 0x1e, 0x1d, 0x36, 0xf4, 0xf4, 0x0f, 0x6f, 0x5b,
+ 0xd9, 0x21, 0x13, 0x3a, 0x3d, 0x17, 0x45, 0x31, 0x78, 0x97, 0x99, 0x15,
+ 0x87, 0xa9, 0xa6, 0x36, 0xf0, 0x20, 0xfa, 0xd5, 0x10, 0x01, 0x91, 0xa0,
+ 0x4f, 0x28, 0x6a, 0x13, 0x04, 0xff, 0x97, 0x96, 0xf1, 0xfc, 0x1c, 0xc8,
+ 0xcd, 0xe4, 0xbd, 0xe5, 0x40, 0x9a, 0x37, 0xc2, 0x01, 0x11, 0x2a, 0xc0,
+ 0x0e, 0x58, 0x69, 0x29, 0xd0, 0x72, 0x26, 0x7c, 0x23, 0xec, 0x58, 0xfe,
+ 0xbd, 0x15, 0x97, 0xe8, 0x29, 0x9f, 0x79, 0xb1, 0xfa, 0xac, 0x59, 0xe0,
+ 0x78, 0x1c, 0xb4, 0x29, 0xee, 0x00, 0x39, 0x11, 0x0a, 0x2a, 0xb9, 0x98,
+ 0x4e, 0xbf, 0x75, 0x9e, 0xe8, 0xbb, 0x4b, 0xe0, 0x6b, 0xab, 0x5b, 0x2f,
+ 0x2d, 0xe3, 0xf8, 0x39, 0x91, 0x9b, 0xc9, 0x7b, 0xca, 0x81, 0x34, 0x6f,
+ 0x84, 0x02, 0x22, 0x55, 0x80, 0x1c, 0xb0, 0xd2, 0x53, 0xa0, 0xe4, 0x4c,
+ 0xf8, 0x47, 0xd8, 0xb1, 0xfd, 0x7a, 0x2b, 0x2f, 0xd0, 0x53, 0x3e, 0xf3,
+ 0x63, 0xf5, 0x58, 0xb3, 0xc0, 0xf0, 0x39, 0x00, 0x08, 0x97, 0x4b, 0xe2,
+ 0x46, 0x04, 0xa2, 0x39, 0x9c, 0xf2, 0x57, 0x17, 0x4a, 0xdd, 0x9f, 0x5e,
+ 0xb1, 0x8b, 0x6b, 0x5d, 0x6e, 0x3e, 0x85, 0x34, 0x04, 0x96, 0x56, 0xe7,
+ 0x4f, 0x6f, 0xd0, 0x31, 0xe7, 0x0c, 0xc8, 0x88, 0xdd, 0x5b, 0x14, 0x00,
+ 0x60, 0x2a, 0x06, 0x18, 0xcd, 0x7f, 0xc9, 0xee, 0xd2, 0xd0, 0x8c, 0xc0,
+ 0xed, 0x8f, 0x4a, 0x3e, 0x83, 0x52, 0x2e, 0x4a, 0xe9, 0xfa, 0x1f, 0x1a,
+ 0xd5, 0xc0, 0x59, 0x4c, 0x8a, 0x2a, 0xab, 0x40, 0x2f, 0x84, 0xd2, 0x85,
+ 0x70, 0x90, 0x96, 0xf3, 0x84, 0x6f, 0x1e, 0x81, 0x8c, 0x80, 0x03, 0x03,
+ 0x2d, 0x36, 0x2e, 0x60, 0x79, 0x13, 0x63, 0x7f, 0xe7, 0xe3, 0x4a, 0x96,
+ 0x08, 0xd8, 0x35, 0x15, 0x46, 0x8a, 0xe0, 0xb8, 0xc4, 0x7a, 0x28, 0x88,
+ 0x52, 0xa8, 0x9a, 0xdd, 0x31, 0x65, 0xb2, 0x00, 0x24, 0xd9, 0xf4, 0x07,
+ 0xea, 0xab, 0x7c, 0xe8, 0xa2, 0xea, 0xa7, 0x23, 0xd1, 0x93, 0x9e, 0xe7,
+ 0x48, 0x34, 0x89, 0xf5, 0xb4, 0x45, 0x5e, 0xfa, 0xa6, 0xee, 0x32, 0x75,
+ 0x8c, 0x56, 0x08, 0xcc, 0xeb, 0x5b, 0x05, 0xc2, 0x1d, 0x62, 0xa8, 0x5d,
+ 0xaa, 0x50, 0xc2, 0x85, 0x85, 0x25, 0xb3, 0x5f, 0x60, 0xe7, 0x90, 0x1b,
+ 0xa8, 0xb7, 0xf6, 0x83, 0x11, 0x07, 0x1f, 0xfc, 0xce, 0x58, 0x22, 0x8a,
+ 0x3d, 0xa9, 0x8c, 0x18, 0x66, 0xa8, 0x32, 0x78, 0xa0, 0x16, 0x8a, 0xa2,
+ 0x5d, 0x2f, 0x89, 0x18, 0x12, 0x88, 0xe6, 0x73, 0xc9, 0x5c, 0x5d, 0x2b,
+ 0x76, 0x7d, 0x7a, 0xc6, 0x2d, 0xad, 0x75, 0xb8, 0xfa, 0x14, 0xd0, 0x12,
+ 0x59, 0x5b, 0x9d, 0x3d, 0xbf, 0x40, 0xc7, 0x9c, 0x33, 0x22, 0x23, 0x75,
+ 0x6c, 0x50, 0x01, 0x80, 0xa8, 0x18, 0x63, 0x35, 0xff, 0x27, 0xbb, 0x4b,
+ 0x42, 0x33, 0x03, 0xb6, 0x3d, 0x28, 0xfa, 0x0d, 0x48, 0xb9, 0x2b, 0xa7,
+ 0xe8, 0x7c, 0x6b, 0x57, 0x01, 0x65, 0x32, 0x28, 0xaa, 0xad, 0x00, 0xbe,
+ 0x13, 0x4a, 0x15, 0xc2, 0x42, 0x5b, 0xce, 0x11, 0xbc, 0x7a, 0x06, 0x32,
+ 0x00, 0x0c, 0x0c, 0xb4, 0xd8, 0xb9, 0x81, 0xe4, 0x4d, 0x8d, 0xff, 0x9f,
+ 0x8d, 0x2a, 0x58, 0x23, 0x60, 0xd4, 0x55, 0x1a, 0x2b, 0x82, 0xe3, 0x11,
+ 0xe8, 0xa2, 0x21, 0x4a, 0xa2, 0x6b, 0x74, 0xc5, 0x96, 0xc8, 0x00, 0x93,
+ 0x67, 0xd0, 0x1f, 0xaa, 0xad, 0xf3, 0xa2, 0x8b, 0xaa, 0x9c, 0x8f, 0x46,
+ 0x4e, 0x7b, 0x9d, 0x20, 0xd2, 0x27, 0xd6, 0xd1, 0x15, 0x7b, 0xea, 0x9b,
+ 0xb8, 0xc9, 0xd6, 0x31, 0x58, 0x23, 0x33, 0xad, 0x6c, 0x17, 0x08, 0x75,
+ 0x8a, 0xa1, 0x76, 0xa9, 0x43, 0x0a, 0x16, 0x14, 0x96, 0xcd, 0x7d, 0x83,
+ 0x9e, 0x40, 0x6e, 0xa2, 0xdf, 0xda, 0x0c, 0x44, 0x1c, 0x7f, 0xf3, 0x39,
+ 0x60, 0x8a, 0x28, 0xf6, 0xa6, 0x30, 0x61, 0x9a, 0xa0, 0xc9, 0xe2, 0x80,
+ 0x5a, 0x2a, 0x89, 0x74, 0xbe, 0x24, 0x60, 0x4a, 0x23, 0x99, 0xcf, 0x25,
+ 0x71, 0x74, 0xad, 0xd9, 0xf5, 0xeb, 0x18, 0xb6, 0xb5, 0xd6, 0xe3, 0xe8,
+ 0x53, 0x40, 0x49, 0x65, 0x6e, 0x74, 0xf6, 0xfd, 0x03, 0x1e, 0x70, 0xcc,
+ 0x88, 0x8d, 0xd5, 0xb1, 0x40, 0x06, 0x02, 0xa0, 0x61, 0x8c, 0xd7, 0xfc,
+ 0x9e, 0xed, 0x2d, 0x08, 0xcc, 0x0e, 0xd8, 0xf4, 0xa3, 0xe9, 0x41, 0x30,
+ 0x05, 0xc8, 0xbd, 0x3c, 0xa4, 0xb7, 0x09, 0x6f, 0x9c, 0xc8, 0xa2, 0xaa,
+ 0xb4, 0x02, 0xf8, 0x4d, 0x28, 0x57, 0x09, 0x09, 0x6f, 0x38, 0x46, 0xf1,
+ 0xe8, 0x18, 0xc8, 0x00, 0x30, 0x32, 0xd3, 0x62, 0xe6, 0x07, 0x91, 0x36,
+ 0x37, 0xfe, 0x7e, 0x34, 0xa9, 0x60, 0x8d, 0x83, 0x51, 0x54, 0x68, 0xae,
+ 0x0b, 0x8c, 0x47, 0xa2, 0x88, 0x85, 0x2a, 0x89, 0xad, 0xd3, 0x16, 0x5b,
+ 0x20, 0x02, 0x4f, 0xc0, 0x04, 0x8e, 0x38, 0xde, 0xd8, 0x95, 0xfc, 0x97,
+ 0xd9, 0xd2, 0x15, 0xdb, 0x1a, 0xcc, 0x69, 0x02, 0xad, 0x4a, 0x5a, 0x70,
+ 0x8b, 0xbf, 0xfc, 0x35, 0x6d, 0x3a, 0x0f, 0xc9, 0xea, 0x78, 0x1a, 0xd1,
+ 0xcb, 0xb7, 0xaa, 0xb8, 0xf2, 0x44, 0xdf, 0xb3, 0xfe, 0x24, 0x83, 0xb9,
+ 0x53, 0x94, 0x7e, 0xa5, 0xc5, 0x3f, 0xa2, 0x31, 0x3d, 0xdc, 0x0b, 0xb1,
+ 0x24, 0x2f, 0x99, 0x4a, 0xd4, 0x0e, 0x6b, 0x3a, 0x34, 0x31, 0xc5, 0x87,
+ 0x68, 0xbd, 0x61, 0xbd, 0xe2, 0xa0, 0xdb, 0x9a, 0x33, 0xfd, 0xc5, 0x10,
+ 0x3f, 0xfb, 0xeb, 0xbd, 0x29, 0x03, 0x85, 0x8d, 0x08, 0x7b, 0xb6, 0xf7,
+ 0xf0, 0xf5, 0x13, 0x69, 0x3e, 0x35, 0x68, 0x58, 0x50, 0xdb, 0x50, 0x13,
+ 0x02, 0x3e, 0x81, 0x4b, 0x44, 0x6c, 0x75, 0x02, 0xe6, 0x90, 0x75, 0x6c,
+ 0xc6, 0x7c, 0x23, 0xec, 0x58, 0xfe, 0xbd, 0x15, 0x97, 0xe8, 0x29, 0x9f,
+ 0x80, 0x54, 0x65, 0xb8, 0x3c, 0x40, 0xe6, 0xdb, 0xbe, 0x51, 0x73, 0xe5,
+ 0xf1, 0x23, 0x02, 0x51, 0x1c, 0xce, 0x79, 0x2b, 0x8b, 0xa5, 0x6e, 0xcf,
+ 0xaf, 0x58, 0xc5, 0xb5, 0xae, 0xb7, 0x1f, 0x42, 0x9a, 0x02, 0x4b, 0x2b,
+ 0x73, 0xa7, 0xb7, 0xe8, 0x18, 0xf3, 0x86, 0x64, 0x44, 0x6e, 0xad, 0x8a,
+ 0x00, 0x30, 0x15, 0x03, 0x0c, 0x66, 0xbf, 0xe4, 0xf7, 0x69, 0x68, 0x46,
+ 0x60, 0x76, 0xc7, 0xa5, 0x1f, 0x4a, 0x09, 0x80, 0x2e, 0x45, 0xe9, 0xe5,
+ 0x25, 0xb8, 0x4b, 0x7c, 0xe6, 0x45, 0x15, 0x55, 0xa0, 0x17, 0xc2, 0x69,
+ 0x42, 0xb8, 0x48, 0x4b, 0x79, 0xc2, 0x37, 0x8f, 0x40, 0xc6, 0x40, 0x01,
+ 0x81, 0x96, 0x9b, 0x17, 0x30, 0x3c, 0x89, 0xb1, 0xbf, 0xf3, 0xf1, 0xa5,
+ 0x4b, 0x04, 0x6c, 0x1a, 0x8a, 0xa3, 0x45, 0x70, 0x5c, 0x62, 0x3d, 0x14,
+ 0x44, 0x29, 0x54, 0x4d, 0x6e, 0x98, 0xb2, 0xd9, 0x00, 0x12, 0x7e, 0x00,
+ 0x24, 0x71, 0xc6, 0xf6, 0xc4, 0xaf, 0xe4, 0xbe, 0xce, 0x90, 0xae, 0xd8,
+ 0xd6, 0x63, 0x48, 0x15, 0x6a, 0x52, 0xd3, 0x84, 0x5d, 0xff, 0xe1, 0xab,
+ 0x69, 0xd0, 0x7e, 0x4f, 0x53, 0xc0, 0xd6, 0x8e, 0x5d, 0xbd, 0x55, 0xc7,
+ 0x92, 0x26, 0xfd, 0x9f, 0xf1, 0x24, 0x1d, 0xca, 0x9c, 0xa3, 0xf5, 0x2e,
+ 0x29, 0xfd, 0x11, 0x89, 0xee, 0xe0, 0x5d, 0x89, 0x21, 0x7c, 0xca, 0x56,
+ 0xa0, 0x73, 0x59, 0xd1, 0xa1, 0x8e, 0x2c, 0x3b, 0x45, 0xeb, 0x0d, 0xef,
+ 0x15, 0x06, 0xdc, 0xd1, 0x9f, 0xee, 0x28, 0x81, 0xff, 0xdf, 0x5d, 0xe9,
+ 0x48, 0x1c, 0x2c, 0x68, 0x43, 0xdd, 0xb7, 0xbf, 0x87, 0xa8, 0x9b, 0x49,
+ 0xf1, 0xab, 0x42, 0xc2, 0x86, 0xda, 0x80, 0x98, 0x11, 0xf4, 0x0a, 0x5a,
+ 0x23, 0x63, 0xa8, 0x17, 0x34, 0x83, 0xab, 0x66, 0x33, 0xe1, 0x1f, 0x62,
+ 0xc7, 0xf5, 0xe8, 0xac, 0xbf, 0x41, 0x4c, 0xfc, 0x02, 0xa3, 0x2d, 0xc1,
+ 0xe2, 0x07, 0x36, 0xdd, 0xf2, 0x8b, 0x9f, 0x2f, 0x89, 0x18, 0x12, 0x88,
+ 0xe6, 0x73, 0xc9, 0x5c, 0x5d, 0x2b, 0x76, 0x7d, 0x7a, 0xc6, 0x2d, 0xad,
+ 0x75, 0xb8, 0xfa, 0x14, 0xd0, 0x12, 0x59, 0x5b, 0x9d, 0x3d, 0xbf, 0x40,
+ 0xc7, 0x9c, 0x33, 0x22, 0x23, 0x75, 0x6c, 0x50, 0x01, 0x80, 0xa8, 0x83,
+ 0x06, 0xd4, 0xd6, 0x8d, 0x36, 0x78, 0xf9, 0x03, 0x23, 0xdb, 0x17, 0x90,
+ 0x52, 0x0c, 0x5f, 0x1b, 0xe6, 0x44, 0x79, 0x52, 0xc5, 0x50, 0x17, 0x81,
+ 0xf3, 0x1b, 0x88, 0xba, 0xfd, 0xbd, 0xa5, 0x51, 0x65, 0x6d, 0x33, 0x96,
+ 0xc2, 0x71, 0x8d, 0x53, 0x1b, 0xab, 0xe9, 0xb9, 0xd0, 0x45, 0x61, 0xaf,
+ 0xf9, 0xb7, 0x38, 0x55, 0x4f, 0xe9, 0x85, 0x1d, 0x4c, 0x0e, 0x40, 0x77,
+ 0x03, 0xbc, 0x09, 0xd0, 0x37, 0xe3, 0xde, 0xf1, 0x0c, 0xa6, 0xc8, 0xd5,
+ 0x63, 0x01, 0xfd, 0xe7, 0xc0, 0x9a, 0xe0, 0x98, 0x02, 0xe4, 0x5e, 0x9e,
+ 0x52, 0x5b, 0x84, 0xb7, 0xce, 0x64, 0x51, 0x55, 0x5a, 0x01, 0x7c, 0x26,
+ 0x94, 0x2b, 0x84, 0x84, 0xb7, 0x9c, 0x23, 0x78, 0xf4, 0x0c, 0x64, 0x00,
+ 0x18, 0x19, 0x69, 0xb1, 0x73, 0x03, 0xc8, 0x9b, 0x1b, 0xff, 0x3f, 0x1a,
+ 0x54, 0xb0, 0x46, 0xc1, 0xa8, 0xaa, 0x34, 0x57, 0x07, 0x13, 0xd3, 0x43,
+ 0xb1, 0xaa, 0x4b, 0xc4, 0xcb, 0x5a, 0x9b, 0xa2, 0x23, 0x98, 0xa2, 0xd3,
+ 0x2b, 0x8c, 0x7b, 0xf8, 0xc7, 0xaa, 0xf6, 0xcc, 0xb8, 0xfc, 0xb5, 0x77,
+ 0xce, 0xff, 0x9d, 0x0e, 0xdb, 0x2b, 0x03, 0xc7, 0x42, 0x86, 0xf1, 0xcb,
+ 0xa2, 0xa7, 0x85, 0x77, 0x58, 0x1a, 0x8f, 0x8c, 0xb4, 0x16, 0xf7, 0xe0,
+ 0xe9, 0x8e, 0x54, 0x42, 0x2a, 0x1e, 0x10, 0xf6, 0xbc, 0x1a, 0x9f, 0xa1,
+ 0xcb, 0xff, 0x13, 0x06, 0x88, 0x6b, 0xb1, 0xeb, 0x37, 0x26, 0xe5, 0x34,
+ 0x0d, 0x73, 0x87, 0x91, 0x60, 0x6c, 0xd7, 0x2d, 0xc3, 0x5f, 0x40, 0x68,
+ 0x45, 0x07, 0x6e, 0x62, 0xa9, 0xe3, 0x52, 0x75, 0xef, 0x14, 0xf5, 0x89,
+ 0x0a, 0x3a, 0x57, 0x8b, 0xac, 0xbe, 0x86, 0x67, 0xd1, 0xd8, 0x35, 0xe5,
+ 0xe7, 0x75, 0xb8, 0xf8, 0x28, 0x6d, 0xa8, 0x09, 0x81, 0x1f, 0x40, 0xa5,
+ 0xa2, 0x36, 0x3a, 0x81, 0x73, 0x48, 0x3e, 0x8c, 0x9d, 0x1f, 0x78, 0xc5,
+ 0x92, 0x36, 0x1a, 0xae, 0xdf, 0xda, 0xf8, 0x0a, 0x7e, 0x69, 0xcb, 0xaf,
+ 0x74, 0x59, 0x49, 0x72, 0xa7, 0x97, 0x1c, 0x8c, 0xf0, 0x16, 0x01, 0x4a,
+ 0xcc, 0x1a, 0xa1, 0x24, 0x83, 0x7b, 0x34, 0x65, 0x20, 0x51, 0x11, 0xae,
+ 0x5d, 0xa7, 0x68, 0x9c, 0xec, 0x29, 0x27, 0xfc, 0x07, 0x49, 0xb4, 0x9b,
+ 0x65, 0xb2, 0x51, 0x97, 0xae, 0xa5, 0x8a, 0x70, 0xe5, 0x53, 0xd3, 0xa2,
+ 0x34, 0x35, 0xbd, 0xbf, 0x75, 0x64, 0xda, 0x88, 0x8c, 0xe9, 0xc3, 0x9a,
+ 0x32, 0xf0, 0x5a, 0x96, 0xae, 0xef, 0x9a, 0xdd, 0x84, 0xc2, 0x97, 0x22,
+ 0x2f, 0x06, 0x83, 0x32, 0x10, 0xff, 0x1d, 0x61, 0x60, 0x5f, 0x69, 0x10,
+ 0x5d, 0x23, 0xc6, 0xf3, 0x3f, 0xa9, 0x53, 0xfe, 0xd0, 0x3e, 0x90, 0xe6,
+ 0x54, 0x48, 0xab, 0x01, 0x76, 0x75, 0x88, 0x7b, 0x4e, 0xc6, 0xd0, 0x9b,
+ 0x7a, 0xcd, 0x87, 0x36, 0x3e, 0x7e, 0x3d, 0xef, 0x10, 0xca, 0x6c, 0x8d,
+ 0x56, 0x30, 0x1f, 0xde, 0x7c, 0x09, 0xae, 0x09, 0x80, 0x2e, 0x45, 0xe9,
+ 0xe5, 0x25, 0xb8, 0x4b, 0x7c, 0xe6, 0x45, 0x15, 0x55, 0xa0, 0x17, 0xc2,
+ 0x69, 0x42, 0xb8, 0x48, 0x4b, 0x79, 0xc2, 0x37, 0x8f, 0x40, 0xc6, 0x40,
+ 0x01, 0x81, 0x96, 0x9b, 0x17, 0x30, 0x3c, 0x89, 0xb1, 0xbf, 0xf3, 0xf1,
+ 0xa5, 0x5c, 0xdc, 0x1e, 0x69, 0xfc, 0xf1, 0xd8, 0x5d, 0xda, 0x13, 0x5b,
+ 0xbc, 0x1f, 0x41, 0x4a, 0xde, 0x44, 0x3c, 0x5e, 0xbd, 0x46, 0xb7, 0xad,
+ 0x32, 0xb8, 0xc7, 0xbf, 0x8c, 0x7a, 0xaf, 0x6c, 0xcb, 0x8f, 0xcb, 0x57,
+ 0x7c, 0xef, 0xf9, 0xd0, 0xed, 0xb2, 0xb0, 0x3c, 0x74, 0x28, 0x6f, 0x1c,
+ 0xba, 0x2a, 0x78, 0x57, 0x75, 0x81, 0xa8, 0xf8, 0xcb, 0x41, 0x6f, 0x7e,
+ 0x0e, 0x98, 0xe5, 0x44, 0x22, 0xa2, 0x00, 0x6c, 0xba, 0xaf, 0x51, 0xcc,
+ 0x9f, 0xba, 0x97, 0x39, 0xbb, 0x41, 0x60, 0xf0, 0xe9, 0xb7, 0xa7, 0xa0,
+ 0x7b, 0x7a, 0xde, 0xc9, 0x22, 0x13, 0xf4, 0x04, 0xaf, 0x91, 0xf5, 0x37,
+ 0x53, 0xad, 0x8d, 0x0d, 0x15, 0x7a, 0xf1, 0x81, 0x07, 0xd6, 0xa8, 0x80,
+ 0x0c, 0x8d, 0x02, 0x79, 0x43, 0x50, 0x98, 0x27, 0xfc, 0xbc, 0xb7, 0x8f,
+ 0xe0, 0xe6, 0x46, 0x6f, 0x25, 0xef, 0x2a, 0x04, 0xd1, 0xbe, 0x10, 0x3d,
+ 0xb4, 0x43, 0x3e, 0xf7, 0xea, 0xf4, 0xb8, 0x24, 0xdc, 0x77, 0x4f, 0x52,
+ 0x26, 0x55, 0xae, 0xbc, 0x6f, 0xe0, 0x8e, 0x41, 0x97, 0x82, 0xd4, 0xb5,
+ 0x77, 0x7c, 0xd6, 0xec, 0x26, 0x14, 0xb9, 0x11, 0x78, 0x34, 0x19, 0x90,
+ 0x87, 0xf8, 0xeb, 0x0b, 0x02, 0xfb, 0x48, 0x82, 0xe9, 0x1e, 0x37, 0x99,
+ 0xfd, 0x4a, 0x9f, 0xf6, 0x81, 0xf4, 0x87, 0x32, 0xa2, 0x45, 0x58, 0x0b,
+ 0xb3, 0xac, 0x43, 0xda, 0x76, 0x36, 0x84, 0xdb, 0xd6, 0x6c, 0x39, 0xb1,
+ 0xf3, 0xf1, 0xef, 0x78, 0x86, 0x53, 0x64, 0x6a, 0xb1, 0x80, 0xfe, 0xf3,
+ 0xe0, 0x4d, 0x70, 0x4c, 0x01, 0x72, 0x2f, 0x4f, 0x29, 0x2d, 0xc2, 0x5c,
+};
+static_assert(sizeof(kBytesTestReadSymbol16) == kNumBytesTestReadSymbol16, "");
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/executor.h"
+
+namespace libgav1 {
+
+Executor::~Executor() = default;
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_EXECUTOR_H_
+#define LIBGAV1_SRC_UTILS_EXECUTOR_H_
+
+#include <functional>
+
+namespace libgav1 {
+
+class Executor {
+ public:
+ virtual ~Executor();
+
+ // Schedules the specified "callback" for execution in this executor.
+ // Depending on the subclass implementation, this may block in some
+ // situations.
+ virtual void Schedule(std::function<void()> callback) = 0;
+};
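+
+// A minimal sketch of a concrete executor (hypothetical; not part of this
+// header) that simply runs each callback inline on the calling thread:
+//
+//   class InlineExecutor : public Executor {
+//    public:
+//     void Schedule(std::function<void()> callback) override { callback(); }
+//   };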
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_EXECUTOR_H_
--- /dev/null
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_)
+ return()
+endif() # LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_
+set(LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_ 1)
+
+list(APPEND libgav1_utils_sources
+ "${libgav1_source}/utils/array_2d.h"
+ "${libgav1_source}/utils/bit_mask_set.h"
+ "${libgav1_source}/utils/bit_reader.cc"
+ "${libgav1_source}/utils/bit_reader.h"
+ "${libgav1_source}/utils/block_parameters_holder.cc"
+ "${libgav1_source}/utils/block_parameters_holder.h"
+ "${libgav1_source}/utils/blocking_counter.h"
+ "${libgav1_source}/utils/common.h"
+ "${libgav1_source}/utils/compiler_attributes.h"
+ "${libgav1_source}/utils/constants.cc"
+ "${libgav1_source}/utils/constants.h"
+ "${libgav1_source}/utils/cpu.cc"
+ "${libgav1_source}/utils/cpu.h"
+ "${libgav1_source}/utils/dynamic_buffer.h"
+ "${libgav1_source}/utils/entropy_decoder.cc"
+ "${libgav1_source}/utils/entropy_decoder.h"
+ "${libgav1_source}/utils/executor.cc"
+ "${libgav1_source}/utils/executor.h"
+ "${libgav1_source}/utils/logging.cc"
+ "${libgav1_source}/utils/logging.h"
+ "${libgav1_source}/utils/memory.h"
+ "${libgav1_source}/utils/queue.h"
+ "${libgav1_source}/utils/raw_bit_reader.cc"
+ "${libgav1_source}/utils/raw_bit_reader.h"
+ "${libgav1_source}/utils/reference_info.h"
+ "${libgav1_source}/utils/segmentation.cc"
+ "${libgav1_source}/utils/segmentation.h"
+ "${libgav1_source}/utils/segmentation_map.cc"
+ "${libgav1_source}/utils/segmentation_map.h"
+ "${libgav1_source}/utils/stack.h"
+ "${libgav1_source}/utils/threadpool.cc"
+ "${libgav1_source}/utils/threadpool.h"
+ "${libgav1_source}/utils/types.h"
+ "${libgav1_source}/utils/unbounded_queue.h"
+ "${libgav1_source}/utils/vector.h")
+
+macro(libgav1_add_utils_targets)
+ libgav1_add_library(NAME
+ libgav1_utils
+ TYPE
+ OBJECT
+ SOURCES
+ ${libgav1_utils_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ ${libgav1_gtest_include_paths})
+
+endmacro()
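+
+# A sketch of how a parent listfile might consume this macro (hypothetical
+# include path):
+#   include("${libgav1_root}/cmake/libgav1_utils.cmake")
+#   libgav1_add_utils_targets()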
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/logging.h"
+
+#include <cstdarg>
+#include <cstdio>
+#include <sstream>
+#include <thread> // NOLINT (unapproved c++11 header)
+
+#if !defined(LIBGAV1_LOG_LEVEL)
+#define LIBGAV1_LOG_LEVEL (1 << 30)
+#endif
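+
+// (With the default of 1 << 30, the LIBGAV1_LOG_LEVEL < severity check in
+// Log() below never suppresses a message; defining a smaller value filters
+// out severities whose integer value exceeds it, kError being the lowest.)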
+
+namespace libgav1 {
+namespace internal {
+#if LIBGAV1_ENABLE_LOGGING
+namespace {
+
+const char* LogSeverityName(LogSeverity severity) {
+ switch (severity) {
+ case LogSeverity::kInfo:
+ return "INFO";
+ case LogSeverity::kError:
+ return "ERROR";
+ case LogSeverity::kWarning:
+ return "WARNING";
+ }
+ return "UNKNOWN";
+}
+
+} // namespace
+
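+// A line produced by Log() has the form (illustrative values):
+//   ERROR 7f1c63fff700 decoder.cc:123] something went wrong
+// where the second field is the calling thread's id in hex.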
+void Log(LogSeverity severity, const char* file, int line, const char* format,
+ ...) {
+ if (LIBGAV1_LOG_LEVEL < static_cast<int>(severity)) return;
+ std::ostringstream ss;
+ ss << std::hex << std::this_thread::get_id();
+ fprintf(stderr, "%s %s %s:%d] ", LogSeverityName(severity), ss.str().c_str(),
+ file, line);
+
+ va_list ap;
+ va_start(ap, format);
+ vfprintf(stderr, format, ap);
+ va_end(ap);
+ fprintf(stderr, "\n");
+}
+#else // !LIBGAV1_ENABLE_LOGGING
+void Log(LogSeverity /*severity*/, const char* /*file*/, int /*line*/,
+ const char* /*format*/, ...) {}
+#endif // LIBGAV1_ENABLE_LOGGING
+
+} // namespace internal
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_LOGGING_H_
+#define LIBGAV1_SRC_UTILS_LOGGING_H_
+
+#include <cstddef>
+
+#include "src/utils/compiler_attributes.h"
+
+#if !defined(LIBGAV1_ENABLE_LOGGING)
+#if defined(NDEBUG) || defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_ENABLE_LOGGING
+// LIBGAV1_DLOG(severity, printf-format-string)
+// Debug logging that can optionally be enabled in release builds by explicitly
+// setting LIBGAV1_ENABLE_LOGGING.
+// Severity is given as an all-caps version of enum LogSeverity with the
+// leading 'k' removed: LIBGAV1_DLOG(INFO, "...");
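+// An illustrative call with printf-style arguments (|size| is hypothetical):
+//   LIBGAV1_DLOG(ERROR, "failed to allocate %zu bytes", size);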
+#define LIBGAV1_DLOG(severity, ...) \
+ do { \
+ constexpr const char* libgav1_logging_internal_basename = \
+ libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1); \
+ libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity, \
+ libgav1_logging_internal_basename, __LINE__, \
+ __VA_ARGS__); \
+ } while (0)
+#else
+#define LIBGAV1_DLOG(severity, ...) \
+ do { \
+ } while (0)
+#endif // LIBGAV1_ENABLE_LOGGING
+
+#define LIBGAV1_LOGGING_INTERNAL_ERROR libgav1::internal::LogSeverity::kError
+#define LIBGAV1_LOGGING_INTERNAL_WARNING \
+ libgav1::internal::LogSeverity::kWarning
+#define LIBGAV1_LOGGING_INTERNAL_INFO libgav1::internal::LogSeverity::kInfo
+
+namespace libgav1 {
+namespace internal {
+
+enum class LogSeverity : int {
+ kError,
+ kWarning,
+ kInfo,
+};
+
+// Helper function to implement LIBGAV1_DLOG
+// Logs |format, ...| at |severity| level, reporting it as called from
+// |file|:|line|.
+void Log(libgav1::internal::LogSeverity severity, const char* file, int line,
+ const char* format, ...) LIBGAV1_PRINTF_ATTRIBUTE(4, 5);
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
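+// For example, Basename("src/utils/logging.cc", 20) evaluates to
+// "logging.cc" at compile time.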
+constexpr const char* Basename(const char* file_name, size_t offset) {
+ return (offset == 0 || file_name[offset - 1] == '/' ||
+ file_name[offset - 1] == '\\')
+ ? file_name + offset
+ : Basename(file_name, offset - 1);
+}
+
+} // namespace internal
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_LOGGING_H_
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_MEMORY_H_
+#define LIBGAV1_SRC_UTILS_MEMORY_H_
+
+#if defined(__ANDROID__) || defined(_MSC_VER) || defined(__MINGW32__)
+#include <malloc.h>
+#endif
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <new>
+
+namespace libgav1 {
+
+enum {
+// The byte alignment required for buffers used with SIMD code to be read or
+// written with aligned operations.
+#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
+ defined(_M_X64)
+ kMaxAlignment = 32, // extended alignment is safe on x86.
+#else
+ kMaxAlignment = alignof(max_align_t),
+#endif
+};
+
+// AlignedAlloc, AlignedFree
+//
+// void* AlignedAlloc(size_t alignment, size_t size);
+// Allocate aligned memory.
+// |alignment| must be a power of 2.
+// Unlike posix_memalign(), |alignment| may be smaller than sizeof(void*).
+// Unlike aligned_alloc(), |size| does not need to be a multiple of
+// |alignment|.
+// The returned pointer should be freed by AlignedFree().
+//
+// void AlignedFree(void* aligned_memory);
+// Free aligned memory.
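+//
+// A minimal usage sketch:
+//   void* const buffer = AlignedAlloc(/*alignment=*/32, /*size=*/1024);
+//   if (buffer != nullptr) {
+//     // ... use |buffer| with 32-byte aligned loads/stores ...
+//     AlignedFree(buffer);
+//   }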
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+
+inline void* AlignedAlloc(size_t alignment, size_t size) {
+ return _aligned_malloc(size, alignment);
+}
+
+inline void AlignedFree(void* aligned_memory) { _aligned_free(aligned_memory); }
+
+#else // !(defined(_MSC_VER) || defined(__MINGW32__))
+
+inline void* AlignedAlloc(size_t alignment, size_t size) {
+#if defined(__ANDROID__)
+ // Although posix_memalign() was introduced in Android API level 17, it is
+ // more convenient to use memalign(). Unlike glibc, Android does not consider
+ // memalign() an obsolete function.
+ return memalign(alignment, size);
+#else // !defined(__ANDROID__)
+ void* ptr = nullptr;
+ // posix_memalign requires that the requested alignment be at least
+ // sizeof(void*). In this case, fall back on malloc which should return
+ // memory aligned to at least the size of a pointer.
+ const size_t required_alignment = sizeof(void*);
+ if (alignment < required_alignment) return malloc(size);
+ const int error = posix_memalign(&ptr, alignment, size);
+ if (error != 0) {
+ errno = error;
+ return nullptr;
+ }
+ return ptr;
+#endif // defined(__ANDROID__)
+}
+
+inline void AlignedFree(void* aligned_memory) { free(aligned_memory); }
+
+#endif // defined(_MSC_VER) || defined(__MINGW32__)
+
+inline void Memset(uint8_t* const dst, int value, size_t count) {
+ memset(dst, value, count);
+}
+
+inline void Memset(uint16_t* const dst, int value, size_t count) {
+ for (size_t i = 0; i < count; ++i) {
+ dst[i] = static_cast<uint16_t>(value);
+ }
+}
+
+inline void Memset(int16_t* const dst, int value, size_t count) {
+ for (size_t i = 0; i < count; ++i) {
+ dst[i] = static_cast<int16_t>(value);
+ }
+}
+
+struct MallocDeleter {
+ void operator()(void* ptr) const { free(ptr); }
+};
+
+struct AlignedDeleter {
+ void operator()(void* ptr) const { AlignedFree(ptr); }
+};
+
+template <typename T>
+using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
+
+// Allocates aligned memory for an array of |count| elements of type T.
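+// For example (illustrative), a 64-element uint16_t array aligned to
+// kMaxAlignment bytes:
+//   auto buffer = MakeAlignedUniquePtr<uint16_t>(kMaxAlignment, 64);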
+template <typename T>
+inline AlignedUniquePtr<T> MakeAlignedUniquePtr(size_t alignment,
+ size_t count) {
+ return AlignedUniquePtr<T>(
+ static_cast<T*>(AlignedAlloc(alignment, count * sizeof(T))));
+}
+
+// A base class with custom new and delete operators. The exception-throwing
+// new operators are deleted. The "new (std::nothrow)" form must be used.
+//
+// The new operators return nullptr if the requested size is greater than
+// 0x40000000 bytes (1 GB). TODO(wtc): Make the maximum allocable memory size
+// a compile-time configuration macro.
+//
+// See https://en.cppreference.com/w/cpp/memory/new/operator_new and
+// https://en.cppreference.com/w/cpp/memory/new/operator_delete.
+//
+// NOTE: The allocation and deallocation functions are static member functions
+// whether the keyword 'static' is used or not.
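+//
+// A minimal usage sketch (Foo is hypothetical):
+//   struct Foo : public Allocable { int x; };
+//   Foo* const foo = new (std::nothrow) Foo;  // nullptr on failure.
+//   delete foo;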
+struct Allocable {
+ // Class-specific allocation functions.
+ static void* operator new(size_t size) = delete;
+ static void* operator new[](size_t size) = delete;
+
+ // Class-specific non-throwing allocation functions
+ static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+ if (size > 0x40000000) return nullptr;
+ return ::operator new(size, tag);
+ }
+ static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept {
+ if (size > 0x40000000) return nullptr;
+ return ::operator new[](size, tag);
+ }
+
+ // Class-specific deallocation functions.
+ static void operator delete(void* ptr) noexcept { ::operator delete(ptr); }
+ static void operator delete[](void* ptr) noexcept {
+ ::operator delete[](ptr);
+ }
+
+ // Only called if new (std::nothrow) is used and the constructor throws an
+ // exception.
+ static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+ ::operator delete(ptr, tag);
+ }
+ // Only called if new[] (std::nothrow) is used and the constructor throws an
+ // exception.
+ static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept {
+ ::operator delete[](ptr, tag);
+ }
+};
+
+// A variant of Allocable that forces allocations to be aligned to
+// kMaxAlignment bytes. This is intended for use with classes that use
+// alignas() with this value. C++17 aligned new/delete are used if available,
+// otherwise we use AlignedAlloc/Free.
+struct MaxAlignedAllocable {
+ // Class-specific allocation functions.
+ static void* operator new(size_t size) = delete;
+ static void* operator new[](size_t size) = delete;
+
+ // Class-specific non-throwing allocation functions
+ static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+ if (size > 0x40000000) return nullptr;
+#ifdef __cpp_aligned_new
+ return ::operator new(size, std::align_val_t(kMaxAlignment), tag);
+#else
+ static_cast<void>(tag);
+ return AlignedAlloc(kMaxAlignment, size);
+#endif
+ }
+ static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept {
+ if (size > 0x40000000) return nullptr;
+#ifdef __cpp_aligned_new
+ return ::operator new[](size, std::align_val_t(kMaxAlignment), tag);
+#else
+ static_cast<void>(tag);
+ return AlignedAlloc(kMaxAlignment, size);
+#endif
+ }
+
+ // Class-specific deallocation functions.
+ static void operator delete(void* ptr) noexcept {
+#ifdef __cpp_aligned_new
+ ::operator delete(ptr, std::align_val_t(kMaxAlignment));
+#else
+ AlignedFree(ptr);
+#endif
+ }
+ static void operator delete[](void* ptr) noexcept {
+#ifdef __cpp_aligned_new
+ ::operator delete[](ptr, std::align_val_t(kMaxAlignment));
+#else
+ AlignedFree(ptr);
+#endif
+ }
+
+ // Only called if new (std::nothrow) is used and the constructor throws an
+ // exception.
+ static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+#ifdef __cpp_aligned_new
+ ::operator delete(ptr, std::align_val_t(kMaxAlignment), tag);
+#else
+ static_cast<void>(tag);
+ AlignedFree(ptr);
+#endif
+ }
+ // Only called if new[] (std::nothrow) is used and the constructor throws an
+ // exception.
+ static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept {
+#ifdef __cpp_aligned_new
+ ::operator delete[](ptr, std::align_val_t(kMaxAlignment), tag);
+#else
+ static_cast<void>(tag);
+ AlignedFree(ptr);
+#endif
+ }
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_MEMORY_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/memory.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "absl/base/config.h"
+#include "gtest/gtest.h"
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+#include <exception>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr size_t kMaxAllocableSize = 0x40000000;
+
+struct Small : public Allocable {
+ uint8_t x;
+};
+
+struct Huge : public Allocable {
+ uint8_t x[kMaxAllocableSize + 1];
+};
+
+struct SmallMaxAligned : public MaxAlignedAllocable {
+ alignas(kMaxAlignment) uint8_t x;
+};
+
+struct HugeMaxAligned : public MaxAlignedAllocable {
+ alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1];
+};
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+struct ThrowingConstructor : public Allocable {
+ ThrowingConstructor() { throw std::exception(); }
+
+ uint8_t x;
+};
+
+struct MaxAlignedThrowingConstructor : public MaxAlignedAllocable {
+ MaxAlignedThrowingConstructor() { throw std::exception(); }
+
+ uint8_t x;
+};
+#endif
+
+TEST(MemoryTest, TestAlignedAllocFree) {
+ for (size_t alignment = 1; alignment <= 1 << 20; alignment <<= 1) {
+ void* p = AlignedAlloc(alignment, 1);
+ // Note this additional check is to avoid an incorrect static-analysis
+ // warning for leaked memory with a plain ASSERT_NE().
+ if (p == nullptr) {
+ FAIL() << "AlignedAlloc(" << alignment << ", 1)";
+ }
+ const auto p_value = reinterpret_cast<uintptr_t>(p);
+ EXPECT_EQ(p_value % alignment, 0)
+ << "AlignedAlloc(" << alignment << ", 1) = " << p;
+ AlignedFree(p);
+ }
+}
+
+TEST(MemoryTest, TestAlignedUniquePtrAlloc) {
+ for (size_t alignment = 1; alignment <= 1 << 20; alignment <<= 1) {
+ auto p = MakeAlignedUniquePtr<uint8_t>(alignment, 1);
+ ASSERT_NE(p, nullptr) << "MakeAlignedUniquePtr(" << alignment << ", 1)";
+ const auto p_value = reinterpret_cast<uintptr_t>(p.get());
+ EXPECT_EQ(p_value % alignment, 0)
+ << "MakeAlignedUniquePtr(" << alignment << ", 1) = " << p.get();
+ }
+}
+
+TEST(MemoryTest, TestAllocable) {
+ // Allocable::operator new (std::nothrow) is called.
+ std::unique_ptr<Small> small(new (std::nothrow) Small);
+ EXPECT_NE(small, nullptr);
+ // Allocable::operator delete is called.
+ small = nullptr;
+
+ // Allocable::operator new[] (std::nothrow) is called.
+ std::unique_ptr<Small[]> small_array_of_smalls(new (std::nothrow) Small[10]);
+ EXPECT_NE(small_array_of_smalls, nullptr);
+ // Allocable::operator delete[] is called.
+ small_array_of_smalls = nullptr;
+
+ // Allocable::operator new (std::nothrow) is called.
+ std::unique_ptr<Huge> huge(new (std::nothrow) Huge);
+ EXPECT_EQ(huge, nullptr);
+
+ // Allocable::operator new[] (std::nothrow) is called.
+ std::unique_ptr<Small[]> huge_array_of_smalls(
+ new (std::nothrow) Small[kMaxAllocableSize / sizeof(Small) + 1]);
+ EXPECT_EQ(huge_array_of_smalls, nullptr);
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+ try {
+ // Allocable::operator new (std::nothrow) is called.
+ // The constructor throws an exception.
+ // Allocable::operator delete (std::nothrow) is called.
+ ThrowingConstructor* always = new (std::nothrow) ThrowingConstructor;
+ static_cast<void>(always);
+ } catch (...) {
+ }
+
+ try {
+ // Allocable::operator new[] (std::nothrow) is called.
+ // The constructor throws an exception.
+ // Allocable::operator delete[] (std::nothrow) is called.
+ ThrowingConstructor* always = new (std::nothrow) ThrowingConstructor[2];
+ static_cast<void>(always);
+ } catch (...) {
+ }
+#endif // ABSL_HAVE_EXCEPTIONS
+}
+
+TEST(MemoryTest, TestMaxAlignedAllocable) {
+ // MaxAlignedAllocable::operator new (std::nothrow) is called.
+ std::unique_ptr<SmallMaxAligned> small(new (std::nothrow) SmallMaxAligned);
+ EXPECT_NE(small, nullptr);
+ // Note this check doesn't guarantee conformance as a suitably aligned
+ // address may be returned from any allocator.
+ EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1), 0);
+ // MaxAlignedAllocable::operator delete is called.
+ small = nullptr;
+
+ // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+ std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls(
+ new (std::nothrow) SmallMaxAligned[10]);
+ EXPECT_NE(small_array_of_smalls, nullptr);
+ EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) &
+ (kMaxAlignment - 1),
+ 0);
+ // MaxAlignedAllocable::operator delete[] is called.
+ small_array_of_smalls = nullptr;
+
+ // MaxAlignedAllocable::operator new (std::nothrow) is called.
+ std::unique_ptr<HugeMaxAligned> huge(new (std::nothrow) HugeMaxAligned);
+ EXPECT_EQ(huge, nullptr);
+
+ // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+ std::unique_ptr<SmallMaxAligned[]> huge_array_of_smalls(
+ new (std::nothrow)
+ SmallMaxAligned[kMaxAllocableSize / sizeof(SmallMaxAligned) + 1]);
+ EXPECT_EQ(huge_array_of_smalls, nullptr);
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+ try {
+ // MaxAlignedAllocable::operator new (std::nothrow) is called.
+ // The constructor throws an exception.
+ // MaxAlignedAllocable::operator delete (std::nothrow) is called.
+ auto* always = new (std::nothrow) MaxAlignedThrowingConstructor;
+ static_cast<void>(always);
+ } catch (...) {
+ }
+
+ try {
+ // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+ // The constructor throws an exception.
+ // MaxAlignedAllocable::operator delete[] (std::nothrow) is called.
+ auto* always = new (std::nothrow) MaxAlignedThrowingConstructor[2];
+ static_cast<void>(always);
+ } catch (...) {
+ }
+#endif // ABSL_HAVE_EXCEPTIONS
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_QUEUE_H_
+#define LIBGAV1_SRC_UTILS_QUEUE_H_
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// A FIFO queue of a fixed capacity.
+//
+// WARNING: No error checking is performed.
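+//
+// A minimal usage sketch:
+//   Queue<int> queue;
+//   if (!queue.Init(8)) return;  // Allocation failed.
+//   queue.Push(42);
+//   const int front = queue.Front();  // 42.
+//   queue.Pop();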
+template <typename T>
+class Queue {
+ public:
+ LIBGAV1_MUST_USE_RESULT bool Init(size_t capacity) {
+ elements_.reset(new (std::nothrow) T[capacity]);
+ if (elements_ == nullptr) return false;
+ capacity_ = capacity;
+ return true;
+ }
+
+ // Pushes the element |value| to the end of the queue. It is an error to call
+ // Push() when the queue is full.
+ void Push(T&& value) {
+ assert(size_ < capacity_);
+ elements_[end_++] = std::move(value);
+ if (end_ == capacity_) end_ = 0;
+ ++size_;
+ }
+
+ // Removes the element at the front of the queue. It is an error to call Pop()
+ // when the queue is empty.
+ void Pop() {
+ assert(size_ != 0);
+ const T element = std::move(elements_[begin_++]);
+ static_cast<void>(element);
+ if (begin_ == capacity_) begin_ = 0;
+ --size_;
+ }
+
+ // Returns a reference to the element at the front of the queue. It is an
+ // error to call Front() when the queue is empty.
+ T& Front() {
+ assert(size_ != 0);
+ return elements_[begin_];
+ }
+
+ // Returns a reference to the element at the back of the queue. It is an error
+ // to call Back() when the queue is empty.
+ T& Back() {
+ assert(size_ != 0);
+ const size_t back = ((end_ == 0) ? capacity_ : end_) - 1;
+ return elements_[back];
+ }
+
+ // Clears the queue.
+ void Clear() {
+ while (!Empty()) {
+ Pop();
+ }
+ }
+
+ // Returns true if the queue is empty.
+ bool Empty() const { return size_ == 0; }
+
+ // Returns true if the queue is full.
+ bool Full() const { return size_ >= capacity_; }
+
+ // Returns the number of elements in the queue.
+ size_t Size() const { return size_; }
+
+ private:
+ // An array of |capacity| elements. Used as a circular array.
+ std::unique_ptr<T[]> elements_;
+ size_t capacity_ = 0;
+ // The index of the element to be removed by Pop().
+ size_t begin_ = 0;
+ // The index where the new element is inserted by Push().
+ size_t end_ = 0;
+ size_t size_ = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_QUEUE_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/queue.h"
+
+#include <utility>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+struct TestClass {
+ TestClass() = default;
+ explicit TestClass(int i) : i(i) {}
+ int i;
+ // The vector exists simply so that the class is not trivially copyable.
+ std::vector<int> dummy;
+};
+
+TEST(QueueTest, Basic) {
+ Queue<TestClass> queue;
+ ASSERT_TRUE(queue.Init(8));
+ EXPECT_TRUE(queue.Empty());
+
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_FALSE(queue.Full());
+ TestClass test(i);
+ queue.Push(std::move(test));
+ EXPECT_EQ(queue.Back().i, i);
+ EXPECT_FALSE(queue.Empty());
+ }
+ EXPECT_TRUE(queue.Full());
+
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_FALSE(queue.Empty());
+ EXPECT_EQ(queue.Front().i, i);
+ queue.Pop();
+ EXPECT_FALSE(queue.Full());
+ }
+ EXPECT_TRUE(queue.Empty());
+
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_FALSE(queue.Full());
+ TestClass test(i);
+ queue.Push(std::move(test));
+ EXPECT_EQ(queue.Back().i, i);
+ EXPECT_FALSE(queue.Empty());
+ }
+ EXPECT_TRUE(queue.Full());
+ queue.Clear();
+ EXPECT_TRUE(queue.Empty());
+ EXPECT_FALSE(queue.Full());
+}
+
+TEST(QueueTest, WrapAround) {
+ Queue<TestClass> queue;
+ ASSERT_TRUE(queue.Init(8));
+ EXPECT_TRUE(queue.Empty());
+
+ for (int i = 0; i < 100; ++i) {
+ EXPECT_FALSE(queue.Full());
+ TestClass test(i);
+ queue.Push(std::move(test));
+ EXPECT_EQ(queue.Back().i, i);
+ EXPECT_FALSE(queue.Empty());
+ EXPECT_EQ(queue.Front().i, i);
+ queue.Pop();
+ EXPECT_TRUE(queue.Empty());
+ }
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/raw_bit_reader.h"
+
+#include <cassert>
+#include <limits>
+
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+// Note <cinttypes> is only needed when logging is enabled (for the PRI*
+// macros). It depends on the definition of LIBGAV1_ENABLE_LOGGING from
+// logging.h, thus the non-standard header ordering.
+#if LIBGAV1_ENABLE_LOGGING
+#include <cinttypes>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr int kMaximumLeb128Size = 8;
+constexpr uint8_t kLeb128ValueByteMask = 0x7f;
+constexpr uint8_t kLeb128TerminationByteMask = 0x80;
+
+uint8_t Mod8(size_t n) {
+ // The low 3 bits of |n| are the value of n mod 8.
+ return n & 0x07;
+}
+
+size_t DivideBy8(size_t n, bool ceil) { return (n + (ceil ? 7 : 0)) >> 3; }
+
+} // namespace
+
+RawBitReader::RawBitReader(const uint8_t* data, size_t size)
+ : data_(data), bit_offset_(0), size_(size) {
+ assert(data_ != nullptr || size_ == 0);
+}
+
+int RawBitReader::ReadBitImpl() {
+ const size_t byte_offset = DivideBy8(bit_offset_, false);
+ const uint8_t byte = data_[byte_offset];
+ const uint8_t shift = 7 - Mod8(bit_offset_);
+ ++bit_offset_;
+ return static_cast<int>((byte >> shift) & 0x01);
+}
+
+int RawBitReader::ReadBit() {
+ if (Finished()) return -1;
+ return ReadBitImpl();
+}
+
+int64_t RawBitReader::ReadLiteral(int num_bits) {
+ assert(num_bits <= 32);
+ if (!CanReadLiteral(num_bits)) return -1;
+ assert(num_bits > 0);
+ uint32_t literal = 0;
+ int bit = num_bits - 1;
+ do {
+ // ARM can fold a shift by a constant number of bits into certain other
+ // operations, such as OR. Here is an ARM disassembly example:
+ // orr w1, w0, w1, lsl #1
+ // which left-shifts register w1 by 1 bit and ORs the shifted result with
+ // register w0.
+ // The next 2 lines are equivalent to:
+ // literal |= static_cast<uint32_t>(ReadBitImpl()) << bit;
+ literal <<= 1;
+ literal |= static_cast<uint32_t>(ReadBitImpl());
+ } while (--bit >= 0);
+ return literal;
+}
+
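+// su(1+num_bits) in the spec: reads num_bits + 1 bits, where the high bit
+// carries the sign. For example, with num_bits = 6 the 7-bit value 1101001
+// (105) has its sign bit (64) set, so the result is 105 - 2 * 64 = -23.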
+bool RawBitReader::ReadInverseSignedLiteral(int num_bits, int* const value) {
+ assert(num_bits + 1 < 32);
+ *value = static_cast<int>(ReadLiteral(num_bits + 1));
+ if (*value == -1) return false;
+ const int sign_bit = 1 << num_bits;
+ if ((*value & sign_bit) != 0) {
+ *value -= 2 * sign_bit;
+ }
+ return true;
+}
+
+bool RawBitReader::ReadLittleEndian(int num_bytes, size_t* const value) {
+ // We must be at a byte boundary.
+ assert(Mod8(bit_offset_) == 0);
+ assert(num_bytes <= 4);
+ static_assert(sizeof(size_t) >= 4, "");
+ if (value == nullptr) return false;
+ size_t byte_offset = DivideBy8(bit_offset_, false);
+ if (Finished() || byte_offset + num_bytes > size_) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits to read Little Endian value.");
+ return false;
+ }
+ *value = 0;
+ for (int i = 0; i < num_bytes; ++i) {
+ const size_t byte = data_[byte_offset];
+ *value |= (byte << (i * 8));
+ ++byte_offset;
+ }
+ bit_offset_ = byte_offset * 8;
+ return true;
+}
+
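+// For example, the bytes 0xd9 0x01 decode as (0xd9 & 0x7f) | (0x01 << 7) =
+// 89 + 128 = 217; the clear continuation bit (0x80) of 0x01 ends the value.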
+bool RawBitReader::ReadUnsignedLeb128(size_t* const value) {
+ // We must be at a byte boundary.
+ assert(Mod8(bit_offset_) == 0);
+ if (value == nullptr) return false;
+ uint64_t value64 = 0;
+ for (int i = 0; i < kMaximumLeb128Size; ++i) {
+ if (Finished()) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits to read LEB128 value.");
+ return false;
+ }
+ const size_t byte_offset = DivideBy8(bit_offset_, false);
+ const uint8_t byte = data_[byte_offset];
+ bit_offset_ += 8;
+ value64 |= static_cast<uint64_t>(byte & kLeb128ValueByteMask) << (i * 7);
+ if ((byte & kLeb128TerminationByteMask) == 0) {
+ if (value64 != static_cast<size_t>(value64) ||
+ value64 > std::numeric_limits<uint32_t>::max()) {
+ LIBGAV1_DLOG(
+ ERROR, "LEB128 value (%" PRIu64 ") exceeded uint32_t maximum (%u).",
+ value64, std::numeric_limits<uint32_t>::max());
+ return false;
+ }
+ *value = static_cast<size_t>(value64);
+ return true;
+ }
+ }
+ LIBGAV1_DLOG(
+ ERROR,
+ "Exceeded kMaximumLeb128Size (%d) when trying to read LEB128 value",
+ kMaximumLeb128Size);
+ return false;
+}
+
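+// For example, the bit string 011 decodes as one leading zero, a terminating
+// one bit, and then a 1-bit literal of 1, giving 1 + ((1 << 1) - 1) = 2.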
+bool RawBitReader::ReadUvlc(uint32_t* const value) {
+ if (value == nullptr) return false;
+ int leading_zeros = 0;
+ while (true) {
+ const int bit = ReadBit();
+ if (bit == -1) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits to read uvlc value.");
+ return false;
+ }
+ if (bit == 1) break;
+ ++leading_zeros;
+ if (leading_zeros == 32) {
+ LIBGAV1_DLOG(ERROR,
+ "Exceeded maximum size (32) when trying to read uvlc value");
+ return false;
+ }
+ }
+ int literal;
+ if (leading_zeros != 0) {
+ literal = static_cast<int>(ReadLiteral(leading_zeros));
+ if (literal == -1) {
+ LIBGAV1_DLOG(ERROR, "Not enough bits to read uvlc value.");
+ return false;
+ }
+ literal += (1U << leading_zeros) - 1;
+ } else {
+ literal = 0;
+ }
+ *value = literal;
+ return true;
+}
+
+bool RawBitReader::AlignToNextByte() {
+ while ((bit_offset_ & 7) != 0) {
+ if (ReadBit() != 0) {
+ return false;
+ }
+ }
+ return true;
+}
+
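+// Per section 5.3.4, the trailing bits are a single one bit followed by
+// zeros; for example, VerifyAndSkipTrailingBits(8) succeeds on the byte 0x80.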
+bool RawBitReader::VerifyAndSkipTrailingBits(size_t num_bits) {
+ if (ReadBit() != 1) return false;
+ for (size_t i = 0; i < num_bits - 1; ++i) {
+ if (ReadBit() != 0) return false;
+ }
+ return true;
+}
+
+bool RawBitReader::SkipBytes(size_t num_bytes) {
+ // If we are not at a byte boundary, return false.
+ return ((bit_offset_ & 7) != 0) ? false : SkipBits(num_bytes * 8);
+}
+
+bool RawBitReader::SkipBits(size_t num_bits) {
+ // If the reader is already finished, return false.
+ if (Finished()) return false;
+ // If skipping |num_bits| runs out of buffer, return false.
+ const size_t bit_offset = bit_offset_ + num_bits - 1;
+ if (DivideBy8(bit_offset, false) >= size_) return false;
+ bit_offset_ += num_bits;
+ return true;
+}
+
+bool RawBitReader::CanReadLiteral(size_t num_bits) const {
+ if (Finished()) return false;
+ const size_t bit_offset = bit_offset_ + num_bits - 1;
+ return DivideBy8(bit_offset, false) < size_;
+}
+
+bool RawBitReader::Finished() const {
+ return DivideBy8(bit_offset_, false) >= size_;
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
+#define LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "src/utils/bit_reader.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
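+// Reads bits MSB-first from a caller-owned buffer; the buffer must outlive
+// the reader. A minimal usage sketch:
+//   const uint8_t data[] = {0x80};
+//   RawBitReader reader(data, sizeof(data));
+//   const int bit = reader.ReadBit();  // 1; returns -1 past the end.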
+class RawBitReader final : public BitReader, public Allocable {
+ public:
+ RawBitReader(const uint8_t* data, size_t size);
+ ~RawBitReader() override = default;
+
+ int ReadBit() override;
+ int64_t ReadLiteral(int num_bits) override; // f(n) in the spec.
+ bool ReadInverseSignedLiteral(int num_bits,
+ int* value); // su(1+num_bits) in the spec.
+ bool ReadLittleEndian(int num_bytes,
+ size_t* value); // le(n) in the spec.
+ bool ReadUnsignedLeb128(size_t* value); // leb128() in the spec.
+ // Reads a variable length unsigned number and stores it in |*value|. On a
+ // successful return, |*value| is in the range of 0 to UINT32_MAX - 1,
+ // inclusive.
+ bool ReadUvlc(uint32_t* value); // uvlc() in the spec.
+ bool Finished() const;
+ size_t bit_offset() const { return bit_offset_; }
+ // Return the bytes consumed so far (rounded up).
+ size_t byte_offset() const { return (bit_offset() + 7) >> 3; }
+ size_t size() const { return size_; }
+ // Move to the next byte boundary if not already at one. Return false if any
+ // of the bits being skipped over is non-zero. Return true otherwise. If this
+ // function returns false, the reader is left in an undefined state and must
+ // not be used further. section 5.3.5.
+ bool AlignToNextByte();
+ // Make sure that the trailing bits structure is as expected and skip over it.
+ // section 5.3.4.
+ bool VerifyAndSkipTrailingBits(size_t num_bits);
+ // Skip |num_bytes| bytes. This only works if the current position is at a
+ // byte boundary. The function returns false if the current position is not at
+ // a byte boundary or if skipping |num_bytes| causes the reader to run out of
+ // buffer. Returns true otherwise.
+ bool SkipBytes(size_t num_bytes);
+ // Skip |num_bits| bits. The function returns false if skipping |num_bits|
+ // causes the reader to run out of buffer. Returns true otherwise.
+ bool SkipBits(size_t num_bits);
+
+ private:
+ // Returns true if it is safe to read a literal of size |num_bits|.
+ bool CanReadLiteral(size_t num_bits) const;
+ int ReadBitImpl();
+
+ const uint8_t* const data_;
+ size_t bit_offset_;
+ const size_t size_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/raw_bit_reader.h"
+
+#include <bitset>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "src/utils/constants.h"
+#include "tests/third_party/libvpx/acm_random.h"
+
+namespace libgav1 {
+namespace {
+
+std::string IntegerToString(int x) { return std::bitset<8>(x).to_string(); }
+
+class RawBitReaderTest : public testing::TestWithParam<std::tuple<int, int>> {
+ protected:
+ RawBitReaderTest()
+ : literal_size_(std::get<0>(GetParam())),
+ test_data_size_(std::get<1>(GetParam())) {}
+
+ void CreateReader(const std::vector<uint8_t>& data) {
+ data_ = data;
+ raw_bit_reader_.reset(new (std::nothrow)
+ RawBitReader(data_.data(), data_.size()));
+ }
+
+ void CreateReader(int size) {
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ data_.clear();
+ for (int i = 0; i < size; ++i) {
+ data_.push_back(rnd.Rand8());
+ }
+ raw_bit_reader_.reset(new (std::nothrow)
+ RawBitReader(data_.data(), data_.size()));
+ }
+
+ // Some tests don't depend on |literal_size_|. For those tests, this returns
+ // true when |literal_size_| is greater than 1 so that the test body returns
+ // early and effectively runs only once across the parameter sweep.
+ bool RunOnlyOnce() const { return literal_size_ > 1; }
+
+ std::unique_ptr<RawBitReader> raw_bit_reader_;
+ std::vector<uint8_t> data_;
+ int literal_size_;
+ int test_data_size_;
+};
+
+TEST_P(RawBitReaderTest, ReadBit) {
+ if (RunOnlyOnce()) return;
+ CreateReader(test_data_size_);
+ for (const auto& value : data_) {
+ const std::string expected = IntegerToString(value);
+ for (int j = 0; j < 8; ++j) {
+ EXPECT_FALSE(raw_bit_reader_->Finished());
+ EXPECT_EQ(static_cast<int>(expected[j] == '1'),
+ raw_bit_reader_->ReadBit());
+ }
+ }
+ EXPECT_TRUE(raw_bit_reader_->Finished());
+ EXPECT_EQ(raw_bit_reader_->ReadBit(), -1);
+}
+
+TEST_P(RawBitReaderTest, ReadLiteral) {
+ const int size_bytes = literal_size_;
+ const int size_bits = 8 * size_bytes;
+ CreateReader(test_data_size_ * size_bytes);
+ for (size_t i = 0; i < data_.size(); i += size_bytes) {
+ uint32_t expected_literal = 0;
+ for (int j = 0; j < size_bytes; ++j) {
+ expected_literal |=
+ static_cast<uint32_t>(data_[i + j] << (8 * (size_bytes - j - 1)));
+ }
+ EXPECT_FALSE(raw_bit_reader_->Finished());
+ const int64_t actual_literal = raw_bit_reader_->ReadLiteral(size_bits);
+ EXPECT_EQ(static_cast<int64_t>(expected_literal), actual_literal);
+ EXPECT_GE(actual_literal, 0);
+ }
+ EXPECT_TRUE(raw_bit_reader_->Finished());
+ EXPECT_EQ(raw_bit_reader_->ReadLiteral(10), -1);
+}
+
+TEST_P(RawBitReaderTest, ReadLiteral32BitsWithMsbSet) {
+ if (RunOnlyOnce()) return;
+ // Three 32-bit values with MSB set.
+ CreateReader({0xff, 0xff, 0xff, 0xff, // 4294967295
+ 0x80, 0xff, 0xee, 0xdd, // 2164256477
+ 0xa0, 0xaa, 0xbb, 0xcc}); // 2695543756
+ static constexpr int64_t expected_literals[] = {4294967295, 2164256477,
+ 2695543756};
+ for (const int64_t expected_literal : expected_literals) {
+ EXPECT_FALSE(raw_bit_reader_->Finished());
+ const int64_t actual_literal = raw_bit_reader_->ReadLiteral(32);
+ EXPECT_EQ(expected_literal, actual_literal);
+ EXPECT_GE(actual_literal, 0);
+ }
+ EXPECT_TRUE(raw_bit_reader_->Finished());
+ EXPECT_EQ(raw_bit_reader_->ReadLiteral(10), -1);
+}
+
+TEST_P(RawBitReaderTest, ReadLiteralNotEnoughBits) {
+ if (RunOnlyOnce()) return;
+ CreateReader(4); // 32 bits.
+ EXPECT_GE(raw_bit_reader_->ReadLiteral(16), 0);
+ EXPECT_EQ(raw_bit_reader_->ReadLiteral(32), -1);
+}
+
+TEST_P(RawBitReaderTest, ReadLiteralMaxNumBits) {
+ if (RunOnlyOnce()) return;
+ CreateReader(4); // 32 bits.
+ EXPECT_NE(raw_bit_reader_->ReadLiteral(32), -1);
+}
+
+TEST_P(RawBitReaderTest, ReadInverseSignedLiteral) {
+ if (RunOnlyOnce()) return;
+ // This is the only use of this function in the decoding process, so only
+ // that case is tested.
+ const int size_bits = 6;
+ data_.clear();
+ // Negative value followed by a positive value.
+ data_.push_back(0xd2);
+ data_.push_back(0xa4);
+ raw_bit_reader_.reset(new (std::nothrow)
+ RawBitReader(data_.data(), data_.size()));
+ int value;
+ EXPECT_TRUE(raw_bit_reader_->ReadInverseSignedLiteral(size_bits, &value));
+ EXPECT_EQ(value, -23);
+ EXPECT_TRUE(raw_bit_reader_->ReadInverseSignedLiteral(size_bits, &value));
+ EXPECT_EQ(value, 41);
+ // Only two bits remain, but reading an inverse signed literal of 2 bits
+ // actually requires 3 bits, so this should fail.
+ EXPECT_FALSE(raw_bit_reader_->ReadInverseSignedLiteral(2, &value));
+}
+
+TEST_P(RawBitReaderTest, ZeroSize) {
+ if (RunOnlyOnce()) return;
+ // Valid data, zero size.
+ data_.clear();
+ data_.push_back(0xf0);
+ raw_bit_reader_.reset(new (std::nothrow) RawBitReader(data_.data(), 0));
+ EXPECT_EQ(raw_bit_reader_->ReadBit(), -1);
+ EXPECT_EQ(raw_bit_reader_->ReadLiteral(2), -1);
+ // NULL data, zero size.
+ raw_bit_reader_.reset(new (std::nothrow) RawBitReader(nullptr, 0));
+ EXPECT_EQ(raw_bit_reader_->ReadBit(), -1);
+ EXPECT_EQ(raw_bit_reader_->ReadLiteral(2), -1);
+}
+
+TEST_P(RawBitReaderTest, AlignToNextByte) {
+ if (RunOnlyOnce()) return;
+ CreateReader({0x00, 0x00, 0x00, 0x0f});
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 0);
+ EXPECT_EQ(raw_bit_reader_->byte_offset(), 0);
+ EXPECT_TRUE(raw_bit_reader_->AlignToNextByte());
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 0);
+ EXPECT_EQ(raw_bit_reader_->byte_offset(), 0);
+ EXPECT_NE(raw_bit_reader_->ReadBit(), -1);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 1);
+ EXPECT_EQ(raw_bit_reader_->byte_offset(), 1);
+ EXPECT_TRUE(raw_bit_reader_->AlignToNextByte());
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+ EXPECT_EQ(raw_bit_reader_->byte_offset(), 1);
+ EXPECT_NE(raw_bit_reader_->ReadLiteral(16), -1);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 24);
+ EXPECT_EQ(raw_bit_reader_->byte_offset(), 3);
+ EXPECT_TRUE(raw_bit_reader_->AlignToNextByte());
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 24);
+ EXPECT_EQ(raw_bit_reader_->byte_offset(), 3);
+ EXPECT_NE(raw_bit_reader_->ReadBit(), -1);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 25);
+ EXPECT_EQ(raw_bit_reader_->byte_offset(), 4);
+ // Some bits are non-zero.
+ EXPECT_FALSE(raw_bit_reader_->AlignToNextByte());
+}
+
+TEST_P(RawBitReaderTest, VerifyAndSkipTrailingBits) {
+ if (RunOnlyOnce()) return;
+ std::vector<uint8_t> data;
+
+ // A single trailing byte.
+ data.push_back(0x80);
+ CreateReader(data);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 0);
+ EXPECT_TRUE(raw_bit_reader_->VerifyAndSkipTrailingBits(8));
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+
+ // A trailing byte preceded by one data byte, beginning at a byte-aligned
+ // offset.
+ data.clear();
+ data.push_back(0xf8);
+ data.push_back(0x80);
+ CreateReader(data);
+ EXPECT_NE(raw_bit_reader_->ReadLiteral(8), -1);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+ EXPECT_TRUE(raw_bit_reader_->VerifyAndSkipTrailingBits(8));
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 16);
+
+ // Trailing bits beginning at a non-byte-aligned offset.
+ data.clear();
+ data.push_back(0xf8);
+ data.push_back(0x00);
+ CreateReader(data);
+ EXPECT_NE(raw_bit_reader_->ReadLiteral(4), -1);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 4);
+ EXPECT_TRUE(raw_bit_reader_->VerifyAndSkipTrailingBits(4));
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+
+ // Invalid trailing byte at a byte-aligned offset.
+ data.clear();
+ data.push_back(0xf7);
+ data.push_back(0x70);
+ CreateReader(data);
+ EXPECT_NE(raw_bit_reader_->ReadLiteral(8), -1);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+ EXPECT_FALSE(raw_bit_reader_->VerifyAndSkipTrailingBits(8));
+
+ // Invalid trailing byte at a non-byte-aligned offset.
+ CreateReader(data);
+ EXPECT_NE(raw_bit_reader_->ReadLiteral(4), -1);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 4);
+ EXPECT_FALSE(raw_bit_reader_->VerifyAndSkipTrailingBits(12));
+
+ // No more data available.
+ CreateReader(data);
+ EXPECT_NE(raw_bit_reader_->ReadLiteral(16), -1);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 16);
+ EXPECT_TRUE(raw_bit_reader_->Finished());
+ EXPECT_FALSE(raw_bit_reader_->VerifyAndSkipTrailingBits(8));
+}
+
+TEST_P(RawBitReaderTest, ReadLittleEndian) {
+ if (RunOnlyOnce()) return;
+ std::vector<uint8_t> data;
+ size_t actual;
+
+ // Invalid input.
+ data.push_back(0x00); // dummy.
+ CreateReader(data);
+ EXPECT_FALSE(raw_bit_reader_->ReadLittleEndian(1, nullptr));
+
+ // One byte value.
+ data.clear();
+ data.push_back(0x01);
+ CreateReader(data);
+ ASSERT_TRUE(raw_bit_reader_->ReadLittleEndian(1, &actual));
+ EXPECT_EQ(actual, 1);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+ EXPECT_TRUE(raw_bit_reader_->Finished());
+
+ // One byte value with leading bytes.
+ data.clear();
+ data.push_back(0x01);
+ data.push_back(0x00);
+ data.push_back(0x00);
+ data.push_back(0x00);
+ CreateReader(data);
+ ASSERT_TRUE(raw_bit_reader_->ReadLittleEndian(4, &actual));
+ EXPECT_EQ(actual, 1);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 32);
+ EXPECT_TRUE(raw_bit_reader_->Finished());
+
+ // Two byte value.
+ data.clear();
+ data.push_back(0xD9);
+ data.push_back(0x01);
+ CreateReader(data);
+ ASSERT_TRUE(raw_bit_reader_->ReadLittleEndian(2, &actual));
+ EXPECT_EQ(actual, 473);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 16);
+ EXPECT_TRUE(raw_bit_reader_->Finished());
+
+ // Two byte value with leading bytes.
+ data.clear();
+ data.push_back(0xD9);
+ data.push_back(0x01);
+ data.push_back(0x00);
+ data.push_back(0x00);
+ CreateReader(data);
+ ASSERT_TRUE(raw_bit_reader_->ReadLittleEndian(4, &actual));
+ EXPECT_EQ(actual, 473);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 32);
+ EXPECT_TRUE(raw_bit_reader_->Finished());
+
+ // Not enough bytes.
+ data.clear();
+ data.push_back(0x01);
+ CreateReader(data);
+ EXPECT_FALSE(raw_bit_reader_->ReadLittleEndian(2, &actual));
+}
+
+TEST_P(RawBitReaderTest, ReadUnsignedLeb128) {
+ if (RunOnlyOnce()) return;
+ std::vector<uint8_t> data;
+ size_t actual;
+
+ // Invalid input.
+ data.push_back(0x00); // dummy.
+ CreateReader(data);
+ EXPECT_FALSE(raw_bit_reader_->ReadUnsignedLeb128(nullptr));
+
+ // One byte value.
+ data.clear();
+ data.push_back(0x01);
+ CreateReader(data);
+ ASSERT_TRUE(raw_bit_reader_->ReadUnsignedLeb128(&actual));
+ EXPECT_EQ(actual, 1);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+ EXPECT_TRUE(raw_bit_reader_->Finished());
+
+ // One byte value with trailing bytes.
+ data.clear();
+ data.push_back(0x81);
+ data.push_back(0x80);
+ data.push_back(0x80);
+ data.push_back(0x00);
+ CreateReader(data);
+ ASSERT_TRUE(raw_bit_reader_->ReadUnsignedLeb128(&actual));
+ EXPECT_EQ(actual, 1);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 32);
+ EXPECT_TRUE(raw_bit_reader_->Finished());
+
+ // Two byte value.
+ data.clear();
+ data.push_back(0xD9);
+ data.push_back(0x01);
+ CreateReader(data);
+ ASSERT_TRUE(raw_bit_reader_->ReadUnsignedLeb128(&actual));
+ EXPECT_EQ(actual, 217);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 16);
+ EXPECT_TRUE(raw_bit_reader_->Finished());
+
+ // Two byte value with trailing bytes.
+ data.clear();
+ data.push_back(0xD9);
+ data.push_back(0x81);
+ data.push_back(0x80);
+ data.push_back(0x80);
+ data.push_back(0x00);
+ CreateReader(data);
+ ASSERT_TRUE(raw_bit_reader_->ReadUnsignedLeb128(&actual));
+ EXPECT_EQ(actual, 217);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 40);
+ EXPECT_TRUE(raw_bit_reader_->Finished());
+
+ // Value > 32 bits.
+ data.clear();
+ for (int i = 0; i < 5; ++i) data.push_back(0xD9);
+ data.push_back(0x00);
+ CreateReader(data);
+ EXPECT_FALSE(raw_bit_reader_->ReadUnsignedLeb128(&actual));
+
+ // Not enough bytes (truncated leb128 value).
+ data.clear();
+ data.push_back(0x81);
+ data.push_back(0x81);
+ data.push_back(0x81);
+ CreateReader(data);
+ EXPECT_FALSE(raw_bit_reader_->ReadUnsignedLeb128(&actual));
+
+ // Exceeds kMaximumLeb128Size.
+ data.clear();
+ for (int i = 0; i < 10; ++i) data.push_back(0x80);
+ CreateReader(data);
+ EXPECT_FALSE(raw_bit_reader_->ReadUnsignedLeb128(&actual));
+}
+
+TEST_P(RawBitReaderTest, ReadUvlc) {
+ if (RunOnlyOnce()) return;
+ std::vector<uint8_t> data;
+ uint32_t actual;
+
+ // Invalid input.
+ data.push_back(0x00); // dummy.
+ CreateReader(data);
+ EXPECT_FALSE(raw_bit_reader_->ReadUvlc(nullptr));
+
+ // Zero bit value.
+ data.clear();
+ data.push_back(0x80);
+ CreateReader(data);
+ ASSERT_TRUE(raw_bit_reader_->ReadUvlc(&actual));
+ EXPECT_EQ(actual, 0);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 1);
+
+ // One bit value.
+ data.clear();
+ data.push_back(0x60); // 011...
+ CreateReader(data);
+ ASSERT_TRUE(raw_bit_reader_->ReadUvlc(&actual));
+ EXPECT_EQ(actual, 2);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 3);
+
+ // Two bit value.
+ data.clear();
+ data.push_back(0x38); // 00111...
+ CreateReader(data);
+ ASSERT_TRUE(raw_bit_reader_->ReadUvlc(&actual));
+ EXPECT_EQ(actual, 6);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 5);
+
+ // 31 bit value.
+ data.clear();
+ // (1 << 32) - 2 (= UINT32_MAX - 1) is the largest value that can be encoded
+ // as uvlc().
+ data.push_back(0x00);
+ data.push_back(0x00);
+ data.push_back(0x00);
+ data.push_back(0x01);
+ data.push_back(0xFF);
+ data.push_back(0xFF);
+ data.push_back(0xFF);
+ data.push_back(0xFE);
+ CreateReader(data);
+ ASSERT_TRUE(raw_bit_reader_->ReadUvlc(&actual));
+ EXPECT_EQ(actual, UINT32_MAX - 1);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 63);
+
+ // Not enough bits (truncated uvlc value).
+ data.clear();
+ data.push_back(0x07);
+ CreateReader(data);
+ EXPECT_FALSE(raw_bit_reader_->ReadUvlc(&actual));
+
+ // 32 bits.
+ data.clear();
+ data.push_back(0x00);
+ data.push_back(0x00);
+ data.push_back(0x00);
+ data.push_back(0x00);
+ data.push_back(0xFF);
+ CreateReader(data);
+ EXPECT_FALSE(raw_bit_reader_->ReadUvlc(&actual));
+
+ // Exceeds 32 bits.
+ data.clear();
+ data.push_back(0x00);
+ data.push_back(0x00);
+ data.push_back(0x00);
+ data.push_back(0x00);
+ data.push_back(0x0F);
+ CreateReader(data);
+ EXPECT_FALSE(raw_bit_reader_->ReadUvlc(&actual));
+}
+
+TEST_P(RawBitReaderTest, DecodeSignedSubexpWithReference) {
+ if (RunOnlyOnce()) return;
+ std::vector<uint8_t> data;
+ int actual;
+
+ data.push_back(0xa0); // v = 5;
+ CreateReader(data);
+ EXPECT_TRUE(raw_bit_reader_->DecodeSignedSubexpWithReference(
+ 10, 20, 15, kGlobalMotionReadControl, &actual));
+ EXPECT_EQ(actual, 12);
+
+ data.clear();
+ data.push_back(0xd0); // v = 6; extra_bit = 1;
+ CreateReader(data);
+ EXPECT_TRUE(raw_bit_reader_->DecodeSignedSubexpWithReference(
+ 10, 20, 15, kGlobalMotionReadControl, &actual));
+ EXPECT_EQ(actual, 11);
+
+ data.clear();
+ data.push_back(0xc8); // subexp_more_bits = 1; v = 9;
+ CreateReader(data);
+ EXPECT_TRUE(raw_bit_reader_->DecodeSignedSubexpWithReference(
+ 10, 40, 15, kGlobalMotionReadControl, &actual));
+ EXPECT_EQ(actual, 27);
+
+ data.clear();
+ data.push_back(0x60); // subexp_more_bits = 0; subexp_bits = 6.
+ CreateReader(data);
+ EXPECT_TRUE(raw_bit_reader_->DecodeSignedSubexpWithReference(
+ 10, 40, 15, kGlobalMotionReadControl, &actual));
+ EXPECT_EQ(actual, 18);
+
+ data.clear();
+ data.push_back(0x60);
+ CreateReader(data);
+ // The control value is greater than 32, which makes b >= 32 in
+ // DecodeSubexp(), so the call should return false.
+ EXPECT_FALSE(raw_bit_reader_->DecodeSignedSubexpWithReference(10, 40, 15, 35,
+ &actual));
+}
+
+TEST_P(RawBitReaderTest, DecodeUniform) {
+ if (RunOnlyOnce()) return;
+ // Test the example from the AV1 spec, Section 4.10.7. ns(n).
+ // n = 5
+ // Value ns(n) encoding
+ // -------------------------------
+ // 0 00
+ // 1 01
+ // 2 10
+ // 3 110
+ // 4 111
+ //
+ // The five encoded values are concatenated into two bytes.
+ std::vector<uint8_t> data = {0x1b, 0x70};
+ CreateReader(data);
+ int actual;
+ for (int i = 0; i < 5; ++i) {
+ EXPECT_TRUE(raw_bit_reader_->DecodeUniform(5, &actual));
+ EXPECT_EQ(actual, i);
+ }
+
+ // If n is a power of 2, ns(n) is simply the log2(n)-bit representation of
+ // the unsigned number.
+ // Test n = 16.
+ // The 16 encoded values are concatenated into 8 bytes.
+ data = {0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef};
+ CreateReader(data);
+ for (int i = 0; i < 16; ++i) {
+ EXPECT_TRUE(raw_bit_reader_->DecodeUniform(16, &actual));
+ EXPECT_EQ(actual, i);
+ }
+}
+
+TEST_P(RawBitReaderTest, SkipBytes) {
+ if (RunOnlyOnce()) return;
+ std::vector<uint8_t> data = {0x00, 0x00, 0x00, 0x00, 0x00};
+ CreateReader(data);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 0);
+ EXPECT_TRUE(raw_bit_reader_->SkipBytes(1));
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+ EXPECT_GE(raw_bit_reader_->ReadBit(), 0);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 9);
+ EXPECT_FALSE(raw_bit_reader_->SkipBytes(1)); // Not at a byte boundary.
+ EXPECT_TRUE(raw_bit_reader_->AlignToNextByte());
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 16);
+ EXPECT_FALSE(raw_bit_reader_->SkipBytes(10)); // Not enough bytes.
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 16);
+ EXPECT_TRUE(raw_bit_reader_->SkipBytes(3));
+ EXPECT_TRUE(raw_bit_reader_->Finished());
+ EXPECT_EQ(raw_bit_reader_->ReadBit(), -1);
+}
+
+TEST_P(RawBitReaderTest, SkipBits) {
+ if (RunOnlyOnce()) return;
+ std::vector<uint8_t> data = {0x00, 0x00, 0x00, 0x00, 0x00};
+ CreateReader(data);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 0);
+ EXPECT_TRUE(raw_bit_reader_->SkipBits(8));
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 8);
+ EXPECT_GE(raw_bit_reader_->ReadBit(), 0);
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 9);
+ EXPECT_TRUE(raw_bit_reader_->SkipBits(10)); // Not at a byte boundary.
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 19);
+ EXPECT_FALSE(raw_bit_reader_->SkipBits(80)); // Not enough bytes.
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 19);
+ EXPECT_TRUE(raw_bit_reader_->SkipBits(21));
+ EXPECT_TRUE(raw_bit_reader_->Finished());
+ EXPECT_EQ(raw_bit_reader_->ReadBit(), -1);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ RawBitReaderTestInstance, RawBitReaderTest,
+ testing::Combine(testing::Range(1, 5), // literal size.
+ testing::Values(100))); // number of bits/literals.
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
+#define LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// This struct collects some members related to reference frames in one place to
+// make it easier to pass them as parameters to some dsp functions.
+struct ReferenceInfo {
+ // Initialize |motion_field_reference_frame| so that
+ // Tile::StoreMotionFieldMvsIntoCurrentFrame() can skip some updates when
+ // the updates are the same as the initialized value.
+ // Set to kReferenceFrameIntra instead of kReferenceFrameNone to simplify
+ // branch conditions in motion field projection.
+ // The following initialization of contiguous memory is very fast. Making the
+ // initialization multi-threaded is not recommended unless the memory
+ // initialized by each thread is still contiguous.
+ LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns) {
+ return motion_field_reference_frame.Reset(rows, columns,
+ /*zero_initialize=*/true) &&
+ motion_field_mv.Reset(
+ rows, columns,
+#if LIBGAV1_MSAN
+ // It is set in Tile::StoreMotionFieldMvsIntoCurrentFrame() only
+ // for qualified blocks. In MotionFieldProjectionKernel() dsp
+ // optimizations, it is read whether or not it was set.
+ /*zero_initialize=*/true
+#else
+ /*zero_initialize=*/false
+#endif
+ );
+ }
+
+ // All members are used by inter frames only.
+ // For intra frames, they are not initialized.
+
+ std::array<uint8_t, kNumReferenceFrameTypes> order_hint;
+
+ // An example when |relative_distance_from| does not equal
+ // -|relative_distance_to|:
+ // |relative_distance_from| = GetRelativeDistance(7, 71, 25) = -64
+ // -|relative_distance_to| = -GetRelativeDistance(71, 7, 25) = 64
+ // This is why we need both |relative_distance_from| and
+ // |relative_distance_to|.
+ // |relative_distance_from|: Relative distances from reference frames to this
+ // frame.
+ std::array<int8_t, kNumReferenceFrameTypes> relative_distance_from;
+ // |relative_distance_to|: Relative distances to reference frames.
+ std::array<int8_t, kNumReferenceFrameTypes> relative_distance_to;
+
+ // Skip motion field projection of specific types of frames if their
+ // |relative_distance_to| is negative or too large.
+ std::array<bool, kNumReferenceFrameTypes> skip_references;
+ // Lookup table of motion field projection division multipliers for specific
+ // frame types. Derived from kProjectionMvDivisionLookup.
+ std::array<int16_t, kNumReferenceFrameTypes> projection_divisions;
+
+ // The current frame's |motion_field_reference_frame| and |motion_field_mv|
+ // are guaranteed to be allocated only when refresh_frame_flags is not 0.
+ // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
+ // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec.
+ Array2D<ReferenceFrameType> motion_field_reference_frame;
+ // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
+ // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec.
+ Array2D<MotionVector> motion_field_mv;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/segmentation.h"
+
+namespace libgav1 {
+
+const int8_t kSegmentationFeatureBits[kSegmentFeatureMax] = {8, 6, 6, 6,
+ 6, 3, 0, 0};
+const int kSegmentationFeatureMaxValues[kSegmentFeatureMax] = {
+ 255,
+ kMaxLoopFilterValue,
+ kMaxLoopFilterValue,
+ kMaxLoopFilterValue,
+ kMaxLoopFilterValue,
+ 7,
+ 0,
+ 0};
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_H_
+#define LIBGAV1_SRC_UTILS_SEGMENTATION_H_
+
+#include <cstdint>
+
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+extern const int8_t kSegmentationFeatureBits[kSegmentFeatureMax];
+extern const int kSegmentationFeatureMaxValues[kSegmentFeatureMax];
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_SEGMENTATION_H_
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/segmentation_map.h"
+
+#include <cassert>
+#include <cstring>
+#include <new>
+
+namespace libgav1 {
+
+bool SegmentationMap::Allocate(int32_t rows4x4, int32_t columns4x4) {
+ if (rows4x4 * columns4x4 > rows4x4_ * columns4x4_) {
+ segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4 * columns4x4]);
+ }
+
+ rows4x4_ = rows4x4;
+ columns4x4_ = columns4x4;
+ if (segment_id_buffer_ == nullptr) return false;
+ segment_id_.Reset(rows4x4_, columns4x4_, segment_id_buffer_.get());
+ return true;
+}
+
+void SegmentationMap::Clear() {
+ memset(segment_id_buffer_.get(), 0, rows4x4_ * columns4x4_);
+}
+
+void SegmentationMap::CopyFrom(const SegmentationMap& from) {
+ assert(rows4x4_ == from.rows4x4_ && columns4x4_ == from.columns4x4_);
+ memcpy(segment_id_buffer_.get(), from.segment_id_buffer_.get(),
+ rows4x4_ * columns4x4_);
+}
+
+void SegmentationMap::FillBlock(int row4x4, int column4x4, int block_width4x4,
+ int block_height4x4, int8_t segment_id) {
+ for (int y = 0; y < block_height4x4; ++y) {
+ memset(&segment_id_[row4x4 + y][column4x4], segment_id, block_width4x4);
+ }
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
+#define LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// SegmentationMap stores the segment id associated with each 4x4 block in the
+// frame.
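+//
+// Typical usage (sketch): after a successful Allocate(), call Clear() to
+// zero-initialize the map (Allocate() leaves the buffer uninitialized), then
+// use FillBlock() or CopyFrom() to populate it.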
+class SegmentationMap {
+ public:
+ SegmentationMap() = default;
+
+ // Not copyable or movable
+ SegmentationMap(const SegmentationMap&) = delete;
+ SegmentationMap& operator=(const SegmentationMap&) = delete;
+
+ // Allocates an internal buffer of the given dimensions to hold the
+ // segmentation map. The memory in the buffer is not initialized. Returns
+ // true on success, false on failure (for example, out of memory).
+ LIBGAV1_MUST_USE_RESULT bool Allocate(int32_t rows4x4, int32_t columns4x4);
+
+ int8_t segment_id(int row4x4, int column4x4) const {
+ return segment_id_[row4x4][column4x4];
+ }
+
+ // Sets every element in the segmentation map to 0.
+ void Clear();
+
+ // Copies the entire segmentation map. |from| must be of the same dimensions.
+ void CopyFrom(const SegmentationMap& from);
+
+ // Sets the region of segmentation map covered by the block to |segment_id|.
+ // The block is located at |row4x4|, |column4x4| and has dimensions
+ // |block_width4x4| and |block_height4x4|.
+ void FillBlock(int row4x4, int column4x4, int block_width4x4,
+ int block_height4x4, int8_t segment_id);
+
+ private:
+ int32_t rows4x4_ = 0;
+ int32_t columns4x4_ = 0;
+
+ // segment_id_ is a rows4x4_ by columns4x4_ 2D array. The underlying data
+ // buffer is dynamically allocated and owned by segment_id_buffer_.
+ std::unique_ptr<int8_t[]> segment_id_buffer_;
+ Array2DView<int8_t> segment_id_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/segmentation_map.h"
+
+#include <cstdint>
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(SegmentationMapTest, Clear) {
+ constexpr int32_t kRows4x4 = 60;
+ constexpr int32_t kColumns4x4 = 80;
+ SegmentationMap segmentation_map;
+ ASSERT_TRUE(segmentation_map.Allocate(kRows4x4, kColumns4x4));
+
+ segmentation_map.Clear();
+ for (int row4x4 = 0; row4x4 < kRows4x4; ++row4x4) {
+ for (int column4x4 = 0; column4x4 < kColumns4x4; ++column4x4) {
+ EXPECT_EQ(segmentation_map.segment_id(row4x4, column4x4), 0);
+ }
+ }
+}
+
+TEST(SegmentationMapTest, FillBlock) {
+ constexpr int32_t kRows4x4 = 60;
+ constexpr int32_t kColumns4x4 = 80;
+ SegmentationMap segmentation_map;
+ ASSERT_TRUE(segmentation_map.Allocate(kRows4x4, kColumns4x4));
+
+ // Fill the whole image with 2.
+ segmentation_map.FillBlock(0, 0, kColumns4x4, kRows4x4, 2);
+ // Fill a block with 1.
+ constexpr int kBlockWidth4x4 = 10;
+ constexpr int kBlockHeight4x4 = 20;
+ segmentation_map.FillBlock(4, 6, kBlockWidth4x4, kBlockHeight4x4, 1);
+ for (int row4x4 = 0; row4x4 < kRows4x4; ++row4x4) {
+ for (int column4x4 = 0; column4x4 < kColumns4x4; ++column4x4) {
+ if (4 <= row4x4 && row4x4 < 4 + kBlockHeight4x4 && 6 <= column4x4 &&
+ column4x4 < 6 + kBlockWidth4x4) {
+ // Inside the block.
+ EXPECT_EQ(segmentation_map.segment_id(row4x4, column4x4), 1);
+ } else {
+ // Outside the block.
+ EXPECT_EQ(segmentation_map.segment_id(row4x4, column4x4), 2);
+ }
+ }
+ }
+}
+
+TEST(SegmentationMapTest, CopyFrom) {
+ constexpr int32_t kRows4x4 = 60;
+ constexpr int32_t kColumns4x4 = 80;
+ SegmentationMap segmentation_map;
+ ASSERT_TRUE(segmentation_map.Allocate(kRows4x4, kColumns4x4));
+
+ // Split the segmentation map into four blocks of equal size.
+ constexpr int kBlockWidth4x4 = 40;
+ constexpr int kBlockHeight4x4 = 30;
+ segmentation_map.FillBlock(0, 0, kBlockWidth4x4, kBlockHeight4x4, 1);
+ segmentation_map.FillBlock(0, kBlockWidth4x4, kBlockWidth4x4, kBlockHeight4x4,
+ 2);
+ segmentation_map.FillBlock(kBlockHeight4x4, 0, kBlockWidth4x4,
+ kBlockHeight4x4, 3);
+ segmentation_map.FillBlock(kBlockHeight4x4, kBlockWidth4x4, kBlockWidth4x4,
+ kBlockHeight4x4, 4);
+
+ SegmentationMap segmentation_map2;
+ ASSERT_TRUE(segmentation_map2.Allocate(kRows4x4, kColumns4x4));
+ segmentation_map2.CopyFrom(segmentation_map);
+
+ for (int row4x4 = 0; row4x4 < kBlockHeight4x4; ++row4x4) {
+ for (int column4x4 = 0; column4x4 < kBlockWidth4x4; ++column4x4) {
+ EXPECT_EQ(segmentation_map.segment_id(row4x4, column4x4), 1);
+ EXPECT_EQ(segmentation_map2.segment_id(row4x4, column4x4), 1);
+ }
+ }
+ for (int row4x4 = 0; row4x4 < kBlockHeight4x4; ++row4x4) {
+ for (int column4x4 = 0; column4x4 < kBlockWidth4x4; ++column4x4) {
+ EXPECT_EQ(segmentation_map.segment_id(row4x4, kBlockWidth4x4 + column4x4),
+ 2);
+ EXPECT_EQ(
+ segmentation_map2.segment_id(row4x4, kBlockWidth4x4 + column4x4), 2);
+ }
+ }
+ for (int row4x4 = 0; row4x4 < kBlockHeight4x4; ++row4x4) {
+ for (int column4x4 = 0; column4x4 < kBlockWidth4x4; ++column4x4) {
+ EXPECT_EQ(
+ segmentation_map.segment_id(kBlockHeight4x4 + row4x4, column4x4), 3);
+ EXPECT_EQ(
+ segmentation_map2.segment_id(kBlockHeight4x4 + row4x4, column4x4), 3);
+ }
+ }
+ for (int row4x4 = 0; row4x4 < kBlockHeight4x4; ++row4x4) {
+ for (int column4x4 = 0; column4x4 < kBlockWidth4x4; ++column4x4) {
+ EXPECT_EQ(segmentation_map.segment_id(kBlockHeight4x4 + row4x4,
+ kBlockWidth4x4 + column4x4),
+ 4);
+ EXPECT_EQ(segmentation_map2.segment_id(kBlockHeight4x4 + row4x4,
+ kBlockWidth4x4 + column4x4),
+ 4);
+ }
+ }
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/segmentation.h"
+
+#include <cstdint>
+
+#include "gtest/gtest.h"
+#include "src/utils/common.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
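+// Returns the number of bits required to represent the unsigned value
+// |num_values|. For example, GetUnsignedBits(255) == 8.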
+int GetUnsignedBits(const unsigned int num_values) {
+ return (num_values > 0) ? FloorLog2(num_values) + 1 : 0;
+}
+
+// Check that kSegmentationFeatureBits and kSegmentationFeatureMaxValues are
+// consistent with each other.
+TEST(SegmentationTest, FeatureBitsAndMaxValuesConsistency) {
+ for (int feature = 0; feature < kSegmentFeatureMax; feature++) {
+ EXPECT_EQ(kSegmentationFeatureBits[feature],
+ GetUnsignedBits(kSegmentationFeatureMaxValues[feature]));
+ }
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_STACK_H_
+#define LIBGAV1_SRC_UTILS_STACK_H_
+
+#include <cassert>
+#include <utility>
+
+namespace libgav1 {
+
+// A LIFO stack of a fixed capacity. The elements are moved using std::move, so
+// the element type T has to be movable.
+//
+// WARNING: No error checking is performed.
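+//
+// Example usage:
+// Stack<int, 8> stack;
+// stack.Push(42);
+// const int value = stack.Pop(); // value == 42.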
+template <typename T, int capacity>
+class Stack {
+ public:
+ // Pushes the element |value| to the top of the stack. It is an error to call
+ // Push() when the stack is full.
+ void Push(T value) {
+ ++top_;
+ assert(top_ < capacity);
+ elements_[top_] = std::move(value);
+ }
+
+ // Returns the element at the top of the stack and removes it from the stack.
+ // It is an error to call Pop() when the stack is empty.
+ T Pop() {
+ assert(top_ >= 0);
+ return std::move(elements_[top_--]);
+ }
+
+ // Returns true if the stack is empty.
+ bool Empty() const { return top_ < 0; }
+
+ private:
+ static_assert(capacity > 0, "");
+ T elements_[capacity];
+ // The array index of the top of the stack. The stack is empty if top_ is -1.
+ int top_ = -1;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_STACK_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/stack.h"
+
+#include <cstdint>
+#include <utility>
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kStackSize = 8;
+
+TEST(StackTest, SimpleType) {
+ Stack<int, kStackSize> stack;
+ EXPECT_TRUE(stack.Empty());
+
+ for (int i = 0; i < kStackSize; ++i) {
+ stack.Push(i);
+ EXPECT_FALSE(stack.Empty());
+ }
+
+ for (int i = kStackSize - 1; i >= 0; --i) {
+ EXPECT_EQ(stack.Pop(), i);
+ }
+ EXPECT_TRUE(stack.Empty());
+}
+
+TEST(StackTest, LargeStruct) {
+ struct LargeMoveOnlyStruct {
+ LargeMoveOnlyStruct() = default;
+ // Move only.
+ LargeMoveOnlyStruct(LargeMoveOnlyStruct&& other) = default;
+ LargeMoveOnlyStruct& operator=(LargeMoveOnlyStruct&& other) = default;
+
+ int32_t array1[1000];
+ uint64_t array2[2000];
+ };
+
+ Stack<LargeMoveOnlyStruct, kStackSize> stack;
+ EXPECT_TRUE(stack.Empty());
+
+ LargeMoveOnlyStruct large_move_only_struct[kStackSize];
+ for (int i = 0; i < kStackSize; ++i) {
+ LargeMoveOnlyStruct& l = large_move_only_struct[i];
+ l.array1[0] = i;
+ l.array2[0] = i;
+ stack.Push(std::move(l));
+ EXPECT_FALSE(stack.Empty());
+ }
+
+ for (int i = kStackSize - 1; i >= 0; --i) {
+ LargeMoveOnlyStruct l = stack.Pop();
+ EXPECT_EQ(l.array1[0], i);
+ EXPECT_EQ(l.array2[0], i);
+ }
+ EXPECT_TRUE(stack.Empty());
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/threadpool.h"
+
+#if defined(_MSC_VER)
+#include <process.h>
+#include <windows.h>
+#else // defined(_MSC_VER)
+#include <pthread.h>
+#endif // defined(_MSC_VER)
+#if defined(__ANDROID__) || defined(__GLIBC__)
+#include <sys/types.h>
+#include <unistd.h>
+#endif
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <new>
+#include <utility>
+
+#if defined(__ANDROID__)
+#include <chrono> // NOLINT (unapproved c++11 header)
+#endif
+
+// Define the GetTid() function, a wrapper for the gettid() system call in
+// Linux.
+#if defined(__ANDROID__)
+static pid_t GetTid() { return gettid(); }
+#elif defined(__GLIBC__)
+// The glibc wrapper for the gettid() system call was added in glibc 2.30.
+// Emulate it for older versions of glibc.
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 30)
+static pid_t GetTid() { return gettid(); }
+#else // Older than glibc 2.30
+#include <sys/syscall.h>
+
+static pid_t GetTid() { return static_cast<pid_t>(syscall(SYS_gettid)); }
+#endif // glibc 2.30 or later.
+#endif // defined(__GLIBC__)
+
+namespace libgav1 {
+
+#if defined(__ANDROID__)
+namespace {
+
+using Clock = std::chrono::steady_clock;
+using Duration = Clock::duration;
+constexpr Duration kBusyWaitDuration =
+ std::chrono::duration_cast<Duration>(std::chrono::duration<double>(2e-3));
+
+} // namespace
+#endif // defined(__ANDROID__)
+
+// static
+std::unique_ptr<ThreadPool> ThreadPool::Create(int num_threads) {
+ return Create(/*name_prefix=*/"", num_threads);
+}
+
+// static
+std::unique_ptr<ThreadPool> ThreadPool::Create(const char name_prefix[],
+ int num_threads) {
+ if (name_prefix == nullptr || num_threads <= 0) return nullptr;
+ std::unique_ptr<WorkerThread*[]> threads(new (std::nothrow)
+ WorkerThread*[num_threads]);
+ if (threads == nullptr) return nullptr;
+ std::unique_ptr<ThreadPool> pool(new (std::nothrow) ThreadPool(
+ name_prefix, std::move(threads), num_threads));
+ if (pool != nullptr && !pool->StartWorkers()) {
+ pool = nullptr;
+ }
+ return pool;
+}
+
+ThreadPool::ThreadPool(const char name_prefix[],
+ std::unique_ptr<WorkerThread*[]> threads,
+ int num_threads)
+ : threads_(std::move(threads)), num_threads_(num_threads) {
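+ // Mark the first slot empty so that Shutdown() does not touch uninitialized
+ // entries if StartWorkers() is never called or fails before creating any
+ // worker thread.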
+ threads_[0] = nullptr;
+ assert(name_prefix != nullptr);
+ const size_t name_prefix_len =
+ std::min(strlen(name_prefix), sizeof(name_prefix_) - 1);
+ memcpy(name_prefix_, name_prefix, name_prefix_len);
+ name_prefix_[name_prefix_len] = '\0';
+}
+
+ThreadPool::~ThreadPool() { Shutdown(); }
+
+void ThreadPool::Schedule(std::function<void()> closure) {
+ LockMutex();
+ if (!queue_.GrowIfNeeded()) {
+ // queue_ is full and we can't grow it. Run |closure| directly.
+ UnlockMutex();
+ closure();
+ return;
+ }
+ queue_.Push(std::move(closure));
+ UnlockMutex();
+ SignalOne();
+}
+
+int ThreadPool::num_threads() const { return num_threads_; }
+
+// A simple implementation that mirrors the non-portable Thread. We may
+// choose to expand this in the future as a portable implementation of
+// Thread, or replace it at such a time as one is implemented.
+class ThreadPool::WorkerThread : public Allocable {
+ public:
+ // Creates and starts a thread that runs pool->WorkerFunction().
+ explicit WorkerThread(ThreadPool* pool);
+
+ // Not copyable or movable.
+ WorkerThread(const WorkerThread&) = delete;
+ WorkerThread& operator=(const WorkerThread&) = delete;
+
+ // REQUIRES: Join() must have been called if Start() was called and
+ // succeeded.
+ ~WorkerThread() = default;
+
+ LIBGAV1_MUST_USE_RESULT bool Start();
+
+ // Joins with the running thread.
+ void Join();
+
+ private:
+#if defined(_MSC_VER)
+ static unsigned int __stdcall ThreadBody(void* arg);
+#else
+ static void* ThreadBody(void* arg);
+#endif
+
+ void SetupName();
+ void Run();
+
+ ThreadPool* pool_;
+#if defined(_MSC_VER)
+ HANDLE handle_;
+#else
+ pthread_t thread_;
+#endif
+};
+
+ThreadPool::WorkerThread::WorkerThread(ThreadPool* pool) : pool_(pool) {}
+
+#if defined(_MSC_VER)
+
+bool ThreadPool::WorkerThread::Start() {
+ // Since our code calls the C run-time library (CRT), use _beginthreadex
+ // rather than CreateThread. Microsoft documentation says "If a thread
+ // created using CreateThread calls the CRT, the CRT may terminate the
+ // process in low-memory conditions."
+ uintptr_t handle = _beginthreadex(
+ /*security=*/nullptr, /*stack_size=*/0, ThreadBody, this,
+ /*initflag=*/CREATE_SUSPENDED, /*thrdaddr=*/nullptr);
+ if (handle == 0) return false;
+ handle_ = reinterpret_cast<HANDLE>(handle);
+ ResumeThread(handle_);
+ return true;
+}
+
+void ThreadPool::WorkerThread::Join() {
+ WaitForSingleObject(handle_, INFINITE);
+ CloseHandle(handle_);
+}
+
+unsigned int ThreadPool::WorkerThread::ThreadBody(void* arg) {
+ auto* thread = static_cast<WorkerThread*>(arg);
+ thread->Run();
+ return 0;
+}
+
+void ThreadPool::WorkerThread::SetupName() {
+ // Not currently supported on Windows.
+}
+
+#else // defined(_MSC_VER)
+
+bool ThreadPool::WorkerThread::Start() {
+ return pthread_create(&thread_, nullptr, ThreadBody, this) == 0;
+}
+
+void ThreadPool::WorkerThread::Join() { pthread_join(thread_, nullptr); }
+
+void* ThreadPool::WorkerThread::ThreadBody(void* arg) {
+ auto* thread = static_cast<WorkerThread*>(arg);
+ thread->Run();
+ return nullptr;
+}
+
+void ThreadPool::WorkerThread::SetupName() {
+ if (pool_->name_prefix_[0] != '\0') {
+#if defined(__APPLE__)
+ // Apple's version of pthread_setname_np takes one argument and operates on
+ // the current thread only. Also, pthread_mach_thread_np is Apple-specific.
+ // The maximum size of the |name| buffer was noted in the Chromium source
+ // code and was confirmed by experiments.
+ char name[64];
+ mach_port_t id = pthread_mach_thread_np(pthread_self());
+ int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
+ static_cast<int64_t>(id));
+ assert(rv >= 0);
+ rv = pthread_setname_np(name);
+ assert(rv == 0);
+ static_cast<void>(rv);
+#elif defined(__ANDROID__) || (defined(__GLIBC__) && !defined(__GNU__))
+ // If the |name| buffer is longer than 16 bytes, pthread_setname_np fails
+ // with error 34 (ERANGE) on Android.
+ char name[16];
+ pid_t id = GetTid();
+ int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
+ static_cast<int64_t>(id));
+ assert(rv >= 0);
+ rv = pthread_setname_np(pthread_self(), name);
+ assert(rv == 0);
+ static_cast<void>(rv);
+#endif
+ }
+}
+
+#endif // defined(_MSC_VER)
+
+void ThreadPool::WorkerThread::Run() {
+ SetupName();
+ pool_->WorkerFunction();
+}
+
+bool ThreadPool::StartWorkers() {
+ if (!queue_.Init()) return false;
+ for (int i = 0; i < num_threads_; ++i) {
+ threads_[i] = new (std::nothrow) WorkerThread(this);
+ if (threads_[i] == nullptr) return false;
+ if (!threads_[i]->Start()) {
+ delete threads_[i];
+ threads_[i] = nullptr;
+ return false;
+ }
+ }
+ return true;
+}
+
+void ThreadPool::WorkerFunction() {
+ LockMutex();
+ while (true) {
+ if (queue_.Empty()) {
+ if (exit_threads_) {
+ break; // Queue is empty and exit was requested.
+ }
+#if defined(__ANDROID__)
+ // On android, if we go to a conditional wait right away, the CPU governor
+ // kicks in and starts shutting the cores down. So we do a very small busy
+ // wait to see if we get our next job within that period. This
+ // significantly improves the performance of common cases of tile parallel
+ // decoding. If we don't receive a job in the busy wait time, we then go
+ // to an actual conditional wait as usual.
+ UnlockMutex();
+ bool found_job = false;
+ const auto wait_start = Clock::now();
+ while (Clock::now() - wait_start < kBusyWaitDuration) {
+ LockMutex();
+ if (!queue_.Empty()) {
+ found_job = true;
+ break;
+ }
+ UnlockMutex();
+ }
+ // If |found_job| is true, we simply continue since we already hold the
+ // mutex and we know for sure that the |queue_| is not empty.
+ if (found_job) continue;
+      // Since |found_job| was false, the mutex is not being held at this
+      // point.
+ LockMutex();
+ // Ensure that the queue is still empty.
+ if (!queue_.Empty()) continue;
+ if (exit_threads_) {
+ break; // Queue is empty and exit was requested.
+ }
+#endif // defined(__ANDROID__)
+ // Queue is still empty, wait for signal or broadcast.
+ Wait();
+ } else {
+ // Take a job from the queue.
+ std::function<void()> job = std::move(queue_.Front());
+ queue_.Pop();
+
+ UnlockMutex();
+ // Note that it is good practice to surround this with a try/catch so
+ // the thread pool doesn't go to hell if the job throws an exception.
+ // This is omitted here because Google3 doesn't like exceptions.
+ std::move(job)();
+ job = nullptr;
+
+ LockMutex();
+ }
+ }
+ UnlockMutex();
+}
+
+void ThreadPool::Shutdown() {
+ // Tell worker threads how to exit.
+ LockMutex();
+ exit_threads_ = true;
+ UnlockMutex();
+ SignalAll();
+
+ // Join all workers. This will block.
+ for (int i = 0; i < num_threads_; ++i) {
+ if (threads_[i] == nullptr) break;
+ threads_[i]->Join();
+ delete threads_[i];
+ }
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_THREADPOOL_H_
+#define LIBGAV1_SRC_UTILS_THREADPOOL_H_
+
+#include <functional>
+#include <memory>
+
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
+#if !defined(LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+#if defined(__ANDROID__) || (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE)
+#define LIBGAV1_THREADPOOL_USE_STD_MUTEX 1
+#else
+#define LIBGAV1_THREADPOOL_USE_STD_MUTEX 0
+#endif
+#endif
+
+#if LIBGAV1_THREADPOOL_USE_STD_MUTEX
+#include <condition_variable> // NOLINT (unapproved c++11 header)
+#include <mutex> // NOLINT (unapproved c++11 header)
+#else
+// absl::Mutex & absl::CondVar are significantly faster than the pthread
+// variants on platforms other than Android. iOS may deadlock on Shutdown()
+// using absl, see b/142251739.
+#include "absl/base/thread_annotations.h"
+#include "absl/synchronization/mutex.h"
+#endif
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/executor.h"
+#include "src/utils/memory.h"
+#include "src/utils/unbounded_queue.h"
+
+namespace libgav1 {
+
+// An implementation of ThreadPool using POSIX threads (pthreads) or Windows
+// threads.
+//
+// - The pool allocates a fixed number of worker threads on instantiation.
+// - The worker threads will pick up work jobs as they arrive.
+// - If all workers are busy, work jobs are queued for later execution.
+//
+// The thread pool is shut down when the pool is destroyed.
+//
+// Example usage of the thread pool:
+// {
+// std::unique_ptr<ThreadPool> pool = ThreadPool::Create(4);
+// for (int i = 0; i < 100; ++i) { // Dispatch 100 jobs.
+// pool->Schedule([&my_data]() { MyFunction(&my_data); });
+// }
+// } // ThreadPool gets destroyed only when all jobs are done.
+class ThreadPool : public Executor, public Allocable {
+ public:
+ // Creates the thread pool with the specified number of worker threads.
+ // If num_threads is 1, the closures are run in FIFO order.
+ static std::unique_ptr<ThreadPool> Create(int num_threads);
+
+ // Like the above factory method, but also sets the name prefix for threads.
+ static std::unique_ptr<ThreadPool> Create(const char name_prefix[],
+ int num_threads);
+
+ // The destructor shuts down the thread pool; all queued jobs are executed
+ // before it returns. Note that after shutdown, the thread pool does not
+ // accept further jobs.
+ ~ThreadPool() override;
+
+ // Adds the specified "closure" to the queue for processing. If worker threads
+ // are available, "closure" will run immediately. Otherwise "closure" is
+ // queued for later execution.
+ //
+ // NOTE: If the internal queue is full and cannot be resized because of an
+ // out-of-memory error, the current thread runs "closure" before returning
+ // from Schedule(). For our use cases, this seems better than the
+ // alternatives:
+ // 1. Return a failure status.
+ // 2. Have the current thread wait until the queue is not full.
+ void Schedule(std::function<void()> closure) override;
+
+ int num_threads() const;
+
+ private:
+ class WorkerThread;
+
+ // Creates the thread pool with the specified number of worker threads.
+ // If num_threads is 1, the closures are run in FIFO order.
+ ThreadPool(const char name_prefix[], std::unique_ptr<WorkerThread*[]> threads,
+ int num_threads);
+
+ // Starts the worker pool.
+ LIBGAV1_MUST_USE_RESULT bool StartWorkers();
+
+ void WorkerFunction();
+
+ // Shuts down the thread pool, i.e. worker threads finish their work and
+ // pick up new jobs until the queue is empty. This call will block until
+ // the shutdown is complete.
+ //
+ // Note: If a worker encounters an empty queue after this call, it will exit.
+ // Other workers might still be running, and if the queue fills up again, the
+ // thread pool will continue to operate with a decreased number of workers.
+ // It is up to the caller to prevent adding new jobs.
+ void Shutdown();
+
+#if LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+ void LockMutex() { queue_mutex_.lock(); }
+ void UnlockMutex() { queue_mutex_.unlock(); }
+
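+ // Wait() is entered with queue_mutex_ already held (via LockMutex()). The
+ // unique_lock adopts that ownership for the duration of the wait, and
+ // release() hands the re-locked mutex back so the caller's later
+ // UnlockMutex() stays balanced.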
+ void Wait() {
+ std::unique_lock<std::mutex> queue_lock(queue_mutex_, std::adopt_lock);
+ condition_.wait(queue_lock);
+ queue_lock.release();
+ }
+
+ void SignalOne() { condition_.notify_one(); }
+ void SignalAll() { condition_.notify_all(); }
+
+ std::condition_variable condition_;
+ std::mutex queue_mutex_;
+
+#else // !LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+ void LockMutex() ABSL_EXCLUSIVE_LOCK_FUNCTION() { queue_mutex_.Lock(); }
+ void UnlockMutex() ABSL_UNLOCK_FUNCTION() { queue_mutex_.Unlock(); }
+ void Wait() { condition_.Wait(&queue_mutex_); }
+ void SignalOne() { condition_.Signal(); }
+ void SignalAll() { condition_.SignalAll(); }
+
+ absl::CondVar condition_;
+ absl::Mutex queue_mutex_;
+
+#endif // LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+ UnboundedQueue<std::function<void()>> queue_ LIBGAV1_GUARDED_BY(queue_mutex_);
+ // If not all the worker threads are created, the first entry after the
+ // created worker threads is a null pointer.
+ const std::unique_ptr<WorkerThread*[]> threads_;
+
+ bool exit_threads_ LIBGAV1_GUARDED_BY(queue_mutex_) = false;
+ const int num_threads_ = 0;
+ // name_prefix_ is a C string, whose length is restricted to 16 characters,
+ // including the terminating null byte ('\0'). This restriction comes from
+ // the Linux pthread_setname_np() function.
+ char name_prefix_[16];
+};
+
+} // namespace libgav1
+
+#undef LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+#endif // LIBGAV1_SRC_UTILS_THREADPOOL_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/threadpool.h"
+
+#include <cassert>
+#include <cstdint>
+#include <memory>
+
+#include "absl/synchronization/mutex.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/executor.h"
+
+namespace libgav1 {
+namespace {
+
+class SimpleGuardedInteger {
+ public:
+ explicit SimpleGuardedInteger(int initial_value) : value_(initial_value) {}
+ SimpleGuardedInteger(const SimpleGuardedInteger&) = delete;
+ SimpleGuardedInteger& operator=(const SimpleGuardedInteger&) = delete;
+
+ void Decrement() {
+ absl::MutexLock l(&mutex_);
+ assert(value_ >= 1);
+ --value_;
+ changed_.SignalAll();
+ }
+
+ void Increment() {
+ absl::MutexLock l(&mutex_);
+ ++value_;
+ changed_.SignalAll();
+ }
+
+ int Value() {
+ absl::MutexLock l(&mutex_);
+ return value_;
+ }
+
+ void WaitForZero() {
+ absl::MutexLock l(&mutex_);
+ while (value_ != 0) {
+ changed_.Wait(&mutex_);
+ }
+ }
+
+ private:
+ absl::Mutex mutex_;
+ absl::CondVar changed_;
+ int value_ LIBGAV1_GUARDED_BY(mutex_);
+};
+
+// Loops for |milliseconds| of wall-clock time.
+void LoopForMs(int64_t milliseconds) {
+ const absl::Time deadline = absl::Now() + absl::Milliseconds(milliseconds);
+ while (absl::Now() < deadline) {
+ }
+}
+
+// A function that increments the given integer.
+void IncrementIntegerJob(SimpleGuardedInteger* value) {
+ LoopForMs(100);
+ value->Increment();
+}
+
+TEST(ThreadPoolTest, ThreadedIntegerIncrement) {
+ std::unique_ptr<ThreadPool> thread_pool = ThreadPool::Create(100);
+ ASSERT_NE(thread_pool, nullptr);
+ EXPECT_EQ(thread_pool->num_threads(), 100);
+ SimpleGuardedInteger count(0);
+ for (int i = 0; i < 1000; ++i) {
+ thread_pool->Schedule([&count]() { IncrementIntegerJob(&count); });
+ }
+ thread_pool.reset(nullptr);
+ EXPECT_EQ(count.Value(), 1000);
+}
+
+// Test a ThreadPool via the Executor interface.
+TEST(ThreadPoolTest, ExecutorInterface) {
+ std::unique_ptr<ThreadPool> thread_pool = ThreadPool::Create(100);
+ ASSERT_NE(thread_pool, nullptr);
+ std::unique_ptr<Executor> executor(thread_pool.release());
+ SimpleGuardedInteger count(0);
+ for (int i = 0; i < 1000; ++i) {
+ executor->Schedule([&count]() { IncrementIntegerJob(&count); });
+ }
+ executor.reset(nullptr);
+ EXPECT_EQ(count.Value(), 1000);
+}
+
+TEST(ThreadPoolTest, DestroyWithoutUse) {
+ std::unique_ptr<ThreadPool> thread_pool = ThreadPool::Create(100);
+ EXPECT_NE(thread_pool, nullptr);
+ thread_pool.reset(nullptr);
+}
+
+// If num_threads is 0, ThreadPool::Create() should return a null pointer.
+TEST(ThreadPoolTest, NumThreadsZero) {
+ std::unique_ptr<ThreadPool> thread_pool = ThreadPool::Create(0);
+ EXPECT_EQ(thread_pool, nullptr);
+}
+
+// If num_threads is 1, the closures are run in FIFO order.
+TEST(ThreadPoolTest, OneThreadRunsClosuresFIFO) {
+ int count = 0; // Declare first so that it outlives the thread pool.
+ std::unique_ptr<ThreadPool> pool = ThreadPool::Create(1);
+ ASSERT_NE(pool, nullptr);
+ EXPECT_EQ(pool->num_threads(), 1);
+ for (int i = 0; i < 1000; ++i) {
+ pool->Schedule([&count, i]() {
+ EXPECT_EQ(count, i);
+ count++;
+ });
+ }
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_TYPES_H_
+#define LIBGAV1_SRC_UTILS_TYPES_H_
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+union MotionVector {
+ // Motion vectors will always fit in int16_t and using int16_t here instead
+ // of int saves significant memory since some of the frame sized structures
+ // store motion vectors.
+ // Index 0 is the entry for row (horizontal direction) motion vector.
+ // Index 1 is the entry for column (vertical direction) motion vector.
+ int16_t mv[2];
+ // A uint32_t view into the |mv| array. Useful for cases where both the
+ // motion vectors have to be copied or compared with a single 32 bit
+ // instruction.
+ uint32_t mv32;
+};
+
+union CompoundMotionVector {
+ MotionVector mv[2];
+ // A uint64_t view into the |mv| array. Useful for cases where all the motion
+ // vectors have to be copied or compared with a single 64 bit instruction.
+ uint64_t mv64;
+};
+
+// Stores the motion information used for motion field estimation.
+struct TemporalMotionField : public Allocable {
+ Array2D<MotionVector> mv;
+ Array2D<int8_t> reference_offset;
+};
+
+// MvContexts contains the contexts used to decode portions of an inter block
+// mode info to set the y_mode field in BlockParameters.
+//
+// The contexts in the struct correspond to the ZeroMvContext, RefMvContext,
+// and NewMvContext variables in the spec.
+struct MvContexts {
+ int zero_mv;
+ int reference_mv;
+ int new_mv;
+};
+
+struct PaletteModeInfo {
+ uint8_t size[kNumPlaneTypes];
+ uint16_t color[kMaxPlanes][kMaxPaletteSize];
+};
+
+// Stores the parameters used by the prediction process. The members of the
+// struct are filled in when parsing the bitstream and used when the prediction
+// is computed. The information in this struct is associated with a single
+// block.
+// While both BlockParameters and PredictionParameters store information
+// pertaining to a Block, the only difference is that BlockParameters outlives
+// the block itself (for example, some of the variables in BlockParameters are
+// used to compute the context for reading elements in the subsequent blocks).
+struct PredictionParameters : public Allocable {
+ // Restores the index into the unsorted mv stack from the least significant
+ // 3 bits of the sorted |weight_index_stack| entry.
+ const MotionVector& reference_mv(int stack_index) const {
+ return ref_mv_stack[7 - (weight_index_stack[stack_index] & 7)];
+ }
+ const MotionVector& reference_mv(int stack_index, int mv_index) const {
+ return compound_ref_mv_stack[7 - (weight_index_stack[stack_index] & 7)]
+ .mv[mv_index];
+ }
+
+ void IncreaseWeight(ptrdiff_t index, int weight) {
+ weight_index_stack[index] += weight << 3;
+ }
+
+ void SetWeightIndexStackEntry(int index, int weight) {
+ weight_index_stack[index] = (weight << 3) + 7 - index;
+ }
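+ // Example: after SetWeightIndexStackEntry(0, w), weight_index_stack[0] is
+ // (w << 3) + 7, so (weight_index_stack[0] & 7) == 7 and reference_mv(0)
+ // returns ref_mv_stack[7 - 7], i.e. the entry originally at index 0.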
+
+ bool use_filter_intra;
+ FilterIntraPredictor filter_intra_mode;
+ int angle_delta[kNumPlaneTypes];
+ int8_t cfl_alpha_u;
+ int8_t cfl_alpha_v;
+ int max_luma_width;
+ int max_luma_height;
+ Array2D<uint8_t> color_index_map[kNumPlaneTypes];
+ bool use_intra_block_copy;
+ InterIntraMode inter_intra_mode;
+ bool is_wedge_inter_intra;
+ int wedge_index;
+ int wedge_sign;
+ bool mask_is_inverse;
+ MotionMode motion_mode;
+ CompoundPredictionType compound_prediction_type;
+ union {
+ // |ref_mv_stack| and |compound_ref_mv_stack| are not sorted after
+ // construction. reference_mv() must be called to get the correct element.
+ MotionVector ref_mv_stack[kMaxRefMvStackSize];
+ CompoundMotionVector compound_ref_mv_stack[kMaxRefMvStackSize];
+ };
+ // The least significant 3 bits of |weight_index_stack| store the index
+ // information, and the other bits store the weight. The stored index is
+ // actually 7 - index, so that a descending sort is stable (it preserves the
+ // original order for elements with the same weight). Sorting an int16_t
+ // array is much faster than sorting a struct array with weight and index
+ // stored separately.
+ int16_t weight_index_stack[kMaxRefMvStackSize];
+ // In the spec, the weights of all the nearest mvs are incremented by a bonus
+ // weight which is larger than any natural weight, and later the weights of
+ // the mvs are compared with this bonus weight to determine their contexts. We
+ // replace this procedure by introducing |nearest_mv_count|, which records the
+ // count of the nearest mvs. Since all the nearest mvs are in the beginning of
+ // the mv stack, the index of a mv in the mv stack can be compared with
+ // |nearest_mv_count| to get that mv's context.
+ int nearest_mv_count;
+ int ref_mv_count;
+ int ref_mv_index;
+ MotionVector global_mv[2];
+ int num_warp_samples;
+ int warp_estimate_candidates[kMaxLeastSquaresSamples][4];
+ PaletteModeInfo palette_mode_info;
+ int8_t segment_id; // segment_id is in the range [0, 7].
+ PredictionMode uv_mode;
+ bool chroma_top_uses_smooth_prediction;
+ bool chroma_left_uses_smooth_prediction;
+};
+
+// A lot of BlockParameters objects are created, so the smallest type is used
+// for each field. The ranges of some fields are documented to justify why
+// their types are large enough.
+struct BlockParameters : public Allocable {
+ BlockSize size;
+ bool skip;
+ bool is_inter;
+ PredictionMode y_mode;
+ TransformSize uv_transform_size;
+ InterpolationFilter interpolation_filter[2];
+ ReferenceFrameType reference_frame[2];
+ // The index of this array is as follows:
+ // 0 - Y plane vertical filtering.
+ // 1 - Y plane horizontal filtering.
+ // 2 - U plane (both directions).
+ // 3 - V plane (both directions).
+ uint8_t deblock_filter_level[kFrameLfCount];
+ CompoundMotionVector mv;
+ // When |Tile::split_parse_and_decode_| is true, each block gets its own
+ // instance of |prediction_parameters|. When it is false, all the blocks point
+ // to |Tile::prediction_parameters_|. This field is valid only as long as the
+ // block is *being* decoded. The lifetime and usage of this field can be
+ // better understood by following its flow in tile.cc.
+ std::unique_ptr<PredictionParameters> prediction_parameters;
+};
+
+// Used to store the left and top block parameters that are used for computing
+// the cdf context of the subsequent blocks.
+struct BlockCdfContext {
+ bool use_predicted_segment_id[32];
+ bool is_explicit_compound_type[32]; // comp_group_idx in the spec.
+ bool is_compound_type_average[32]; // compound_idx in the spec.
+ bool skip_mode[32];
+ uint8_t palette_size[kNumPlaneTypes][32];
+ uint16_t palette_color[32][kNumPlaneTypes][kMaxPaletteSize];
+ PredictionMode uv_mode[32];
+};
+
+// A five dimensional array used to store the wedge masks. The dimensions are:
+// - block_size_index (returned by GetWedgeBlockSizeIndex() in prediction.cc).
+// - flip_sign (0 or 1).
+// - wedge_index (0 to 15).
+// - the entry at each (block_size_index, flip_sign, wedge_index) is a 2d
+// array of block_width by block_height.
+using WedgeMaskArray =
+ std::array<std::array<std::array<Array2D<uint8_t>, 16>, 2>, 9>;
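+// For example, the mask sample at column x, row y of a given configuration is
+// wedge_masks[block_size_index][flip_sign][wedge_index][y][x].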
+
+enum GlobalMotionTransformationType : uint8_t {
+ kGlobalMotionTransformationTypeIdentity,
+ kGlobalMotionTransformationTypeTranslation,
+ kGlobalMotionTransformationTypeRotZoom,
+ kGlobalMotionTransformationTypeAffine,
+ kNumGlobalMotionTransformationTypes
+};
+
+// Global motion and warped motion parameters. See the paper for more info:
+// S. Parker, Y. Chen, D. Barker, P. de Rivaz, D. Mukherjee, "Global and locally
+// adaptive warped motion compensation in video compression", Proc. IEEE
+// International Conference on Image Processing (ICIP), pp. 275-279, Sep. 2017.
+struct GlobalMotion {
+ GlobalMotionTransformationType type;
+ int32_t params[6];
+
+ // Represent two shearing operations. Computed from |params| by SetupShear().
+ //
+ // The least significant six (= kWarpParamRoundingBits) bits are all zeros.
+ // (This means alpha, beta, gamma, and delta could be represented by a 10-bit
+ // signed integer.) The minimum value is INT16_MIN (= -32768) and the maximum
+ // value is 32704 = 0x7fc0, the largest int16_t value whose least significant
+ // six bits are all zeros.
+ //
+ // Valid warp parameters (as validated by SetupShear()) have smaller ranges.
+ // Their absolute values are less than 2^14 (= 16384). (This follows from
+ // the warpValid check at the end of Section 7.11.3.6.)
+ //
+ // NOTE: Section 7.11.3.6 of the spec allows a maximum value of 32768, which
+ // is outside the range of int16_t. When cast to int16_t, 32768 becomes
+ // -32768. This potential int16_t overflow does not matter because either
+// 32768 or -32768 causes SetupShear() to return false.
+ int16_t alpha;
+ int16_t beta;
+ int16_t gamma;
+ int16_t delta;
+};
+
+// Loop filter parameters:
+//
+// If level[0] and level[1] are both equal to 0, the loop filter process is
+// not invoked.
+//
+// |sharpness| and |delta_enabled| are only used by the loop filter process.
+//
+// The |ref_deltas| and |mode_deltas| arrays are used not only by the loop
+// filter process but also by the reference frame update and loading
+// processes. The loop filter process uses |ref_deltas| and |mode_deltas| only
+// when |delta_enabled| is true.
+struct LoopFilter {
+ // Contains loop filter strength values in the range of [0, 63].
+ std::array<int8_t, kFrameLfCount> level;
+ // Indicates the sharpness level in the range of [0, 7].
+ int8_t sharpness;
+ // Whether the filter level depends on the mode and reference frame used to
+ // predict a block.
+ bool delta_enabled;
+ // Whether additional syntax elements were read that specify which mode and
+ // reference frame deltas are to be updated. loop_filter_delta_update field in
+ // Section 5.9.11 of the spec.
+ bool delta_update;
+ // Contains the adjustment needed for the filter level based on the chosen
+ // reference frame, in the range of [-64, 63].
+ std::array<int8_t, kNumReferenceFrameTypes> ref_deltas;
+ // Contains the adjustment needed for the filter level based on the chosen
+ // mode, in the range of [-64, 63].
+ std::array<int8_t, kLoopFilterMaxModeDeltas> mode_deltas;
+};
+
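+// Parameters of the delta quantizer (delta_q) and delta loop filter
+// (delta_lf) features. |scale| corresponds to delta_q_res/delta_lf_res in
+// the spec, and |multi| to delta_lf_multi (only meaningful for delta_lf).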
+struct Delta {
+ bool present;
+ uint8_t scale;
+ bool multi;
+};
+
+struct Cdef {
+ uint8_t damping; // damping value from the spec + (bitdepth - 8).
+ uint8_t bits;
+ // All the strength values are the values from the spec and left shifted by
+ // (bitdepth - 8).
+ uint8_t y_primary_strength[kMaxCdefStrengths];
+ uint8_t y_secondary_strength[kMaxCdefStrengths];
+ uint8_t uv_primary_strength[kMaxCdefStrengths];
+ uint8_t uv_secondary_strength[kMaxCdefStrengths];
+};
+
+struct TileInfo {
+ bool uniform_spacing;
+ int sb_rows;
+ int sb_columns;
+ int tile_count;
+ int tile_columns_log2;
+ int tile_columns;
+ int tile_column_start[kMaxTileColumns + 1];
+ // This field is not used by libgav1, but is populated for use by some
+ // hardware decoders. So it must not be removed.
+ int tile_column_width_in_superblocks[kMaxTileColumns + 1];
+ int tile_rows_log2;
+ int tile_rows;
+ int tile_row_start[kMaxTileRows + 1];
+ // This field is not used by libgav1, but is populated for use by some
+ // hardware decoders. So it must not be removed.
+ int tile_row_height_in_superblocks[kMaxTileRows + 1];
+ int16_t context_update_id;
+ uint8_t tile_size_bytes;
+};
+
+struct LoopRestoration {
+ LoopRestorationType type[kMaxPlanes];
+ int unit_size_log2[kMaxPlanes];
+};
+
+// Stores the quantization parameters of Section 5.9.12.
+struct QuantizerParameters {
+ // base_index is in the range [0, 255].
+ uint8_t base_index;
+ int8_t delta_dc[kMaxPlanes];
+ // delta_ac[kPlaneY] is always 0.
+ int8_t delta_ac[kMaxPlanes];
+ bool use_matrix;
+ // The |matrix_level| array is used only when |use_matrix| is true.
+ // matrix_level[plane] specifies the level in the quantizer matrix that
+ // should be used for decoding |plane|. The quantizer matrix has 15 levels,
+ // from 0 to 14. The range of matrix_level[plane] is [0, 15]. If
+ // matrix_level[plane] is 15, the quantizer matrix is not used.
+ int8_t matrix_level[kMaxPlanes];
+};
+
+// The corresponding segment feature constants in the AV1 spec are named
+// SEG_LVL_xxx.
+enum SegmentFeature : uint8_t {
+ kSegmentFeatureQuantizer,
+ kSegmentFeatureLoopFilterYVertical,
+ kSegmentFeatureLoopFilterYHorizontal,
+ kSegmentFeatureLoopFilterU,
+ kSegmentFeatureLoopFilterV,
+ kSegmentFeatureReferenceFrame,
+ kSegmentFeatureSkip,
+ kSegmentFeatureGlobalMv,
+ kSegmentFeatureMax
+};
+
+struct Segmentation {
+ // 5.11.14.
+ // Returns true if the feature is enabled in the segment.
+ bool FeatureActive(int segment_id, SegmentFeature feature) const {
+ return enabled && segment_id < kMaxSegments &&
+ feature_enabled[segment_id][feature];
+ }
+
+ // Returns true if the feature is signed.
+ static bool FeatureSigned(SegmentFeature feature) {
+ // Only the first five segment features are signed, so this comparison
+ // suffices.
+ return feature <= kSegmentFeatureLoopFilterV;
+ }
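+ // For example, the quantizer and loop filter features carry signed deltas,
+ // while kSegmentFeatureReferenceFrame carries an unsigned reference frame
+ // index.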
+
+ bool enabled;
+ bool update_map;
+ bool update_data;
+ bool temporal_update;
+ // True if the segment id will be read before the skip syntax element. False
+ // if the skip syntax element will be read first.
+ bool segment_id_pre_skip;
+ // The highest numbered segment id that has some enabled feature. Used as
+ // the upper bound for decoding segment ids.
+ int8_t last_active_segment_id;
+
+ bool feature_enabled[kMaxSegments][kSegmentFeatureMax];
+ int16_t feature_data[kMaxSegments][kSegmentFeatureMax];
+ bool lossless[kMaxSegments];
+ // Cached values of get_qindex(1, segmentId), to be consumed by
+ // Tile::ReadTransformType(). The values are in the range [0, 255].
+ uint8_t qindex[kMaxSegments];
+};
+
+// Section 6.8.20.
+// Note: In spec, film grain section uses YCbCr to denote variable names,
+// such as num_cb_points, num_cr_points. To keep it consistent with other
+// parts of code, we use YUV, i.e., num_u_points, num_v_points, etc.
+struct FilmGrainParams {
+ bool apply_grain;
+ bool update_grain;
+ bool chroma_scaling_from_luma;
+ bool overlap_flag;
+ bool clip_to_restricted_range;
+
+ uint8_t num_y_points; // [0, 14].
+ uint8_t num_u_points; // [0, 10].
+ uint8_t num_v_points; // [0, 10].
+ // Must be in [0, 255] (10/12 bit values are divided by 4 or 16 to fit this
+ // range) and must be in increasing order.
+ uint8_t point_y_value[14];
+ uint8_t point_y_scaling[14];
+ uint8_t point_u_value[10];
+ uint8_t point_u_scaling[10];
+ uint8_t point_v_value[10];
+ uint8_t point_v_scaling[10];
+
+ uint8_t chroma_scaling; // grain_scaling_minus_8 + 8: [8, 11].
+ uint8_t auto_regression_coeff_lag; // ar_coeff_lag: [0, 3].
+ // ar_coeffs_{y,u,v}_plus_128 - 128: [-128, 127].
+ int8_t auto_regression_coeff_y[24];
+ int8_t auto_regression_coeff_u[25];
+ int8_t auto_regression_coeff_v[25];
+ // Shift value: ar_coeff_shift_minus_6 + 6, auto regression coeffs range:
+ // 6: [-2, 2)
+ // 7: [-1, 1)
+ // 8: [-0.5, 0.5)
+ // 9: [-0.25, 0.25)
+ uint8_t auto_regression_shift;
+
+ uint16_t grain_seed;
+ int reference_index;
+ int grain_scale_shift;
+ int8_t u_multiplier; // cb_mult - 128: [-128, 127].
+ int8_t u_luma_multiplier; // cb_luma_mult - 128: [-128, 127].
+ int16_t u_offset; // cb_offset - 256: [-256, 255].
+ int8_t v_multiplier; // cr_mult - 128: [-128, 127].
+ int8_t v_luma_multiplier; // cr_luma_mult - 128: [-128, 127].
+ int16_t v_offset; // cr_offset - 256: [-256, 255].
+};
+
+struct ObuFrameHeader {
+ uint16_t display_frame_id;
+ uint16_t current_frame_id;
+ int64_t frame_offset;
+ uint16_t expected_frame_id[kNumInterReferenceFrameTypes];
+ int32_t width;
+ int32_t height;
+ int32_t columns4x4;
+ int32_t rows4x4;
+ // The render size (render_width and render_height) is a hint to the
+ // application about the desired display size. It has no effect on the
+ // decoding process.
+ int32_t render_width;
+ int32_t render_height;
+ int32_t upscaled_width;
+ LoopRestoration loop_restoration;
+ uint32_t buffer_removal_time[kMaxOperatingPoints];
+ uint32_t frame_presentation_time;
+ // Note: global_motion[0] (for kReferenceFrameIntra) is not used.
+ std::array<GlobalMotion, kNumReferenceFrameTypes> global_motion;
+ TileInfo tile_info;
+ QuantizerParameters quantizer;
+ Segmentation segmentation;
+ bool show_existing_frame;
+ // frame_to_show is in the range [0, 7]. Only used if show_existing_frame is
+ // true.
+ int8_t frame_to_show;
+ FrameType frame_type;
+ bool show_frame;
+ bool showable_frame;
+ bool error_resilient_mode;
+ bool enable_cdf_update;
+ bool frame_size_override_flag;
+ // The order_hint syntax element in the uncompressed header. If
+ // show_existing_frame is false, the OrderHint variable in the spec is equal
+ // to this field, and so this field can be used in place of OrderHint when
+ // show_existing_frame is known to be false, such as during tile decoding.
+ uint8_t order_hint;
+ int8_t primary_reference_frame;
+ bool render_and_frame_size_different;
+ bool use_superres;
+ uint8_t superres_scale_denominator;
+ bool allow_screen_content_tools;
+ bool allow_intrabc;
+ bool frame_refs_short_signaling;
+ // A bitmask that specifies which reference frame slots will be updated with
+ // the current frame after it is decoded.
+ uint8_t refresh_frame_flags;
+ static_assert(sizeof(ObuFrameHeader::refresh_frame_flags) * 8 ==
+ kNumReferenceFrameTypes,
+ "");
+ bool found_reference;
+ int8_t force_integer_mv;
+ bool allow_high_precision_mv;
+ InterpolationFilter interpolation_filter;
+ bool is_motion_mode_switchable;
+ bool use_ref_frame_mvs;
+ bool enable_frame_end_update_cdf;
+ // True if all segments are losslessly encoded at the coded resolution.
+ bool coded_lossless;
+ // True if all segments are losslessly encoded at the upscaled resolution.
+ bool upscaled_lossless;
+ TxMode tx_mode;
+ // True means that the mode info for inter blocks contains the syntax
+ // element comp_mode that indicates whether to use single or compound
+ // prediction. False means that all inter blocks will use single prediction.
+ bool reference_mode_select;
+ // The frames to use for compound prediction when skip_mode is true.
+ ReferenceFrameType skip_mode_frame[2];
+ bool skip_mode_present;
+ bool reduced_tx_set;
+ bool allow_warped_motion;
+ Delta delta_q;
+ Delta delta_lf;
+ // A valid value of reference_frame_index[i] is in the range [0, 7]. -1
+ // indicates an invalid value.
+ //
+ // NOTE: When the frame is an intra frame (frame_type is kFrameKey or
+ // kFrameIntraOnly), reference_frame_index is not used and may be
+ // uninitialized.
+ int8_t reference_frame_index[kNumInterReferenceFrameTypes];
+ // The ref_order_hint[ i ] syntax element in the uncompressed header.
+ // Specifies the expected output order hint for each reference frame.
+ uint8_t reference_order_hint[kNumReferenceFrameTypes];
+ LoopFilter loop_filter;
+ Cdef cdef;
+ FilmGrainParams film_grain_params;
+};
+
+// Structure used for traversing the partition tree.
+struct PartitionTreeNode {
+ PartitionTreeNode() = default;
+ PartitionTreeNode(int row4x4, int column4x4, BlockSize block_size)
+ : row4x4(row4x4), column4x4(column4x4), block_size(block_size) {}
+ int row4x4 = -1;
+ int column4x4 = -1;
+ BlockSize block_size = kBlockInvalid;
+};
+
+// Structure used for storing the transform parameters in a superblock.
+struct TransformParameters {
+ TransformParameters() = default;
+ TransformParameters(TransformType type, int non_zero_coeff_count)
+ : type(type), non_zero_coeff_count(non_zero_coeff_count) {}
+ TransformType type;
+ int non_zero_coeff_count;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_TYPES_H_
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
+#define LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+// A FIFO queue of an unbounded capacity.
+//
+// This implementation uses the general approach used in std::deque
+// implementations. See, for example,
+// https://stackoverflow.com/questions/6292332/what-really-is-a-deque-in-stl
+//
+// It is much simpler because it just needs to support the queue interface.
+// The blocks are chained into a circular list, not managed by a "map". It
+// does not shrink the internal buffer.
+//
+// An alternative implementation approach is a resizable circular array. See,
+// for example, ResizingArrayQueue.java in https://algs4.cs.princeton.edu/code/
+// and base::circular_deque in Chromium's base/containers library.
+template <typename T>
+class UnboundedQueue {
+ public:
+ UnboundedQueue() = default;
+
+ // Move only.
+ UnboundedQueue(UnboundedQueue&& other)
+ : first_block_(other.first_block_),
+ front_(other.front_),
+ last_block_(other.last_block_),
+ back_(other.back_) {
+ other.first_block_ = nullptr;
+ other.front_ = 0;
+ other.last_block_ = nullptr;
+ other.back_ = 0;
+ }
+ UnboundedQueue& operator=(UnboundedQueue&& other) {
+ if (this != &other) {
+ Destroy();
+ first_block_ = other.first_block_;
+ front_ = other.front_;
+ last_block_ = other.last_block_;
+ back_ = other.back_;
+ other.first_block_ = nullptr;
+ other.front_ = 0;
+ other.last_block_ = nullptr;
+ other.back_ = 0;
+ }
+ return *this;
+ }
+
+ ~UnboundedQueue() { Destroy(); }
+
+ // Allocates two Blocks upfront because most access patterns require at
+ // least two Blocks. Returns false if the allocation of the Blocks failed.
+ LIBGAV1_MUST_USE_RESULT bool Init() {
+ std::unique_ptr<Block> new_block0(new (std::nothrow) Block);
+ std::unique_ptr<Block> new_block1(new (std::nothrow) Block);
+ if (new_block0 == nullptr || new_block1 == nullptr) return false;
+ first_block_ = last_block_ = new_block0.release();
+ new_block1->next = first_block_;
+ last_block_->next = new_block1.release();
+ return true;
+ }
+
+ // Checks if the queue has room for a new element. If the queue is full,
+ // tries to grow it. Returns false if the queue is full and the attempt to
+ // grow it failed.
+ //
+ // NOTE: GrowIfNeeded() must be called before each call to Push(). This
+ // inconvenient design is necessary to guarantee a successful Push() call.
+ //
+ // Push(T&& value) is often called with the argument std::move(value). The
+ // moved-from object |value| won't be usable afterwards, so it would be
+ // problematic if Push(T&& value) failed and we lost access to the original
+ // |value| object.
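+ //
+ // Typical call pattern (as in ThreadPool::Schedule()):
+ // if (queue.GrowIfNeeded()) queue.Push(std::move(value));
+ // On failure the caller still owns |value| and can handle it directly.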
+ LIBGAV1_MUST_USE_RESULT bool GrowIfNeeded() {
+ assert(last_block_ != nullptr);
+ if (back_ == kBlockCapacity) {
+ if (last_block_->next == first_block_) {
+ // All Blocks are in use.
+ std::unique_ptr<Block> new_block(new (std::nothrow) Block);
+ if (new_block == nullptr) return false;
+ new_block->next = first_block_;
+ last_block_->next = new_block.release();
+ }
+ last_block_ = last_block_->next;
+ back_ = 0;
+ }
+ return true;
+ }
+
+ // Pushes the element |value| to the end of the queue. It is an error to call
+ // Push() when the queue is full.
+ void Push(const T& value) {
+ assert(last_block_ != nullptr);
+ assert(back_ < kBlockCapacity);
+ T* elements = reinterpret_cast<T*>(last_block_->buffer);
+ new (&elements[back_++]) T(value);
+ }
+
+ void Push(T&& value) {
+ assert(last_block_ != nullptr);
+ assert(back_ < kBlockCapacity);
+ T* elements = reinterpret_cast<T*>(last_block_->buffer);
+ new (&elements[back_++]) T(std::move(value));
+ }
+
+ // Returns the element at the front of the queue. It is an error to call
+ // Front() when the queue is empty.
+ T& Front() {
+ assert(!Empty());
+ T* elements = reinterpret_cast<T*>(first_block_->buffer);
+ return elements[front_];
+ }
+
+ const T& Front() const {
+ assert(!Empty());
+ T* elements = reinterpret_cast<T*>(first_block_->buffer);
+ return elements[front_];
+ }
+
+ // Removes the element at the front of the queue from the queue. It is an
+ // error to call Pop() when the queue is empty.
+ void Pop() {
+ assert(!Empty());
+ T* elements = reinterpret_cast<T*>(first_block_->buffer);
+ elements[front_++].~T();
+ if (front_ == kBlockCapacity) {
+ // The first block has become empty.
+ front_ = 0;
+ if (first_block_ == last_block_) {
+ // Only one Block is in use. Simply reset back_.
+ back_ = 0;
+ } else {
+ first_block_ = first_block_->next;
+ }
+ }
+ }
+
+ // Returns true if the queue is empty.
+ bool Empty() const { return first_block_ == last_block_ && front_ == back_; }
+
+ private:
+ // kBlockCapacity is the maximum number of elements each Block can hold.
+ // sizeof(void*) is subtracted from 2048 to account for the |next| pointer in
+ // the Block struct.
+ //
+ // On Linux x86_64, sizeof(std::function<void()>) is 32 and sizeof(void*)
+ // is 8, so each Block can hold (2048 - 8) / 32 = 63 std::function<void()>
+ // objects.
+ //
+ // NOTE: The corresponding value in <deque> in libc++ revision
+ // 245b5ba3448b9d3f6de5962066557e253a6bc9a4 is:
+ // template <class _ValueType, class _DiffType>
+ // struct __deque_block_size {
+ // static const _DiffType value =
+ // sizeof(_ValueType) < 256 ? 4096 / sizeof(_ValueType) : 16;
+ // };
+ //
+ // Note that 4096 / 256 = 16, so apparently this expression is intended to
+ // ensure the block size is at least 4096 bytes and each block can hold at
+ // least 16 elements.
+ static constexpr size_t kBlockCapacity =
+ (sizeof(T) < 128) ? (2048 - sizeof(void*)) / sizeof(T) : 16;
+
+ struct Block : public Allocable {
+ alignas(T) char buffer[kBlockCapacity * sizeof(T)];
+ Block* next;
+ };
+
+ void Destroy() {
+ if (first_block_ == nullptr) return; // An uninitialized queue.
+
+ // First free the unused blocks, which are located after last_block_ and
+ // before first_block_.
+ Block* block = last_block_->next;
+ // Cut the circular list open after last_block_.
+ last_block_->next = nullptr;
+ while (block != first_block_) {
+ Block* next = block->next;
+ delete block;
+ block = next;
+ }
+
+ // Then free the used blocks, destructing the elements they contain.
+ while (block != nullptr) {
+ const size_t begin = (block == first_block_) ? front_ : 0;
+ const size_t end = (block == last_block_) ? back_ : kBlockCapacity;
+ T* elements = reinterpret_cast<T*>(block->buffer);
+ for (size_t i = begin; i < end; ++i) {
+ elements[i].~T();
+ }
+ Block* next = block->next;
+ delete block;
+ block = next;
+ }
+ }
+
+ // Blocks are chained in a circular singly-linked list. If the list of Blocks
+ // is empty, both first_block_ and last_block_ are null pointers. If the list
+ // is nonempty, first_block_ points to the first used Block and last_block_
+ // points to the last used Block.
+ //
+ // Invariant: If Init() is called and succeeds, the list of Blocks is always
+ // nonempty. This allows all methods (except the destructor) to avoid null
+ // pointer checks for first_block_ and last_block_.
+ Block* first_block_ = nullptr;
+ // The index of the element in first_block_ to be removed by Pop().
+ size_t front_ = 0;
+ Block* last_block_ = nullptr;
+ // The index in last_block_ where the new element is inserted by Push().
+ size_t back_ = 0;
+};
+
+#if !LIBGAV1_CXX17
+template <typename T>
+constexpr size_t UnboundedQueue<T>::kBlockCapacity;
+#endif
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/unbounded_queue.h"
+
+#include <new>
+#include <utility>
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+class Integer {
+ public:
+ explicit Integer(int value) : value_(new (std::nothrow) int{value}) {}
+
+ // Move only.
+ Integer(Integer&& other) : value_(other.value_) { other.value_ = nullptr; }
+ Integer& operator=(Integer&& other) {
+ if (this != &other) {
+ delete value_;
+ value_ = other.value_;
+ other.value_ = nullptr;
+ }
+ return *this;
+ }
+
+ ~Integer() { delete value_; }
+
+ int value() const { return *value_; }
+
+ private:
+ int* value_;
+};
+
+TEST(UnboundedQueueTest, Basic) {
+ UnboundedQueue<int> queue;
+ ASSERT_TRUE(queue.Init());
+ EXPECT_TRUE(queue.Empty());
+
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_TRUE(queue.GrowIfNeeded());
+ queue.Push(i);
+ EXPECT_FALSE(queue.Empty());
+ }
+
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_FALSE(queue.Empty());
+ EXPECT_EQ(queue.Front(), i);
+ queue.Pop();
+ }
+ EXPECT_TRUE(queue.Empty());
+}
+
+TEST(UnboundedQueueTest, WrapAround) {
+ UnboundedQueue<int> queue;
+ ASSERT_TRUE(queue.Init());
+ EXPECT_TRUE(queue.Empty());
+
+ for (int i = 0; i < 1000; ++i) {
+ EXPECT_TRUE(queue.GrowIfNeeded());
+ queue.Push(i);
+ EXPECT_FALSE(queue.Empty());
+ EXPECT_EQ(queue.Front(), i);
+ queue.Pop();
+ EXPECT_TRUE(queue.Empty());
+ }
+}
+
+TEST(UnboundedQueueTest, EmptyBeforeInit) {
+ UnboundedQueue<int> queue;
+ EXPECT_TRUE(queue.Empty());
+}
+
+TEST(UnboundedQueueTest, LotsOfElements) {
+ UnboundedQueue<Integer> queue;
+ ASSERT_TRUE(queue.Init());
+ EXPECT_TRUE(queue.Empty());
+
+ for (int i = 0; i < 10000; ++i) {
+ Integer integer(i);
+ EXPECT_EQ(integer.value(), i);
+ EXPECT_TRUE(queue.GrowIfNeeded());
+ queue.Push(std::move(integer));
+ EXPECT_FALSE(queue.Empty());
+ }
+
+ for (int i = 0; i < 5000; ++i) {
+ EXPECT_FALSE(queue.Empty());
+ const Integer& integer = queue.Front();
+ EXPECT_EQ(integer.value(), i);
+ queue.Pop();
+ }
+ // Leave some elements in the queue to test destroying a nonempty queue.
+ EXPECT_FALSE(queue.Empty());
+}
+
+// Copy constructor and assignment are deleted, but move constructor and
+// assignment are OK.
+TEST(UnboundedQueueTest, Move) {
+ UnboundedQueue<int> ints1;
+ ASSERT_TRUE(ints1.Init());
+ EXPECT_TRUE(ints1.GrowIfNeeded());
+ ints1.Push(2);
+ EXPECT_TRUE(ints1.GrowIfNeeded());
+ ints1.Push(3);
+ EXPECT_TRUE(ints1.GrowIfNeeded());
+ ints1.Push(5);
+ EXPECT_TRUE(ints1.GrowIfNeeded());
+ ints1.Push(7);
+
+ // Move constructor.
+ UnboundedQueue<int> ints2(std::move(ints1));
+ EXPECT_EQ(ints2.Front(), 2);
+ ints2.Pop();
+ EXPECT_EQ(ints2.Front(), 3);
+ ints2.Pop();
+ EXPECT_EQ(ints2.Front(), 5);
+ ints2.Pop();
+ EXPECT_EQ(ints2.Front(), 7);
+ ints2.Pop();
+ EXPECT_TRUE(ints2.Empty());
+
+ EXPECT_TRUE(ints2.GrowIfNeeded());
+ ints2.Push(11);
+ EXPECT_TRUE(ints2.GrowIfNeeded());
+ ints2.Push(13);
+ EXPECT_TRUE(ints2.GrowIfNeeded());
+ ints2.Push(17);
+ EXPECT_TRUE(ints2.GrowIfNeeded());
+ ints2.Push(19);
+
+ // Move assignment.
+ UnboundedQueue<int> ints3;
+ ASSERT_TRUE(ints3.Init());
+ EXPECT_TRUE(ints3.GrowIfNeeded());
+ ints3.Push(23);
+ ints3 = std::move(ints2);
+ EXPECT_EQ(ints3.Front(), 11);
+ ints3.Pop();
+ EXPECT_EQ(ints3.Front(), 13);
+ ints3.Pop();
+ EXPECT_EQ(ints3.Front(), 17);
+ ints3.Pop();
+ EXPECT_EQ(ints3.Front(), 19);
+ ints3.Pop();
+ EXPECT_TRUE(ints3.Empty());
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// libgav1::Vector implementation
+
+#ifndef LIBGAV1_SRC_UTILS_VECTOR_H_
+#define LIBGAV1_SRC_UTILS_VECTOR_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <iterator>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace internal {
+
+static constexpr size_t kMinVectorAllocation = 16;
+
+// Returns the smallest power of two greater than or equal to 'value'.
+inline size_t NextPow2(size_t value) {
+ if (value == 0) return 0;
+ --value;
+ for (size_t i = 1; i < sizeof(size_t) * 8; i *= 2) value |= value >> i;
+ return value + 1;
+}
+
+// Returns the smallest capacity greater than or equal to 'value'.
+inline size_t NextCapacity(size_t value) {
+ if (value == 0) return 0;
+ if (value <= kMinVectorAllocation) return kMinVectorAllocation;
+ return NextPow2(value);
+}
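+
+// For example (illustrative): NextPow2(5) == 8, NextPow2(8) == 8, and
+// NextPow2(9) == 16, so NextCapacity(5) == kMinVectorAllocation == 16 and
+// NextCapacity(100) == NextPow2(100) == 128.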
+
+//------------------------------------------------------------------------------
+// Data structure equivalent to std::vector, except that on memory allocation
+// failure it returns false and is left in its last valid state instead of
+// throwing. std::vector with a custom allocator cannot fill this need without
+// exceptions.
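+//
+// Illustrative usage sketch (|count| is a hypothetical caller-side value):
+//   Vector<int> v;
+//   if (!v.reserve(count)) return false;  // Failure is reported, not thrown.
+//   v.push_back_unchecked(42);            // Safe: capacity was reserved.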
+
+template <typename T>
+class VectorBase {
+ public:
+ using iterator = T*;
+ using const_iterator = const T*;
+
+ VectorBase() noexcept = default;
+ // Move only.
+ VectorBase(const VectorBase&) = delete;
+ VectorBase& operator=(const VectorBase&) = delete;
+ VectorBase(VectorBase&& other) noexcept
+ : items_(other.items_),
+ capacity_(other.capacity_),
+ num_items_(other.num_items_) {
+ other.items_ = nullptr;
+ other.capacity_ = 0;
+ other.num_items_ = 0;
+ }
+ VectorBase& operator=(VectorBase&& other) noexcept {
+ if (this != &other) {
+ clear();
+ free(items_);
+ items_ = other.items_;
+ capacity_ = other.capacity_;
+ num_items_ = other.num_items_;
+ other.items_ = nullptr;
+ other.capacity_ = 0;
+ other.num_items_ = 0;
+ }
+ return *this;
+ }
+ ~VectorBase() {
+ clear();
+ free(items_);
+ }
+
+ // Reallocates just enough memory if needed so that 'new_cap' items can fit.
+ LIBGAV1_MUST_USE_RESULT bool reserve(size_t new_cap) {
+ if (capacity_ < new_cap) {
+ T* const new_items = static_cast<T*>(malloc(new_cap * sizeof(T)));
+ if (new_items == nullptr) return false;
+ if (num_items_ > 0) {
+ if (std::is_trivial<T>::value) {
+ // Cast |new_items| and |items_| to void* to avoid the GCC
+ // -Wclass-memaccess warning and additionally the
+ // bugprone-undefined-memory-manipulation clang-tidy warning. The
+ // memcpy is safe because T is a trivial type.
+ memcpy(static_cast<void*>(new_items),
+ static_cast<const void*>(items_), num_items_ * sizeof(T));
+ } else {
+ for (size_t i = 0; i < num_items_; ++i) {
+ new (&new_items[i]) T(std::move(items_[i]));
+ items_[i].~T();
+ }
+ }
+ }
+ free(items_);
+ items_ = new_items;
+ capacity_ = new_cap;
+ }
+ return true;
+ }
+
+ // Reallocates to a smaller buffer so that only the existing items fit.
+ bool shrink_to_fit() {
+ if (capacity_ == num_items_) return true;
+ if (num_items_ == 0) {
+ free(items_);
+ items_ = nullptr;
+ capacity_ = 0;
+ return true;
+ }
+ const size_t previous_capacity = capacity_;
+ capacity_ = 0; // Force reserve() to allocate and copy.
+ if (reserve(num_items_)) return true;
+ capacity_ = previous_capacity;
+ return false;
+ }
+
+ // Constructs a new item by copy constructor. May reallocate if
+ // 'resize_if_needed' is true.
+ LIBGAV1_MUST_USE_RESULT bool push_back(const T& value,
+ bool resize_if_needed = true) {
+ if (num_items_ >= capacity_ &&
+ (!resize_if_needed ||
+ !reserve(internal::NextCapacity(num_items_ + 1)))) {
+ return false;
+ }
+ new (&items_[num_items_]) T(value);
+ ++num_items_;
+ return true;
+ }
+
+ // Constructs a new item by copy constructor. reserve() must have been called
+ // with a sufficient capacity.
+ //
+ // WARNING: No error checking is performed.
+ void push_back_unchecked(const T& value) {
+ assert(num_items_ < capacity_);
+ new (&items_[num_items_]) T(value);
+ ++num_items_;
+ }
+
+ // Constructs a new item by move constructor. May reallocate if
+ // 'resize_if_needed' is true.
+ LIBGAV1_MUST_USE_RESULT bool push_back(T&& value,
+ bool resize_if_needed = true) {
+ if (num_items_ >= capacity_ &&
+ (!resize_if_needed ||
+ !reserve(internal::NextCapacity(num_items_ + 1)))) {
+ return false;
+ }
+ new (&items_[num_items_]) T(std::move(value));
+ ++num_items_;
+ return true;
+ }
+
+ // Constructs a new item by move constructor. reserve() must have been called
+ // with a sufficient capacity.
+ //
+ // WARNING: No error checking is performed.
+ void push_back_unchecked(T&& value) {
+ assert(num_items_ < capacity_);
+ new (&items_[num_items_]) T(std::move(value));
+ ++num_items_;
+ }
+
+ // Constructs a new item in place by forwarding the arguments args... to the
+ // constructor. May reallocate.
+ template <typename... Args>
+ LIBGAV1_MUST_USE_RESULT bool emplace_back(Args&&... args) {
+ if (num_items_ >= capacity_ &&
+ !reserve(internal::NextCapacity(num_items_ + 1))) {
+ return false;
+ }
+ new (&items_[num_items_]) T(std::forward<Args>(args)...);
+ ++num_items_;
+ return true;
+ }
+
+ // Destructs the last item.
+ void pop_back() {
+ --num_items_;
+ items_[num_items_].~T();
+ }
+
+ // Destructs the item at 'pos'.
+ void erase(iterator pos) { erase(pos, pos + 1); }
+
+ // Destructs the items in [first,last).
+ void erase(iterator first, iterator last) {
+ for (iterator it = first; it != last; ++it) it->~T();
+ if (last != end()) {
+ if (std::is_trivial<T>::value) {
+ // Cast |first| and |last| to void* to avoid the GCC
+ // -Wclass-memaccess warning and additionally the
+ // bugprone-undefined-memory-manipulation clang-tidy warning. The
+ // memmove is safe because T is a trivial type.
+ memmove(static_cast<void*>(first), static_cast<const void*>(last),
+ (end() - last) * sizeof(T));
+ } else {
+ for (iterator it_src = last, it_dst = first; it_src != end();
+ ++it_src, ++it_dst) {
+ new (it_dst) T(std::move(*it_src));
+ it_src->~T();
+ }
+ }
+ }
+ num_items_ -= std::distance(first, last);
+ }
+
+ // Destructs all the items.
+ void clear() { erase(begin(), end()); }
+
+ // Destroys (including deallocating) all the items.
+ void reset() {
+ clear();
+ if (!shrink_to_fit()) assert(false);
+ }
+
+ // Accessors
+ bool empty() const { return (num_items_ == 0); }
+ size_t size() const { return num_items_; }
+ size_t capacity() const { return capacity_; }
+
+ T* data() { return items_; }
+ T& front() { return items_[0]; }
+ T& back() { return items_[num_items_ - 1]; }
+ T& operator[](size_t i) { return items_[i]; }
+ T& at(size_t i) { return items_[i]; }
+ const T* data() const { return items_; }
+ const T& front() const { return items_[0]; }
+ const T& back() const { return items_[num_items_ - 1]; }
+ const T& operator[](size_t i) const { return items_[i]; }
+ const T& at(size_t i) const { return items_[i]; }
+
+ iterator begin() { return &items_[0]; }
+ const_iterator begin() const { return &items_[0]; }
+ iterator end() { return &items_[num_items_]; }
+ const_iterator end() const { return &items_[num_items_]; }
+
+ void swap(VectorBase& b) {
+ // Although not necessary here, adding "using std::swap;" and then calling
+ // swap() without namespace qualification is recommended. See Effective
+ // C++, Item 25.
+ using std::swap;
+ swap(items_, b.items_);
+ swap(capacity_, b.capacity_);
+ swap(num_items_, b.num_items_);
+ }
+
+ protected:
+ T* items_ = nullptr;
+ size_t capacity_ = 0;
+ size_t num_items_ = 0;
+};
+
+} // namespace internal
+
+//------------------------------------------------------------------------------
+
+// Vector class that does *NOT* construct the content on resize().
+// Should be reserved for plain old data.
+template <typename T>
+class VectorNoCtor : public internal::VectorBase<T> {
+ public:
+ // Creates or destructs items so that 'new_num_items' exist.
+ // Allocated memory grows to the next power-of-two capacity as needed.
+ LIBGAV1_MUST_USE_RESULT bool resize(size_t new_num_items) {
+ using super = internal::VectorBase<T>;
+ if (super::num_items_ < new_num_items) {
+ if (super::capacity_ < new_num_items) {
+ if (!super::reserve(internal::NextCapacity(new_num_items))) {
+ return false;
+ }
+ }
+ super::num_items_ = new_num_items;
+ } else {
+ while (super::num_items_ > new_num_items) {
+ --super::num_items_;
+ super::items_[super::num_items_].~T();
+ }
+ }
+ return true;
+ }
+};
+
+// This generic vector class will call the constructors.
+template <typename T>
+class Vector : public internal::VectorBase<T> {
+ public:
+ // Constructs or destructs items so that 'new_num_items' exist.
+ // Allocated memory grows to the next power-of-two capacity as needed.
+ LIBGAV1_MUST_USE_RESULT bool resize(size_t new_num_items) {
+ using super = internal::VectorBase<T>;
+ if (super::num_items_ < new_num_items) {
+ if (super::capacity_ < new_num_items) {
+ if (!super::reserve(internal::NextCapacity(new_num_items))) {
+ return false;
+ }
+ }
+ while (super::num_items_ < new_num_items) {
+ new (&super::items_[super::num_items_]) T();
+ ++super::num_items_;
+ }
+ } else {
+ while (super::num_items_ > new_num_items) {
+ --super::num_items_;
+ super::items_[super::num_items_].~T();
+ }
+ }
+ return true;
+ }
+};
+
+//------------------------------------------------------------------------------
+
+// Define non-member swap() functions in the namespace in which VectorNoCtor
+// and Vector are implemented. See Effective C++, Item 25.
+
+template <typename T>
+void swap(VectorNoCtor<T>& a, VectorNoCtor<T>& b) {
+ a.swap(b);
+}
+
+template <typename T>
+void swap(Vector<T>& a, Vector<T>& b) {
+ a.swap(b);
+}
+
+//------------------------------------------------------------------------------
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_UTILS_VECTOR_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/vector.h"
+
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "src/utils/compiler_attributes.h"
+
+#if LIBGAV1_MSAN
+#include <sanitizer/msan_interface.h>
+#endif
+
+namespace libgav1 {
+namespace {
+
+class Foo {
+ public:
+ Foo() = default;
+
+ int x() const { return x_; }
+
+ private:
+ int x_ = 38;
+};
+
+class Point {
+ public:
+ Point(int x, int y) : x_(x), y_(y) {}
+
+ int x() const { return x_; }
+ int y() const { return y_; }
+
+ private:
+ int x_;
+ int y_;
+};
+
+TEST(VectorTest, NoCtor) {
+ VectorNoCtor<int> v;
+ EXPECT_TRUE(v.resize(100));
+ Vector<int> w;
+ EXPECT_TRUE(w.resize(100));
+
+#if LIBGAV1_MSAN
+ // Use MemorySanitizer to check VectorNoCtor::resize() does not initialize
+ // the memory while Vector::resize() does.
+ //
+ // __msan_test_shadow(const void *x, uptr size) returns the offset of the
+ // first (at least partially) poisoned byte in the range, or -1 if the whole
+ // range is good.
+ for (size_t i = 0; i < 100; ++i) {
+ EXPECT_EQ(__msan_test_shadow(&v[i], sizeof(int)), 0);
+ EXPECT_EQ(__msan_test_shadow(&w[i], sizeof(int)), -1);
+ }
+#endif
+}
+
+TEST(VectorTest, Constructor) {
+ Vector<Foo> v;
+ EXPECT_TRUE(v.resize(100));
+ for (const Foo& foo : v) {
+ EXPECT_EQ(foo.x(), 38);
+ }
+}
+
+TEST(VectorTest, PushBack) {
+ // Create a vector containing integers
+ Vector<int> v;
+ EXPECT_TRUE(v.reserve(8));
+ EXPECT_EQ(v.size(), 0);
+
+ EXPECT_TRUE(v.push_back(25));
+ EXPECT_EQ(v.size(), 1);
+ EXPECT_EQ(v[0], 25);
+
+ EXPECT_TRUE(v.push_back(13));
+ EXPECT_EQ(v.size(), 2);
+ EXPECT_EQ(v[0], 25);
+ EXPECT_EQ(v[1], 13);
+}
+
+TEST(VectorTest, PushBackUnchecked) {
+ Vector<std::unique_ptr<Point>> v;
+ EXPECT_TRUE(v.reserve(2));
+ EXPECT_EQ(v.size(), 0);
+
+ std::unique_ptr<Point> point(new (std::nothrow) Point(1, 2));
+ EXPECT_NE(point, nullptr);
+ v.push_back_unchecked(std::move(point));
+ EXPECT_EQ(v.size(), 1);
+ EXPECT_EQ(v[0]->x(), 1);
+ EXPECT_EQ(v[0]->y(), 2);
+
+ point.reset(new (std::nothrow) Point(3, 4));
+ EXPECT_NE(point, nullptr);
+ v.push_back_unchecked(std::move(point));
+ EXPECT_EQ(v.size(), 2);
+ EXPECT_EQ(v[0]->x(), 1);
+ EXPECT_EQ(v[0]->y(), 2);
+ EXPECT_EQ(v[1]->x(), 3);
+ EXPECT_EQ(v[1]->y(), 4);
+}
+
+TEST(VectorTest, EmplaceBack) {
+ Vector<Point> v;
+ EXPECT_EQ(v.size(), 0);
+
+ EXPECT_TRUE(v.emplace_back(1, 2));
+ EXPECT_EQ(v.size(), 1);
+ EXPECT_EQ(v[0].x(), 1);
+ EXPECT_EQ(v[0].y(), 2);
+
+ EXPECT_TRUE(v.emplace_back(3, 4));
+ EXPECT_EQ(v.size(), 2);
+ EXPECT_EQ(v[0].x(), 1);
+ EXPECT_EQ(v[0].y(), 2);
+ EXPECT_EQ(v[1].x(), 3);
+ EXPECT_EQ(v[1].y(), 4);
+}
+
+// Copy constructor and assignment are deleted, but move constructor and
+// assignment are OK.
+TEST(VectorTest, Move) {
+ Vector<int> ints1;
+ EXPECT_TRUE(ints1.reserve(4));
+ EXPECT_TRUE(ints1.push_back(2));
+ EXPECT_TRUE(ints1.push_back(3));
+ EXPECT_TRUE(ints1.push_back(5));
+ EXPECT_TRUE(ints1.push_back(7));
+
+ // Move constructor.
+ Vector<int> ints2(std::move(ints1));
+ EXPECT_EQ(ints2.size(), 4);
+ EXPECT_EQ(ints2[0], 2);
+ EXPECT_EQ(ints2[1], 3);
+ EXPECT_EQ(ints2[2], 5);
+ EXPECT_EQ(ints2[3], 7);
+
+ // Move assignment.
+ Vector<int> ints3;
+ EXPECT_TRUE(ints3.reserve(1));
+ EXPECT_TRUE(ints3.push_back(11));
+ ints3 = std::move(ints2);
+ EXPECT_EQ(ints3.size(), 4);
+ EXPECT_EQ(ints3[0], 2);
+ EXPECT_EQ(ints3[1], 3);
+ EXPECT_EQ(ints3[2], 5);
+ EXPECT_EQ(ints3[3], 7);
+}
+
+TEST(VectorTest, Erase) {
+ Vector<int> ints;
+ EXPECT_TRUE(ints.reserve(4));
+ EXPECT_TRUE(ints.push_back(2));
+ EXPECT_TRUE(ints.push_back(3));
+ EXPECT_TRUE(ints.push_back(5));
+ EXPECT_TRUE(ints.push_back(7));
+
+ EXPECT_EQ(ints.size(), 4);
+ EXPECT_EQ(ints[0], 2);
+ EXPECT_EQ(ints[1], 3);
+ EXPECT_EQ(ints[2], 5);
+ EXPECT_EQ(ints[3], 7);
+
+ ints.erase(ints.begin());
+ EXPECT_EQ(ints.size(), 3);
+ EXPECT_EQ(ints[0], 3);
+ EXPECT_EQ(ints[1], 5);
+ EXPECT_EQ(ints[2], 7);
+}
+
+TEST(VectorTest, EraseNonTrivial) {
+ // A simple class that sets an int value to 0 in the destructor.
+ class Cleaner {
+ public:
+ explicit Cleaner(int* value) : value_(value) {}
+ ~Cleaner() { *value_ = 0; }
+
+ int value() const { return *value_; }
+
+ private:
+ int* value_;
+ };
+ int value1 = 100;
+ int value2 = 200;
+ Vector<std::unique_ptr<Cleaner>> v;
+ EXPECT_TRUE(v.reserve(2));
+ EXPECT_EQ(v.capacity(), 2);
+
+ std::unique_ptr<Cleaner> c(new (std::nothrow) Cleaner(&value1));
+ EXPECT_NE(c, nullptr);
+ EXPECT_TRUE(v.push_back(std::move(c)));
+ c.reset(new (std::nothrow) Cleaner(&value2));
+ EXPECT_NE(c, nullptr);
+ EXPECT_TRUE(v.push_back(std::move(c)));
+ EXPECT_EQ(v.size(), 2);
+ EXPECT_EQ(value1, 100);
+ EXPECT_EQ(value2, 200);
+
+ v.erase(v.begin());
+ EXPECT_EQ(v.size(), 1);
+ EXPECT_EQ(v.capacity(), 2);
+ EXPECT_EQ(value1, 0);
+ EXPECT_EQ(value2, 200);
+ EXPECT_EQ(v[0].get()->value(), value2);
+
+ EXPECT_TRUE(v.shrink_to_fit());
+ EXPECT_EQ(v.size(), 1);
+ EXPECT_EQ(v.capacity(), 1);
+ EXPECT_EQ(value2, 200);
+ EXPECT_EQ(v[0].get()->value(), value2);
+
+ v.clear();
+ EXPECT_TRUE(v.empty());
+ EXPECT_EQ(value2, 0);
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/version.h"
+
+#define LIBGAV1_TOSTRING(x) #x
+#define LIBGAV1_STRINGIFY(x) LIBGAV1_TOSTRING(x)
+#define LIBGAV1_DOT_SEPARATED(M, m, p) M##.##m##.##p
+#define LIBGAV1_DOT_SEPARATED_VERSION(M, m, p) LIBGAV1_DOT_SEPARATED(M, m, p)
+#define LIBGAV1_DOT_VERSION \
+ LIBGAV1_DOT_SEPARATED_VERSION(LIBGAV1_MAJOR_VERSION, LIBGAV1_MINOR_VERSION, \
+ LIBGAV1_PATCH_VERSION)
+
+#define LIBGAV1_VERSION_STRING LIBGAV1_STRINGIFY(LIBGAV1_DOT_VERSION)
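+
+// For example (with illustrative version numbers): LIBGAV1_MAJOR_VERSION == 0,
+// LIBGAV1_MINOR_VERSION == 16, and LIBGAV1_PATCH_VERSION == 3 make the token
+// pasting above produce the preprocessing number 0.16.3, which
+// LIBGAV1_STRINGIFY then converts into the string literal "0.16.3".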
+
+extern "C" {
+
+int Libgav1GetVersion() { return LIBGAV1_VERSION; }
+const char* Libgav1GetVersionString() { return LIBGAV1_VERSION_STRING; }
+
+const char* Libgav1GetBuildConfiguration() {
+ // TODO(jzern): cmake can generate the detail, or in other cases we could
+ // produce one based on the known defines, along with the defaults based on
+ // the toolchain, e.g., LIBGAV1_ENABLE_NEON from cpu.h.
+ return "Not available.";
+}
+
+} // extern "C"
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/version.h"
+
+#include <regex> // NOLINT (unapproved c++11 header)
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(VersionTest, GetVersion) {
+ const int library_version = GetVersion();
+ EXPECT_EQ((library_version >> 24) & 0xff, 0);
+ // Note: if we link against a shared object, there's potential for a
+ // mismatch if a different library version is loaded at runtime.
+ EXPECT_EQ((library_version >> 16) & 0xff, LIBGAV1_MAJOR_VERSION);
+ EXPECT_EQ((library_version >> 8) & 0xff, LIBGAV1_MINOR_VERSION);
+ EXPECT_EQ(library_version & 0xff, LIBGAV1_PATCH_VERSION);
+
+ const int header_version = LIBGAV1_VERSION;
+ EXPECT_EQ((header_version >> 24) & 0xff, 0);
+ EXPECT_EQ((header_version >> 16) & 0xff, LIBGAV1_MAJOR_VERSION);
+ EXPECT_EQ((header_version >> 8) & 0xff, LIBGAV1_MINOR_VERSION);
+ EXPECT_EQ(header_version & 0xff, LIBGAV1_PATCH_VERSION);
+}
+
+TEST(VersionTest, GetVersionString) {
+ const char* version = GetVersionString();
+ ASSERT_NE(version, nullptr);
+ // https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string
+ const std::regex semver_regex(
+ R"(^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*))"
+ R"((?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))"
+ R"((?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?)"
+ R"((?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$)");
+
+ EXPECT_TRUE(std::regex_match(version, semver_regex)) << version;
+ // Regex validation:
+ // It shouldn't accept a version starting with a non-digit.
+ version = "v1.2.3";
+ EXPECT_FALSE(std::regex_match(version, semver_regex)) << version;
+ // It shouldn't accept a version with spaces."
+ version = "1.2.3 alpha";
+ EXPECT_FALSE(std::regex_match(version, semver_regex)) << version;
+}
+
+TEST(VersionTest, GetBuildConfiguration) {
+ const char* config = GetBuildConfiguration();
+ ASSERT_NE(config, nullptr);
+}
+
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/warp_prediction.h"
+
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/tile.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kWarpModelTranslationClamp = 1 << 23;
+constexpr int kWarpModelAffineClamp = 1 << 13;
+constexpr int kLargestMotionVectorDiff = 256;
+
+constexpr uint16_t kDivisorLookup[257] = {
+ 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+ 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+ 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+ 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+ 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+ 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+ 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+ 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+ 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+ 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+ 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+ 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+ 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+ 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+ 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
+ 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
+ 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
+ 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
+ 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
+ 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
+ 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
+ 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
+ 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
+ 8240, 8224, 8208, 8192};
+
+// Number of fractional bits of lookup in divisor lookup table.
+constexpr int kDivisorLookupBits = 8;
+// Number of fractional bits of entries in divisor lookup table.
+constexpr int kDivisorLookupPrecisionBits = 14;
+
+// 7.11.3.7.
+template <typename T>
+void GenerateApproximateDivisor(T value, int16_t* division_factor,
+ int16_t* division_shift) {
+ const int n = FloorLog2(std::abs(value));
+ const T e = std::abs(value) - (static_cast<T>(1) << n);
+ const int entry = (n > kDivisorLookupBits)
+ ? RightShiftWithRounding(e, n - kDivisorLookupBits)
+ : static_cast<int>(e << (kDivisorLookupBits - n));
+ *division_shift = n + kDivisorLookupPrecisionBits;
+ *division_factor =
+ (value < 0) ? -kDivisorLookup[entry] : kDivisorLookup[entry];
+}
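+
+// For example (illustrative): for value == 3, n == 1 and e == 1, so
+// entry == 1 << (8 - 1) == 128 and division_shift == 1 + 14 == 15; 1/3 is
+// thus approximated by kDivisorLookup[128] / (1 << 15) == 10923 / 32768.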
+
+// 7.11.3.8.
+int LeastSquareProduct(int a, int b) { return ((a * b) >> 2) + a + b; }
+
+// 7.11.3.8.
+int DiagonalClamp(int32_t value) {
+ return Clip3(value,
+ (1 << kWarpedModelPrecisionBits) - kWarpModelAffineClamp + 1,
+ (1 << kWarpedModelPrecisionBits) + kWarpModelAffineClamp - 1);
+}
+
+// 7.11.3.8.
+int NonDiagonalClamp(int32_t value) {
+ return Clip3(value, -kWarpModelAffineClamp + 1, kWarpModelAffineClamp - 1);
+}
+
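+// Clamps |value| to the int16_t range, then rounds it to a multiple of
+// (1 << kWarpParamRoundingBits).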
+int16_t GetShearParameter(int value) {
+ return static_cast<int16_t>(
+ LeftShift(RightShiftWithRoundingSigned(Clip3(value, INT16_MIN, INT16_MAX),
+ kWarpParamRoundingBits),
+ kWarpParamRoundingBits));
+}
+
+} // namespace
+
+bool SetupShear(GlobalMotion* const warp_params) {
+ int16_t division_shift;
+ int16_t division_factor;
+ const auto* const params = warp_params->params;
+ GenerateApproximateDivisor<int32_t>(params[2], &division_factor,
+ &division_shift);
+ const int alpha = params[2] - (1 << kWarpedModelPrecisionBits);
+ const int beta = params[3];
+ const int64_t v = LeftShift(params[4], kWarpedModelPrecisionBits);
+ const int gamma =
+ RightShiftWithRoundingSigned(v * division_factor, division_shift);
+ const int64_t w = static_cast<int64_t>(params[3]) * params[4];
+ const int delta =
+ params[5] -
+ RightShiftWithRoundingSigned(w * division_factor, division_shift) -
+ (1 << kWarpedModelPrecisionBits);
+
+ warp_params->alpha = GetShearParameter(alpha);
+ warp_params->beta = GetShearParameter(beta);
+ warp_params->gamma = GetShearParameter(gamma);
+ warp_params->delta = GetShearParameter(delta);
+ if ((4 * std::abs(warp_params->alpha) + 7 * std::abs(warp_params->beta) >=
+ (1 << kWarpedModelPrecisionBits)) ||
+ (4 * std::abs(warp_params->gamma) + 4 * std::abs(warp_params->delta) >=
+ (1 << kWarpedModelPrecisionBits))) {
+ return false; // NOLINT (easier condition to understand).
+ }
+
+ return true;
+}
+
+bool WarpEstimation(const int num_samples, const int block_width4x4,
+ const int block_height4x4, const int row4x4,
+ const int column4x4, const MotionVector& mv,
+ const int candidates[kMaxLeastSquaresSamples][4],
+ GlobalMotion* const warp_params) {
+ // Each entry of |a| fits into int32_t, but we declare |a| as int64_t so
+ // that the computations below are done in 64 bits without explicit casts.
+ int64_t a[2][2] = {};
+ int bx[2] = {};
+ int by[2] = {};
+
+ // Note: for simplicity, the spec always uses absolute coordinates in the
+ // warp estimation process. subpixel_mid_x, subpixel_mid_y, and candidates
+ // are relative to the top left of the frame. In contrast, libaom uses a
+ // mixture of coordinate systems: in av1/common/warped_motion.c,
+ // find_affine_int() uses coordinates relative to the top left of the block.
+ // mid_y/mid_x: the row/column coordinate of the center of the block.
+ const int mid_y = MultiplyBy4(row4x4) + MultiplyBy2(block_height4x4) - 1;
+ const int mid_x = MultiplyBy4(column4x4) + MultiplyBy2(block_width4x4) - 1;
+ const int subpixel_mid_y = MultiplyBy8(mid_y);
+ const int subpixel_mid_x = MultiplyBy8(mid_x);
+ const int reference_subpixel_mid_y = subpixel_mid_y + mv.mv[0];
+ const int reference_subpixel_mid_x = subpixel_mid_x + mv.mv[1];
+
+ for (int i = 0; i < num_samples; ++i) {
+ // candidates[][0] and candidates[][1] are the row/column coordinates of the
+ // sample point in this block, relative to the top left of the frame.
+ // candidates[][2] and candidates[][3] are the row/column coordinates of the
+ // sample point in the reference block, relative to the top left of the
+ // frame.
+ // sy/sx: the row/column coordinates of the sample point, with the center of
+ // the block as origin.
+ const int sy = candidates[i][0] - subpixel_mid_y;
+ const int sx = candidates[i][1] - subpixel_mid_x;
+ // dy/dx: the row/column coordinates of the sample point in the reference
+ // block, with center of the reference block as origin.
+ const int dy = candidates[i][2] - reference_subpixel_mid_y;
+ const int dx = candidates[i][3] - reference_subpixel_mid_x;
+ if (std::abs(sx - dx) < kLargestMotionVectorDiff &&
+ std::abs(sy - dy) < kLargestMotionVectorDiff) {
+ a[0][0] += LeastSquareProduct(sx, sx) + 8;
+ a[0][1] += LeastSquareProduct(sx, sy) + 4;
+ a[1][1] += LeastSquareProduct(sy, sy) + 8;
+ bx[0] += LeastSquareProduct(sx, dx) + 8;
+ bx[1] += LeastSquareProduct(sy, dx) + 4;
+ by[0] += LeastSquareProduct(sx, dy) + 4;
+ by[1] += LeastSquareProduct(sy, dy) + 8;
+ }
+ }
+
+ // a[0][1] == a[1][0], because the matrix is symmetric. We don't have to
+ // compute a[1][0].
+ const int64_t determinant = a[0][0] * a[1][1] - a[0][1] * a[0][1];
+ if (determinant == 0) return false;
+
+ int16_t division_shift;
+ int16_t division_factor;
+ GenerateApproximateDivisor<int64_t>(determinant, &division_factor,
+ &division_shift);
+
+ division_shift -= kWarpedModelPrecisionBits;
+
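+ // Solve the 2x2 normal equations A * x = b: multiply the adjugate of the
+ // symmetric matrix A (stored in |a|) by the right-hand sides bx/by; the
+ // factor 1/determinant is applied below via the approximate divisor.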
+ const int64_t params_2 = a[1][1] * bx[0] - a[0][1] * bx[1];
+ const int64_t params_3 = -a[0][1] * bx[0] + a[0][0] * bx[1];
+ const int64_t params_4 = a[1][1] * by[0] - a[0][1] * by[1];
+ const int64_t params_5 = -a[0][1] * by[0] + a[0][0] * by[1];
+ auto* const params = warp_params->params;
+
+ if (division_shift <= 0) {
+ division_factor <<= -division_shift;
+ params[2] = static_cast<int32_t>(params_2) * division_factor;
+ params[3] = static_cast<int32_t>(params_3) * division_factor;
+ params[4] = static_cast<int32_t>(params_4) * division_factor;
+ params[5] = static_cast<int32_t>(params_5) * division_factor;
+ } else {
+ params[2] = RightShiftWithRoundingSigned(params_2 * division_factor,
+ division_shift);
+ params[3] = RightShiftWithRoundingSigned(params_3 * division_factor,
+ division_shift);
+ params[4] = RightShiftWithRoundingSigned(params_4 * division_factor,
+ division_shift);
+ params[5] = RightShiftWithRoundingSigned(params_5 * division_factor,
+ division_shift);
+ }
+
+ params[2] = DiagonalClamp(params[2]);
+ params[3] = NonDiagonalClamp(params[3]);
+ params[4] = NonDiagonalClamp(params[4]);
+ params[5] = DiagonalClamp(params[5]);
+
+ const int vx = mv.mv[1] * (1 << (kWarpedModelPrecisionBits - 3)) -
+ (mid_x * (params[2] - (1 << kWarpedModelPrecisionBits)) +
+ mid_y * params[3]);
+ const int vy = mv.mv[0] * (1 << (kWarpedModelPrecisionBits - 3)) -
+ (mid_x * params[4] +
+ mid_y * (params[5] - (1 << kWarpedModelPrecisionBits)));
+ params[0] =
+ Clip3(vx, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
+ params[1] =
+ Clip3(vy, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
+ return true;
+}
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_WARP_PREDICTION_H_
+#define LIBGAV1_SRC_WARP_PREDICTION_H_
+
+#include "src/obu_parser.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Sets the alpha, beta, gamma, delta fields in warp_params using the
+// warp_params->params array as input (only array entries at indexes 2, 3, 4,
+// 5 are used). Returns whether alpha, beta, gamma, delta are valid.
+bool SetupShear(GlobalMotion* warp_params); // 7.11.3.6.
+
+// Computes local warp parameters by performing a least square fit.
+// Returns whether the computed parameters are valid.
+bool WarpEstimation(int num_samples, int block_width4x4, int block_height4x4,
+ int row4x4, int column4x4, const MotionVector& mv,
+ const int candidates[kMaxLeastSquaresSamples][4],
+ GlobalMotion* warp_params); // 7.11.3.8.
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_WARP_PREDICTION_H_
--- /dev/null
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/warp_prediction.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <ostream>
+
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
+#include "src/obu_parser.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+#include "tests/third_party/libvpx/acm_random.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int16_t kExpectedWarpParamsOutput[10][4] = {
+ {0, 0, 0, 0},
+ {2880, 2880, 2752, 2752},
+ {-1408, -1408, -1472, -1472},
+ {0, 0, 0, 0},
+ {6784, 6784, 6144, 6144}, // Invalid.
+ {-5312, -5312, -5824, -5824},
+ {-3904, -3904, -4160, -4160},
+ {2496, 2496, 2368, 2368},
+ {1024, 1024, 1024, 1024},
+ {-7808, -7808, -8832, -8832}, // Invalid.
+};
+
+constexpr bool kExpectedWarpValid[10] = {
+ true, true, true, true, false, true, true, true, true, false,
+};
+
+int RandomWarpedParam(int seed_offset, int bits) {
+ libvpx_test::ACMRandom rnd(seed_offset +
+ libvpx_test::ACMRandom::DeterministicSeed());
+ // 1 in 8 chance of generating zero (arbitrary).
+ const bool zero = (rnd.Rand16() & 7) == 0;
+ if (zero) return 0;
+ // Generate uniform values in the range [-(1 << bits), -1] U [1, 1 << bits].
+ const int mask = (1 << bits) - 1;
+ const int value = 1 + (rnd.RandRange(1U << 31) & mask);
+ const bool sign = (rnd.Rand16() & 1) != 0;
+ return sign ? value : -value;
+}
+
+void GenerateWarpedModel(GlobalMotion* warp_params, int seed) {
+ do {
+ warp_params->params[0] =
+ RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6);
+ warp_params->params[1] =
+ RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6);
+ warp_params->params[2] =
+ RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) +
+ (1 << kWarpedModelPrecisionBits);
+ warp_params->params[3] =
+ RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3);
+ warp_params->params[4] =
+ RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3);
+ warp_params->params[5] =
+ RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) +
+ (1 << kWarpedModelPrecisionBits);
+ } while (warp_params->params[2] == 0);
+}
+
+TEST(WarpPredictionTest, SetupShear) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(kExpectedWarpParamsOutput); ++i) {
+ GlobalMotion warp_params;
+ GenerateWarpedModel(&warp_params, static_cast<int>(i));
+ const bool warp_valid = SetupShear(&warp_params);
+
+ SCOPED_TRACE(testing::Message() << "Test failure at iteration: " << i);
+ EXPECT_EQ(warp_valid, kExpectedWarpValid[i]);
+ EXPECT_EQ(warp_params.alpha, kExpectedWarpParamsOutput[i][0]);
+ EXPECT_EQ(warp_params.beta, kExpectedWarpParamsOutput[i][1]);
+ EXPECT_EQ(warp_params.gamma, kExpectedWarpParamsOutput[i][2]);
+ EXPECT_EQ(warp_params.delta, kExpectedWarpParamsOutput[i][3]);
+ }
+
+ // Test signed shift behavior in delta and gamma generation.
+ GlobalMotion warp_params;
+ warp_params.params[0] = 24748;
+ warp_params.params[1] = -142530;
+ warp_params.params[2] = 65516;
+ warp_params.params[3] = -640;
+ warp_params.params[4] = 256;
+ warp_params.params[5] = 65310;
+ EXPECT_TRUE(SetupShear(&warp_params));
+ EXPECT_EQ(warp_params.alpha, 0);
+ EXPECT_EQ(warp_params.beta, -640);
+ EXPECT_EQ(warp_params.gamma, 256);
+ EXPECT_EQ(warp_params.delta, -192);
+
+ warp_params.params[0] = 24748;
+ warp_params.params[1] = -142530;
+ warp_params.params[2] = 61760;
+ warp_params.params[3] = -640;
+ warp_params.params[4] = -13312;
+ warp_params.params[5] = 65310;
+ EXPECT_TRUE(SetupShear(&warp_params));
+ EXPECT_EQ(warp_params.alpha, -3776);
+ EXPECT_EQ(warp_params.beta, -640);
+ EXPECT_EQ(warp_params.gamma, -14144);
+ EXPECT_EQ(warp_params.delta, -384);
+}
+
+struct WarpInputParam {
+ WarpInputParam(int num_samples, int block_width4x4, int block_height4x4)
+ : num_samples(num_samples),
+ block_width4x4(block_width4x4),
+ block_height4x4(block_height4x4) {}
+ int num_samples;
+ int block_width4x4;
+ int block_height4x4;
+};
+
+std::ostream& operator<<(std::ostream& os, const WarpInputParam& param) {
+ return os << "num_samples: " << param.num_samples
+ << ", block_(width/height)4x4: " << param.block_width4x4 << "x"
+ << param.block_height4x4;
+}
+
+const WarpInputParam warp_test_param[] = {
+ // sample = 1.
+ WarpInputParam(1, 1, 1),
+ WarpInputParam(1, 1, 2),
+ WarpInputParam(1, 2, 1),
+ WarpInputParam(1, 2, 2),
+ WarpInputParam(1, 2, 4),
+ WarpInputParam(1, 4, 2),
+ WarpInputParam(1, 4, 4),
+ WarpInputParam(1, 4, 8),
+ WarpInputParam(1, 8, 4),
+ WarpInputParam(1, 8, 8),
+ WarpInputParam(1, 8, 16),
+ WarpInputParam(1, 16, 8),
+ WarpInputParam(1, 16, 16),
+ WarpInputParam(1, 16, 32),
+ WarpInputParam(1, 32, 16),
+ WarpInputParam(1, 32, 32),
+ // sample = 8.
+ WarpInputParam(8, 1, 1),
+ WarpInputParam(8, 1, 2),
+ WarpInputParam(8, 2, 1),
+ WarpInputParam(8, 2, 2),
+ WarpInputParam(8, 2, 4),
+ WarpInputParam(8, 4, 2),
+ WarpInputParam(8, 4, 4),
+ WarpInputParam(8, 4, 8),
+ WarpInputParam(8, 8, 4),
+ WarpInputParam(8, 8, 8),
+ WarpInputParam(8, 8, 16),
+ WarpInputParam(8, 16, 8),
+ WarpInputParam(8, 16, 16),
+ WarpInputParam(8, 16, 32),
+ WarpInputParam(8, 32, 16),
+ WarpInputParam(8, 32, 32),
+};
+
+constexpr bool kExpectedWarpEstimationValid[2] = {false, true};
+
+constexpr int kExpectedWarpEstimationOutput[16][6] = {
+ {8388607, 8388607, 57345, -8191, -8191, 57345},
+ {8388607, 8388607, 57345, -8191, -8191, 57345},
+ {8388607, 8388607, 57345, -8191, -8191, 57345},
+ {8388607, 8388607, 57345, -8191, -8191, 57345},
+ {8388607, 8388607, 57345, -8191, -8191, 57345},
+ {8388607, 8388607, 57345, -8191, -8191, 57345},
+ {8388607, 8388607, 57345, -8191, -8191, 57345},
+ {8388607, 8388607, 57345, -8191, -8191, 57345},
+ {8388607, 8388607, 57345, -8191, -8191, 57345},
+ {8388607, 8388607, 57345, -8191, -8191, 57345},
+ {2146296, 1589240, 57345, 8191, -8191, 73727},
+ {1753128, 1196072, 73727, -8191, 8191, 57345},
+ {-8388608, -8388608, 73727, 8191, 8191, 73727},
+ {-4435485, -8388608, 65260, 8191, 8191, 73727},
+ {-8388608, -7552929, 73727, 8191, 8191, 68240},
+ {-8388608, -8388608, 73727, 8191, 8191, 70800},
+};
+
+class WarpEstimationTest : public testing::TestWithParam<WarpInputParam> {
+ public:
+ WarpEstimationTest() = default;
+ ~WarpEstimationTest() override = default;
+
+ protected:
+ WarpInputParam param_ = GetParam();
+};
+
+TEST_P(WarpEstimationTest, WarpEstimation) {
+ // Set input params.
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ const int row4x4 = rnd.Rand8();
+ const int column4x4 = rnd.Rand8();
+ MotionVector mv;
+ mv.mv[0] = rnd.Rand8();
+ mv.mv[1] = rnd.Rand8();
+ int candidates[kMaxLeastSquaresSamples][4];
+ for (int i = 0; i < param_.num_samples; ++i) {
+ // Make candidates relative to the top left of the frame.
+ candidates[i][0] = rnd.Rand8() + MultiplyBy32(row4x4);
+ candidates[i][1] = rnd.Rand8() + MultiplyBy32(column4x4);
+ candidates[i][2] = rnd.Rand8() + MultiplyBy32(row4x4);
+ candidates[i][3] = rnd.Rand8() + MultiplyBy32(column4x4);
+ }
+
+ // Get output.
+ GlobalMotion warp_params;
+ const bool warp_success = WarpEstimation(
+ param_.num_samples, param_.block_width4x4, param_.block_height4x4, row4x4,
+ column4x4, mv, candidates, &warp_params);
+ if (param_.num_samples == 1) {
+ EXPECT_EQ(warp_success, kExpectedWarpEstimationValid[0]);
+ } else {
+ EXPECT_EQ(warp_success, kExpectedWarpEstimationValid[1]);
+ int index = FloorLog2(param_.block_width4x4) * 3 - 1;
+ if (param_.block_width4x4 == param_.block_height4x4) {
+ index += 1;
+ } else if (param_.block_width4x4 < param_.block_height4x4) {
+ index += 2;
+ }
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(warp_params.params); ++i) {
+ EXPECT_EQ(warp_params.params[i], kExpectedWarpEstimationOutput[index][i]);
+ }
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(WarpFuncTest, WarpEstimationTest,
+ testing::ValuesIn(warp_test_param));
+} // namespace
+} // namespace libgav1
--- /dev/null
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/yuv_buffer.h"
+
+#include <cassert>
+#include <cstddef>
+#include <new>
+
+#include "src/frame_buffer_utils.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+
+// Size conventions:
+// * Widths, heights, and border sizes are in pixels.
+// * Strides and plane sizes are in bytes.
+//
+// YuvBuffer objects may be reused through the BufferPool. Realloc() must
+// assume that data members (except buffer_alloc_ and buffer_alloc_size_) may
+// contain stale values from the previous use, and must set all data members
+// from scratch. In particular, Realloc() must not rely on the initial values
+// of data members set by the YuvBuffer constructor.
+bool YuvBuffer::Realloc(int bitdepth, bool is_monochrome, int width, int height,
+ int8_t subsampling_x, int8_t subsampling_y,
+ int left_border, int right_border, int top_border,
+ int bottom_border,
+ GetFrameBufferCallback get_frame_buffer,
+ void* callback_private_data,
+ void** buffer_private_data) {
+ // Only support allocating buffers whose borders are a multiple of 2. The
+ // restriction is required because the borders may be subsampled in the
+ // chroma planes.
+ if (((left_border | right_border | top_border | bottom_border) & 1) != 0) {
+ LIBGAV1_DLOG(ERROR,
+ "Borders must be a multiple of 2: left_border = %d, "
+ "right_border = %d, top_border = %d, bottom_border = %d.",
+ left_border, right_border, top_border, bottom_border);
+ return false;
+ }
+
+ // Every row in the plane buffers needs to be kFrameBufferRowAlignment-byte
+ // aligned. Since the strides are multiples of kFrameBufferRowAlignment bytes,
+ // it suffices to just make the plane buffers kFrameBufferRowAlignment-byte
+ // aligned.
+ const int plane_align = kFrameBufferRowAlignment;
+ const int uv_width =
+ is_monochrome ? 0 : SubsampledValue(width, subsampling_x);
+ const int uv_height =
+ is_monochrome ? 0 : SubsampledValue(height, subsampling_y);
+ const int uv_left_border = is_monochrome ? 0 : left_border >> subsampling_x;
+ const int uv_right_border = is_monochrome ? 0 : right_border >> subsampling_x;
+ const int uv_top_border = is_monochrome ? 0 : top_border >> subsampling_y;
+ const int uv_bottom_border =
+ is_monochrome ? 0 : bottom_border >> subsampling_y;
+
+ if (get_frame_buffer != nullptr) {
+ assert(buffer_private_data != nullptr);
+
+ const Libgav1ImageFormat image_format =
+ ComposeImageFormat(is_monochrome, subsampling_x, subsampling_y);
+ FrameBuffer frame_buffer;
+ if (get_frame_buffer(callback_private_data, bitdepth, image_format, width,
+ height, left_border, right_border, top_border,
+ bottom_border, kFrameBufferRowAlignment,
+ &frame_buffer) != kStatusOk) {
+ return false;
+ }
+
+ if (frame_buffer.plane[0] == nullptr ||
+ (!is_monochrome && frame_buffer.plane[1] == nullptr) ||
+ (!is_monochrome && frame_buffer.plane[2] == nullptr)) {
+ assert(false && "The get_frame_buffer callback malfunctioned.");
+ LIBGAV1_DLOG(ERROR, "The get_frame_buffer callback malfunctioned.");
+ return false;
+ }
+
+ stride_[kPlaneY] = frame_buffer.stride[0];
+ stride_[kPlaneU] = frame_buffer.stride[1];
+ stride_[kPlaneV] = frame_buffer.stride[2];
+ buffer_[kPlaneY] = frame_buffer.plane[0];
+ buffer_[kPlaneU] = frame_buffer.plane[1];
+ buffer_[kPlaneV] = frame_buffer.plane[2];
+ *buffer_private_data = frame_buffer.private_data;
+ } else {
+ assert(callback_private_data == nullptr);
+ assert(buffer_private_data == nullptr);
+
+ // Calculate y_stride (in bytes). It is padded to a multiple of
+ // kFrameBufferRowAlignment bytes.
+ int y_stride = width + left_border + right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) y_stride *= sizeof(uint16_t);
+#endif
+ y_stride = Align(y_stride, kFrameBufferRowAlignment);
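+ // For example (illustrative, assuming kFrameBufferRowAlignment == 16 as the
+ // note in yuv_buffer.h implies): bitdepth == 8, width == 20, and
+ // left_border == right_border == 2 give y_stride == Align(24, 16) == 32,
+ // matching the diagram in yuv_buffer.h.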
+ // Size of the Y plane in bytes.
+ const uint64_t y_plane_size = (height + top_border + bottom_border) *
+ static_cast<uint64_t>(y_stride) +
+ (plane_align - 1);
+
+ // Calculate uv_stride (in bytes). It is padded to a multiple of
+ // kFrameBufferRowAlignment bytes.
+ int uv_stride = uv_width + uv_left_border + uv_right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) uv_stride *= sizeof(uint16_t);
+#endif
+ uv_stride = Align(uv_stride, kFrameBufferRowAlignment);
+ // Size of the U or V plane in bytes.
+ const uint64_t uv_plane_size =
+ is_monochrome ? 0
+ : (uv_height + uv_top_border + uv_bottom_border) *
+ static_cast<uint64_t>(uv_stride) +
+ (plane_align - 1);
+
+ // Allocate unaligned y_buffer, u_buffer, and v_buffer.
+ uint8_t* y_buffer = nullptr;
+ uint8_t* u_buffer = nullptr;
+ uint8_t* v_buffer = nullptr;
+
+ const uint64_t frame_size = y_plane_size + 2 * uv_plane_size;
+ if (frame_size > buffer_alloc_size_) {
+ // Allocation to hold larger frame, or first allocation.
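+ // |frame_size| is a uint64_t; reject values that do not fit in size_t
+ // (possible on 32-bit targets, where the cast would truncate).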
+ if (frame_size != static_cast<size_t>(frame_size)) return false;
+
+ buffer_alloc_.reset(new (std::nothrow)
+ uint8_t[static_cast<size_t>(frame_size)]);
+ if (buffer_alloc_ == nullptr) {
+ buffer_alloc_size_ = 0;
+ return false;
+ }
+
+ buffer_alloc_size_ = static_cast<size_t>(frame_size);
+ }
+
+ y_buffer = buffer_alloc_.get();
+ if (!is_monochrome) {
+ u_buffer = y_buffer + y_plane_size;
+ v_buffer = u_buffer + uv_plane_size;
+ }
+
+ stride_[kPlaneY] = y_stride;
+ stride_[kPlaneU] = stride_[kPlaneV] = uv_stride;
+
+ int left_border_bytes = left_border;
+ int uv_left_border_bytes = uv_left_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth > 8) {
+ left_border_bytes *= sizeof(uint16_t);
+ uv_left_border_bytes *= sizeof(uint16_t);
+ }
+#endif
+ buffer_[kPlaneY] = AlignAddr(
+ y_buffer + (top_border * y_stride) + left_border_bytes, plane_align);
+ buffer_[kPlaneU] =
+ AlignAddr(u_buffer + (uv_top_border * uv_stride) + uv_left_border_bytes,
+ plane_align);
+ buffer_[kPlaneV] =
+ AlignAddr(v_buffer + (uv_top_border * uv_stride) + uv_left_border_bytes,
+ plane_align);
+ }
+
+ y_width_ = width;
+ y_height_ = height;
+ left_border_[kPlaneY] = left_border;
+ right_border_[kPlaneY] = right_border;
+ top_border_[kPlaneY] = top_border;
+ bottom_border_[kPlaneY] = bottom_border;
+
+ uv_width_ = uv_width;
+ uv_height_ = uv_height;
+ left_border_[kPlaneU] = left_border_[kPlaneV] = uv_left_border;
+ right_border_[kPlaneU] = right_border_[kPlaneV] = uv_right_border;
+ top_border_[kPlaneU] = top_border_[kPlaneV] = uv_top_border;
+ bottom_border_[kPlaneU] = bottom_border_[kPlaneV] = uv_bottom_border;
+
+ subsampling_x_ = subsampling_x;
+ subsampling_y_ = subsampling_y;
+
+ bitdepth_ = bitdepth;
+ is_monochrome_ = is_monochrome;
+ assert(!is_monochrome || stride_[kPlaneU] == 0);
+ assert(!is_monochrome || stride_[kPlaneV] == 0);
+ assert(!is_monochrome || buffer_[kPlaneU] == nullptr);
+ assert(!is_monochrome || buffer_[kPlaneV] == nullptr);
+
+#if LIBGAV1_MSAN
+ InitializeFrameBorders();
+#endif
+
+ return true;
+}
+
+#if LIBGAV1_MSAN
+void YuvBuffer::InitializeFrameBorders() {
+ const int pixel_size = (bitdepth_ == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+ const int y_width_in_bytes = y_width_ * pixel_size;
+ // The optimized loop restoration code will overread the visible frame buffer
+ // into the right border. The optimized cfl subsampler uses the right border
+ // as well. Initialize the right border and padding to prevent msan warnings.
+ const int y_right_border_size_in_bytes = right_border_[kPlaneY] * pixel_size;
+ // Calculate the padding bytes for the buffer. Note: The stride of the
+ // buffer is always a multiple of 16 (see yuv_buffer.h).
+ const int y_right_padding_in_bytes =
+ stride_[kPlaneY] - (pixel_size * (y_width_ + left_border_[kPlaneY] +
+ right_border_[kPlaneY]));
+ const int y_padded_right_border_size =
+ y_right_border_size_in_bytes + y_right_padding_in_bytes;
+ constexpr uint8_t kRightValue = 0x55;
+ uint8_t* rb = buffer_[kPlaneY] + y_width_in_bytes;
+ for (int i = 0; i < y_height_ + bottom_border_[kPlaneY]; ++i) {
+ memset(rb, kRightValue, y_padded_right_border_size);
+ rb += stride_[kPlaneY];
+ }
+
+ if (!is_monochrome_) {
+ const int uv_width_in_bytes = uv_width_ * pixel_size;
+ const int uv_right_border_size_in_bytes =
+ right_border_[kPlaneU] * pixel_size;
+ assert(right_border_[kPlaneU] == right_border_[kPlaneV]);
+ const int u_right_padding_in_bytes =
+ stride_[kPlaneU] - (pixel_size * (uv_width_ + left_border_[kPlaneU] +
+ right_border_[kPlaneU]));
+ const int u_padded_right_border_size =
+ uv_right_border_size_in_bytes + u_right_padding_in_bytes;
+ rb = buffer_[kPlaneU] + uv_width_in_bytes;
+ for (int i = 0; i < uv_height_; ++i) {
+ memset(rb, kRightValue, u_padded_right_border_size);
+ rb += stride_[kPlaneU];
+ }
+ const int v_right_padding_in_bytes =
+ stride_[kPlaneV] -
+ ((uv_width_ + left_border_[kPlaneV] + right_border_[kPlaneV]) *
+ pixel_size);
+ const int v_padded_right_border_size =
+ uv_right_border_size_in_bytes + v_right_padding_in_bytes;
+ rb = buffer_[kPlaneV] + uv_width_in_bytes;
+ for (int i = 0; i < uv_height_; ++i) {
+ memset(rb, kRightValue, v_padded_right_border_size);
+ rb += stride_[kPlaneV];
+ }
+ }
+
+ // The optimized cfl subsampler will overread (to the right of the current
+ // block) into the uninitialized visible area. The cfl subsampler can overread
+ // into the bottom border as well. Initialize both to quiet msan warnings.
+ uint8_t* y_visible = buffer_[kPlaneY];
+ for (int i = 0; i < y_height_ + bottom_border_[kPlaneY]; ++i) {
+ memset(y_visible, kRightValue, y_width_in_bytes);
+ y_visible += stride_[kPlaneY];
+ }
+}
+#endif // LIBGAV1_MSAN
+
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_YUV_BUFFER_H_
+#define LIBGAV1_SRC_YUV_BUFFER_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/gav1/frame_buffer.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+class YuvBuffer {
+ public:
+ // Allocates the buffer. Returns true on success, false on failure.
+ //
+ // * |width| and |height| are the image dimensions in pixels.
+ // * |subsampling_x| and |subsampling_y| (either 0 or 1) specify the
+ // subsampling of the width and height of the chroma planes, respectively.
+ // * |left_border|, |right_border|, |top_border|, and |bottom_border| are
+ // the sizes (in pixels) of the borders on the left, right, top, and
+ // bottom sides, respectively. The four border sizes must all be a
+ // multiple of 2.
+ // * If |get_frame_buffer| is not null, it is invoked to allocate the memory.
+ // If |get_frame_buffer| is null, YuvBuffer allocates the memory directly
+ // and ignores the |callback_private_data| and |buffer_private_data|
+ // parameters, which should be null.
+ //
+ // NOTE: The strides are a multiple of 16. Since the first row in each plane
+ // is 16-byte aligned, subsequent rows are also 16-byte aligned.
+ //
+ // Example: bitdepth=8 width=20 height=6 left/right/top/bottom_border=2. The
+ // diagram below shows how Realloc() allocates the data buffer for the Y
+ // plane.
+ //
+ // 16-byte aligned
+ // |
+ // v
+ // ++++++++++++++++++++++++pppppppp
+ // ++++++++++++++++++++++++pppppppp
+ // ++01234567890123456789++pppppppp
+ // ++11234567890123456789++pppppppp
+ // ++21234567890123456789++pppppppp
+ // ++31234567890123456789++pppppppp
+ // ++41234567890123456789++pppppppp
+ // ++51234567890123456789++pppppppp
+ // ++++++++++++++++++++++++pppppppp
+ // ++++++++++++++++++++++++pppppppp
+ // | |
+ // |<-- stride (multiple of 16) ->|
+ //
+ // The video frame has 6 rows of 20 pixels each. Each row is shown as the
+ // pattern r1234567890123456789, where |r| is 0, 1, 2, 3, 4, 5.
+ //
+ // Realloc() first adds a border of 2 pixels around the video frame. The
+ // border pixels are shown as '+'.
+ //
+ // Each row is then padded to a multiple of the default alignment in bytes,
+ // which is 16. The padding bytes are shown as lowercase 'p'. (Since
+ // |bitdepth| is 8 in this example, each pixel is one byte.) The padded size
+ // in bytes is the stride. In this example, the stride is 32 bytes.
+ //
+ // Finally, Realloc() aligns the first byte of frame data, which is the '0'
+ // pixel/byte in the upper left corner of the frame, to the default (16-byte)
+ // alignment boundary.
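+ //
+ // A minimal usage sketch (illustrative; assumes direct allocation, i.e. a
+ // null |get_frame_buffer| callback):
+ //   YuvBuffer buffer;
+ //   const bool ok = buffer.Realloc(
+ //       /*bitdepth=*/8, /*is_monochrome=*/false, /*width=*/20, /*height=*/6,
+ //       /*subsampling_x=*/1, /*subsampling_y=*/1, /*left_border=*/2,
+ //       /*right_border=*/2, /*top_border=*/2, /*bottom_border=*/2,
+ //       /*get_frame_buffer=*/nullptr, /*callback_private_data=*/nullptr,
+ //       /*buffer_private_data=*/nullptr);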
+ //
+ // TODO(wtc): Add a check for width and height limits to defend against
+ // invalid bitstreams.
+ bool Realloc(int bitdepth, bool is_monochrome, int width, int height,
+ int8_t subsampling_x, int8_t subsampling_y, int left_border,
+ int right_border, int top_border, int bottom_border,
+ GetFrameBufferCallback get_frame_buffer,
+ void* callback_private_data, void** buffer_private_data);
+
+ int bitdepth() const { return bitdepth_; }
+
+ bool is_monochrome() const { return is_monochrome_; }
+
+ int8_t subsampling_x() const { return subsampling_x_; }
+ int8_t subsampling_y() const { return subsampling_y_; }
+
+ int width(int plane) const {
+ return (plane == kPlaneY) ? y_width_ : uv_width_;
+ }
+ int height(int plane) const {
+ return (plane == kPlaneY) ? y_height_ : uv_height_;
+ }
+
+ // Returns border sizes in pixels.
+ int left_border(int plane) const { return left_border_[plane]; }
+ int right_border(int plane) const { return right_border_[plane]; }
+ int top_border(int plane) const { return top_border_[plane]; }
+ int bottom_border(int plane) const { return bottom_border_[plane]; }
+
+ // Returns the alignment of a frame buffer row in bytes.
+ int alignment() const { return kFrameBufferRowAlignment; }
+
+ // Back up the current set of warnings and disable -Warray-bounds for the
+ // following three functions, as the compiler cannot, in all cases, determine
+ // whether |plane| is within [0, kMaxPlanes), e.g., with a variable-based for
+ // loop.
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+ // Returns the data buffer for |plane|.
+ uint8_t* data(int plane) {
+ assert(plane >= 0);
+ assert(static_cast<size_t>(plane) < std::extent<decltype(buffer_)>::value);
+ return buffer_[plane];
+ }
+ const uint8_t* data(int plane) const {
+ assert(plane >= 0);
+ assert(static_cast<size_t>(plane) < std::extent<decltype(buffer_)>::value);
+ return buffer_[plane];
+ }
+
+ // Returns the stride in bytes for |plane|.
+ int stride(int plane) const {
+ assert(plane >= 0);
+ assert(static_cast<size_t>(plane) < std::extent<decltype(stride_)>::value);
+ return stride_[plane];
+ }
+ // Restore the previous set of compiler warnings.
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+ private:
+ static constexpr int kFrameBufferRowAlignment = 16;
+
+#if LIBGAV1_MSAN
+ void InitializeFrameBorders();
+#endif
+
+ int bitdepth_ = 0;
+ bool is_monochrome_ = false;
+
+ // y_width_ and y_height_ are the |width| and |height| arguments passed to the
+ // Realloc() method.
+ //
+ // uv_width_ and uv_height_ are computed from y_width_ and y_height_ as
+ // follows:
+ // uv_width_ = (y_width_ + subsampling_x_) >> subsampling_x_
+ // uv_height_ = (y_height_ + subsampling_y_) >> subsampling_y_
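+ //   For example, a 20x6 frame with 4:2:0 subsampling (subsampling_x_ =
+ //   subsampling_y_ = 1) yields uv_width_ = (20 + 1) >> 1 = 10 and
+ //   uv_height_ = (6 + 1) >> 1 = 3.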
+ int y_width_ = 0;
+ int uv_width_ = 0;
+ int y_height_ = 0;
+ int uv_height_ = 0;
+
+ int left_border_[kMaxPlanes] = {};
+ int right_border_[kMaxPlanes] = {};
+ int top_border_[kMaxPlanes] = {};
+ int bottom_border_[kMaxPlanes] = {};
+
+ int stride_[kMaxPlanes] = {};
+ uint8_t* buffer_[kMaxPlanes] = {};
+
+ // buffer_alloc_ and buffer_alloc_size_ are only used if the
+ // get_frame_buffer callback is null and we allocate the buffer ourselves.
+ std::unique_ptr<uint8_t[]> buffer_alloc_;
+ size_t buffer_alloc_size_ = 0;
+
+ int8_t subsampling_x_ = 0; // 0 or 1.
+ int8_t subsampling_y_ = 0; // 0 or 1.
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_YUV_BUFFER_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tests/block_utils.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+
+namespace libgav1 {
+namespace test_utils {
+namespace {
+
+#define LIBGAV1_DEBUG_FORMAT_CODE "x"
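+// Prints |block1| with entries that differ from |block2| marked by a leading
+// '*'; padding columns (x >= width) are bracketed, mirroring PrintBlock().
+// Values are printed in hexadecimal (the "x" conversion above).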
+template <typename Pixel>
+void PrintBlockDiff(const Pixel* block1, const Pixel* block2, int width,
+ int height, int stride1, int stride2,
+ const bool print_padding) {
+ const int print_width = print_padding ? std::min(stride1, stride2) : width;
+ const int field_width = (sizeof(Pixel) == 1) ? 4 : 5;
+
+ for (int y = 0; y < height; ++y) {
+ printf("[%2d] ", y);
+ for (int x = 0; x < print_width; ++x) {
+ if (x >= width) {
+ if (block1[x] == block2[x]) {
+ printf("[%*" LIBGAV1_DEBUG_FORMAT_CODE "] ", field_width, block1[x]);
+ } else {
+ printf("[*%*" LIBGAV1_DEBUG_FORMAT_CODE "] ", field_width - 1,
+ block1[x]);
+ }
+ } else {
+ if (block1[x] == block2[x]) {
+ printf("%*" LIBGAV1_DEBUG_FORMAT_CODE " ", field_width, block1[x]);
+ } else {
+ printf("*%*" LIBGAV1_DEBUG_FORMAT_CODE " ", field_width - 1,
+ block1[x]);
+ }
+ }
+ }
+ printf("\n");
+ block1 += stride1;
+ block2 += stride2;
+ }
+}
+
+} // namespace
+
+template <typename Pixel>
+void PrintBlock(const Pixel* block, int width, int height, int stride,
+ const bool print_padding /*= false*/) {
+ const int print_width = print_padding ? stride : width;
+ const int field_width = (sizeof(Pixel) == 1) ? 4 : 5;
+ for (int y = 0; y < height; ++y) {
+ printf("[%2d] ", y);
+ for (int x = 0; x < print_width; ++x) {
+ if (x >= width) {
+ printf("[%*" LIBGAV1_DEBUG_FORMAT_CODE "] ", field_width, block[x]);
+ } else {
+ printf("%*" LIBGAV1_DEBUG_FORMAT_CODE " ", field_width, block[x]);
+ }
+ }
+ printf("\n");
+ block += stride;
+ }
+}
+#undef LIBGAV1_DEBUG_FORMAT_CODE
+
+template void PrintBlock(const uint8_t* block, int width, int height,
+ int stride, bool print_padding /*= false*/);
+template void PrintBlock(const uint16_t* block, int width, int height,
+ int stride, bool print_padding /*= false*/);
+template void PrintBlock(const int8_t* block, int width, int height, int stride,
+ bool print_padding /*= false*/);
+template void PrintBlock(const int16_t* block, int width, int height,
+ int stride, bool print_padding /*= false*/);
+
+template <typename Pixel>
+bool CompareBlocks(const Pixel* block1, const Pixel* block2, int width,
+ int height, int stride1, int stride2,
+ const bool check_padding, const bool print_diff /*= true*/) {
+ bool ok = true;
+ const int check_width = check_padding ? std::min(stride1, stride2) : width;
+ for (int y = 0; y < height; ++y) {
+ const uint64_t row1 = static_cast<uint64_t>(y) * stride1;
+ const uint64_t row2 = static_cast<uint64_t>(y) * stride2;
+ ok = memcmp(block1 + row1, block2 + row2,
+ sizeof(block1[0]) * check_width) == 0;
+ if (!ok) break;
+ }
+ if (!ok && print_diff) {
+ printf("block1 (width: %d height: %d stride: %d):\n", width, height,
+ stride1);
+ PrintBlockDiff(block1, block2, width, height, stride1, stride2,
+ check_padding);
+ printf("\nblock2 (width: %d height: %d stride: %d):\n", width, height,
+ stride2);
+ PrintBlockDiff(block2, block1, width, height, stride2, stride1,
+ check_padding);
+ }
+ return ok;
+}
+
+template bool CompareBlocks(const uint8_t* block1, const uint8_t* block2,
+ int width, int height, int stride1, int stride2,
+ const bool check_padding,
+ const bool print_diff /*= true*/);
+template bool CompareBlocks(const uint16_t* block1, const uint16_t* block2,
+ int width, int height, int stride1, int stride2,
+ const bool check_padding,
+ const bool print_diff /*= true*/);
+template bool CompareBlocks(const int8_t* block1, const int8_t* block2,
+ int width, int height, int stride1, int stride2,
+ const bool check_padding,
+ const bool print_diff /*= true*/);
+template bool CompareBlocks(const int16_t* block1, const int16_t* block2,
+ int width, int height, int stride1, int stride2,
+ const bool check_padding,
+ const bool print_diff /*= true*/);
+
+} // namespace test_utils
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_TESTS_BLOCK_UTILS_H_
+#define LIBGAV1_TESTS_BLOCK_UTILS_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+namespace test_utils {
+
+//------------------------------------------------------------------------------
+// Prints |block| pixel by pixel: |width| pixels per row if |print_padding| is
+// false, or |stride| pixels per row otherwise. If |print_padding| is true,
+// padding pixels are surrounded by '[]'.
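+// For example (an illustrative sketch), PrintBlock(block, /*width=*/4,
+// /*height=*/1, /*stride=*/8, /*print_padding=*/true) prints a single row of
+// four pixel values followed by four bracketed padding values.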
+template <typename Pixel>
+void PrintBlock(const Pixel* block, int width, int height, int stride,
+ bool print_padding = false);
+
+extern template void PrintBlock(const uint8_t* block, int width, int height,
+ int stride, bool print_padding /*= false*/);
+extern template void PrintBlock(const uint16_t* block, int width, int height,
+ int stride, bool print_padding /*= false*/);
+
+//------------------------------------------------------------------------------
+// Compares |block1| and |block2| pixel by pixel, checking |width| pixels per
+// row if |check_padding| is false and min(|stride1|, |stride2|) pixels per row
+// otherwise. Prints the blocks with differences marked with a '*' if
+// |print_diff| is true (the default).
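+//
+// A usage sketch (|dst|, |ref|, and the strides are illustrative):
+//   if (!CompareBlocks(dst, ref, /*width=*/16, /*height=*/16, dst_stride,
+//                      ref_stride, /*check_padding=*/false)) {
+//     // The blocks differ; a diff was printed since |print_diff| defaults to
+//     // true.
+//   }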
+
+template <typename Pixel>
+bool CompareBlocks(const Pixel* block1, const Pixel* block2, int width,
+ int height, int stride1, int stride2, bool check_padding,
+ bool print_diff = true);
+
+extern template bool CompareBlocks(const uint8_t* block1, const uint8_t* block2,
+ int width, int height, int stride1,
+ int stride2, bool check_padding,
+ bool print_diff /*= true*/);
+extern template bool CompareBlocks(const uint16_t* block1,
+ const uint16_t* block2, int width,
+ int height, int stride1, int stride2,
+ bool check_padding,
+ bool print_diff /*= true*/);
+
+} // namespace test_utils
+} // namespace libgav1
+
+#endif // LIBGAV1_TESTS_BLOCK_UTILS_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_interface.h"
+#include "src/gav1/decoder.h"
+#include "tests/fuzzer/fuzzer_temp_file.h"
+
+namespace {
+
+#if defined(LIBGAV1_EXHAUSTIVE_FUZZING)
+// Set a large upper bound to give more coverage of a single input; this value
+// should be larger than most of the frame counts in the corpus.
+constexpr int kMaxFrames = 100;
+constexpr size_t kMaxDataSize = 400 * 1024;
+#else
+// Restrict the number of frames to improve fuzzer throughput.
+constexpr int kMaxFrames = 5;
+constexpr size_t kMaxDataSize = 200 * 1024;
+#endif
+
+void Decode(const uint8_t* const data, const size_t size,
+ libgav1::Decoder* const decoder) {
+ decoder->EnqueueFrame(data, size, /*user_private_data=*/0,
+ /*buffer_private_data=*/nullptr);
+ const libgav1::DecoderBuffer* buffer;
+ decoder->DequeueFrame(&buffer);
+}
+
+} // namespace
+
+// Always returns 0. Nonzero return values are reserved by libFuzzer for future
+// use.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ // Reject large chunks of data to improve fuzzer throughput.
+ if (size > kMaxDataSize) return 0;
+
+ libgav1::Decoder decoder;
+ libgav1::DecoderSettings settings = {};
+ // Use the low byte of the width to seed the number of threads.
+ // We use both nibbles of the lower byte as this results in values != 1 much
+ // more quickly than using the lower nibble alone.
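+ // For example (illustrative), data[12] == 0x21 yields
+ // ((0x21 >> 4 | 0x21) & 0xF) + 1 = 4 threads.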
+ settings.threads = (size >= 13) ? ((data[12] >> 4 | data[12]) & 0xF) + 1 : 1;
+ if (decoder.Init(&settings) != libgav1::kStatusOk) return 0;
+
+ // Treat the input as a raw OBU stream.
+ Decode(data, size, &decoder);
+
+ // Use the first frame from an IVF to bypass any read errors from the parser.
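+ // (With the standard IVF layout this is 32 + 12 = 44 bytes.)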
+ static constexpr size_t kIvfHeaderSize =
+ libgav1::kIvfFileHeaderSize + libgav1::kIvfFrameHeaderSize;
+ if (size >= kIvfHeaderSize) {
+ Decode(data + kIvfHeaderSize, size - kIvfHeaderSize, &decoder);
+ }
+
+ FuzzerTemporaryFile tempfile(data, size);
+ auto file_reader =
+ libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);
+ if (file_reader == nullptr) return 0;
+
+ std::vector<uint8_t> buffer;
+ int decoded_frames = 0;
+ do {
+ if (!file_reader->ReadTemporalUnit(&buffer, nullptr)) break;
+ Decode(buffer.data(), buffer.size(), &decoder);
+ if (++decoded_frames >= kMaxFrames) break;
+ } while (!file_reader->IsEndOfFile());
+
+ return 0;
+}
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <deque>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_interface.h"
+#include "src/gav1/decoder.h"
+#include "src/gav1/status_code.h"
+#include "tests/fuzzer/fuzzer_temp_file.h"
+
+namespace {
+
+#if defined(LIBGAV1_EXHAUSTIVE_FUZZING)
+// Set a large upper bound to give more coverage of a single input; this value
+// should be larger than most of the frame counts in the corpus.
+constexpr size_t kMaxDataSize = 400 * 1024;
+#else
+constexpr size_t kMaxDataSize = 200 * 1024;
+#endif
+
+using InputBuffer = std::vector<uint8_t>;
+
+struct InputBuffers {
+ ~InputBuffers() {
+ for (auto& buffer : free_buffers) {
+ delete buffer;
+ }
+ }
+ std::deque<InputBuffer*> free_buffers;
+};
+
+void ReleaseInputBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* const test = static_cast<InputBuffers*>(callback_private_data);
+ test->free_buffers.push_back(static_cast<InputBuffer*>(buffer_private_data));
+}
+
+} // namespace
+
+// Always returns 0. Nonzero return values are reserved by libFuzzer for future
+// use.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ // Reject large chunks of data to improve fuzzer throughput.
+ if (size > kMaxDataSize) return 0;
+
+ // Note that |input_buffers| has to outlive the |decoder| object since the
+ // |release_input_buffer| callback could be called in the |decoder|'s
+ // destructor.
+ InputBuffers input_buffers;
+
+ libgav1::Decoder decoder;
+ libgav1::DecoderSettings settings = {};
+ // Use 33 + the low byte of the width to seed the number of threads. This
+ // ensures that we will trigger the frame-parallel path in most cases.
+ // We use both nibbles of the lower byte as this results in values != 1 much
+ // more quickly than using the lower nibble alone.
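+ // For example (illustrative), data[12] == 0x21 yields 33 + 4 = 37 threads.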
+ settings.threads =
+ 33 + ((size >= 13) ? ((data[12] >> 4 | data[12]) & 0xF) + 1 : 1);
+
+ settings.frame_parallel = true;
+ settings.blocking_dequeue = true;
+ settings.callback_private_data = &input_buffers;
+ settings.release_input_buffer = ReleaseInputBuffer;
+ if (decoder.Init(&settings) != libgav1::kStatusOk) return 0;
+
+ FuzzerTemporaryFile tempfile(data, size);
+ auto file_reader =
+ libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);
+ if (file_reader == nullptr) return 0;
+
+ InputBuffer* input_buffer = nullptr;
+ bool dequeue_finished = false;
+
+ do {
+ if (input_buffer == nullptr && !file_reader->IsEndOfFile()) {
+ if (input_buffers.free_buffers.empty()) {
+ auto* const buffer = new (std::nothrow) InputBuffer();
+ if (buffer == nullptr) {
+ break;
+ }
+ input_buffers.free_buffers.push_back(buffer);
+ }
+ input_buffer = input_buffers.free_buffers.front();
+ input_buffers.free_buffers.pop_front();
+ if (!file_reader->ReadTemporalUnit(input_buffer, nullptr)) {
+ break;
+ }
+ }
+
+ if (input_buffer != nullptr) {
+ libgav1::StatusCode status =
+ decoder.EnqueueFrame(input_buffer->data(), input_buffer->size(),
+ /*user_private_data=*/0,
+ /*buffer_private_data=*/input_buffer);
+ if (status == libgav1::kStatusOk) {
+ input_buffer = nullptr;
+ // Continue to enqueue frames until we get a kStatusTryAgain status.
+ continue;
+ }
+ if (status != libgav1::kStatusTryAgain) {
+ break;
+ }
+ }
+
+ const libgav1::DecoderBuffer* buffer;
+ libgav1::StatusCode status = decoder.DequeueFrame(&buffer);
+ if (status == libgav1::kStatusNothingToDequeue) {
+ dequeue_finished = true;
+ } else if (status == libgav1::kStatusOk) {
+ dequeue_finished = false;
+ } else {
+ break;
+ }
+ } while (input_buffer != nullptr || !file_reader->IsEndOfFile() ||
+ !dequeue_finished);
+
+ if (input_buffer != nullptr) {
+ input_buffers.free_buffers.push_back(input_buffer);
+ }
+
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright 2020 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
+#define LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
+
+// Adapter utility from fuzzer input to a temporary file, for fuzzing APIs that
+// require a file instead of an input buffer.
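+//
+// A minimal C++ usage sketch:
+//   FuzzerTemporaryFile tempfile(data, size);
+//   FILE* file = fopen(tempfile.filename(), "rb");
+//   // ... read from |file|, then fclose(file); the temporary file itself is
+//   // deleted when |tempfile| goes out of scope.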
+
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _WIN32
+#include <io.h>
+#include <windows.h>
+
+#define strdup _strdup
+#define unlink _unlink
+#else
+#include <unistd.h>
+#endif // _WIN32
+
+// Pure-C interface for creating and cleaning up temporary files.
+
+static char* fuzzer_get_tmpfile_with_suffix(const uint8_t* data, size_t size,
+ const char* suffix) {
+#ifdef _WIN32
+ // GetTempFileNameA generates '<path>\<pre><uuuu>.TMP'.
+ (void)suffix; // NOLINT (this could be a C compilation unit)
+ char temp_path[MAX_PATH];
+ const DWORD ret = GetTempPathA(MAX_PATH, temp_path);
+ if (ret == 0 || ret > MAX_PATH) {
+ fprintf(stderr, "Error getting temporary directory name: %lu\n",
+ GetLastError());
+ abort();
+ }
+ char* filename_buffer =
+ (char*)malloc(MAX_PATH); // NOLINT (this could be a C compilation unit)
+ if (!filename_buffer) {
+ perror("Failed to allocate file name buffer.");
+ abort();
+ }
+ if (GetTempFileNameA(temp_path, "ftf", /*uUnique=*/0, filename_buffer) == 0) {
+ fprintf(stderr, "Error getting temporary file name: %lu\n", GetLastError());
+ abort();
+ }
+#if defined(_MSC_VER) || defined(MINGW_HAS_SECURE_API)
+ FILE* file;
+ const errno_t err = fopen_s(&file, filename_buffer, "wb");
+ if (err != 0) file = NULL; // NOLINT (this could be a C compilation unit)
+#else
+ FILE* file = fopen(filename_buffer, "wb");
+#endif
+ if (!file) {
+ perror("Failed to open file.");
+ abort();
+ }
+#else // !_WIN32
+ if (suffix == NULL) { // NOLINT (this could be a C compilation unit)
+ suffix = "";
+ }
+ const size_t suffix_len = strlen(suffix);
+ if (suffix_len > INT_MAX) { // mkstemps takes int for suffixlen param
+ perror("Suffix too long");
+ abort();
+ }
+
+#ifdef __ANDROID__
+ const char* leading_temp_path =
+ "/data/local/tmp/generate_temporary_file.XXXXXX";
+#else
+ const char* leading_temp_path = "/tmp/generate_temporary_file.XXXXXX";
+#endif
+ const size_t buffer_sz = strlen(leading_temp_path) + suffix_len + 1;
+ char* filename_buffer =
+ (char*)malloc(buffer_sz); // NOLINT (this could be a C compilation unit)
+ if (!filename_buffer) {
+ perror("Failed to allocate file name buffer.");
+ abort();
+ }
+
+ if (snprintf(filename_buffer, buffer_sz, "%s%s", leading_temp_path, suffix) >=
+ (int)buffer_sz) { // NOLINT (this could be a C compilation unit)
+ perror("File name buffer too short.");
+ abort();
+ }
+
+ const int file_descriptor = mkstemps(filename_buffer, suffix_len);
+ if (file_descriptor < 0) {
+ perror("Failed to make temporary file.");
+ abort();
+ }
+ FILE* file = fdopen(file_descriptor, "wb");
+ if (!file) {
+ perror("Failed to open file descriptor.");
+ close(file_descriptor);
+ abort();
+ }
+#endif // _WIN32
+ const size_t bytes_written = fwrite(data, sizeof(uint8_t), size, file);
+ if (bytes_written < size) {
+ fclose(file);
+ fprintf(stderr, "Failed to write all bytes to file (%zu out of %zu)",
+ bytes_written, size);
+ abort();
+ }
+ fclose(file);
+ return filename_buffer;
+}
+
+static char* fuzzer_get_tmpfile(
+ const uint8_t* data,
+ size_t size) { // NOLINT (people include this .inc file directly)
+ return fuzzer_get_tmpfile_with_suffix(data, size, NULL); // NOLINT
+}
+
+static void fuzzer_release_tmpfile(char* filename) {
+ if (unlink(filename) != 0) {
+ perror("WARNING: Failed to delete temporary file.");
+ }
+ free(filename);
+}
+
+// C++ RAII object for creating temporary files.
+
+#ifdef __cplusplus
+class FuzzerTemporaryFile {
+ public:
+ FuzzerTemporaryFile(const uint8_t* data, size_t size)
+ : original_filename_(fuzzer_get_tmpfile(data, size)) {
+ filename_ = strdup(original_filename_);
+ if (!filename_) {
+ perror("Failed to allocate file name copy.");
+ abort();
+ }
+ }
+
+ FuzzerTemporaryFile(const uint8_t* data, size_t size, const char* suffix)
+ : original_filename_(fuzzer_get_tmpfile_with_suffix(data, size, suffix)) {
+ filename_ = strdup(original_filename_);
+ if (!filename_) {
+ perror("Failed to allocate file name copy.");
+ abort();
+ }
+ }
+
+ ~FuzzerTemporaryFile() {
+ free(filename_);
+ fuzzer_release_tmpfile(original_filename_);
+ }
+
+ FuzzerTemporaryFile(const FuzzerTemporaryFile& other) = delete;
+ FuzzerTemporaryFile operator=(const FuzzerTemporaryFile& other) = delete;
+
+ FuzzerTemporaryFile(const FuzzerTemporaryFile&& other) = delete;
+ FuzzerTemporaryFile operator=(const FuzzerTemporaryFile&& other) = delete;
+
+ const char* filename() const { return filename_; }
+
+ // Returns a mutable pointer to the file name. Should be used sparingly, and
+ // only when the fuzzed API demands it or when making a mutable copy is
+ // inconvenient (e.g., in auto-generated code).
+ char* mutable_filename() const { return filename_; }
+
+ private:
+ char* original_filename_;
+
+ // A mutable copy of the original filename, returned by the accessor. This
+ // guarantees that the original filename can always be used to release the
+ // temporary path.
+ char* filename_;
+};
+#endif // __cplusplus
+#endif // LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_interface.h"
+#include "src/buffer_pool.h"
+#include "src/decoder_impl.h"
+#include "src/decoder_state.h"
+#include "src/internal_frame_buffer_list.h"
+#include "src/obu_parser.h"
+#include "tests/fuzzer/fuzzer_temp_file.h"
+
+namespace {
+
+#if defined(LIBGAV1_EXHAUSTIVE_FUZZING)
+// Set a large upper bound to give more coverage of a single input; this value
+// should be larger than most of the frame counts in the corpus.
+constexpr int kMaxFrames = 100;
+constexpr size_t kMaxDataSize = 400 * 1024;
+#else
+// Restrict the number of frames and OBUs to improve fuzzer throughput.
+constexpr int kMaxFrames = 5;
+constexpr size_t kMaxDataSize = 200 * 1024;
+#endif
+
+inline void ParseObu(const uint8_t* const data, size_t size) {
+ size_t av1c_size;
+ const std::unique_ptr<uint8_t[]> av1c_box =
+ libgav1::ObuParser::GetAV1CodecConfigurationBox(data, size, &av1c_size);
+ static_cast<void>(av1c_box);
+
+ libgav1::InternalFrameBufferList buffer_list;
+ libgav1::BufferPool buffer_pool(libgav1::OnInternalFrameBufferSizeChanged,
+ libgav1::GetInternalFrameBuffer,
+ libgav1::ReleaseInternalFrameBuffer,
+ &buffer_list);
+ libgav1::DecoderState decoder_state;
+ libgav1::ObuParser parser(data, size, 0, &buffer_pool, &decoder_state);
+ libgav1::RefCountedBufferPtr current_frame;
+ int parsed_frames = 0;
+ while (parser.HasData()) {
+ if (parser.ParseOneFrame(&current_frame) != libgav1::kStatusOk) break;
+ if (++parsed_frames >= kMaxFrames) break;
+ }
+}
+
+} // namespace
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ // Reject large chunks of data to improve fuzzer throughput.
+ if (size > kMaxDataSize) return 0;
+
+ // Treat the input as a raw OBU stream.
+ ParseObu(data, size);
+
+ // Use the first frame from an IVF to bypass any read errors from the parser.
+ static constexpr size_t kIvfHeaderSize =
+ libgav1::kIvfFileHeaderSize + libgav1::kIvfFrameHeaderSize;
+ if (size >= kIvfHeaderSize) {
+ ParseObu(data + kIvfHeaderSize, size - kIvfHeaderSize);
+ }
+
+ FuzzerTemporaryFile tempfile(data, size);
+ auto file_reader =
+ libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);
+ if (file_reader == nullptr) return 0;
+
+ std::vector<uint8_t> buffer;
+ int parsed_frames = 0;
+ do {
+ if (!file_reader->ReadTemporalUnit(&buffer, nullptr)) break;
+ ParseObu(buffer.data(), buffer.size());
+ if (++parsed_frames >= kMaxFrames) break;
+ } while (!file_reader->IsEndOfFile());
+
+ return 0;
+}
--- /dev/null
+# Copyright 2020 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_LIBGAV1_TESTS_CMAKE_)
+ return()
+endif() # LIBGAV1_LIBGAV1_TESTS_CMAKE_
+set(LIBGAV1_LIBGAV1_TESTS_CMAKE_ 1)
+
+set(libgav1_googletest "${libgav1_root}/third_party/googletest")
+if(NOT LIBGAV1_ENABLE_TESTS OR NOT EXISTS "${libgav1_googletest}")
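+ # Define a no-op stub so that callers may invoke libgav1_add_tests_targets()
+ # unconditionally.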
+ macro(libgav1_add_tests_targets)
+
+ endmacro()
+
+ if(LIBGAV1_ENABLE_TESTS AND NOT EXISTS "${libgav1_googletest}")
+ message(
+ "GoogleTest not found, setting LIBGAV1_ENABLE_TESTS to false.\n"
+ "To enable tests download the GoogleTest repository to"
+ " third_party/googletest:\n\n git \\\n -C ${libgav1_root} \\\n"
+ " clone -b release-1.12.1 --depth 1 \\\n"
+ " https://github.com/google/googletest.git third_party/googletest\n")
+ set(LIBGAV1_ENABLE_TESTS FALSE CACHE BOOL "Enables tests." FORCE)
+ endif()
+ return()
+endif()
+
+# Check GoogleTest compiler requirements.
+if((CMAKE_CXX_COMPILER_ID
+ MATCHES
+ "Clang|GNU"
+ AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5")
+ OR (MSVC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "19"))
+ macro(libgav1_add_tests_targets)
+
+ endmacro()
+
+ message(
+ WARNING
+ "${CMAKE_CXX_COMPILER} (${CMAKE_CXX_COMPILER_ID} version"
+ " ${CMAKE_CXX_COMPILER_VERSION}) is below the minimum requirements for"
+ " GoogleTest; disabling unit tests. See"
+ " https://github.com/google/googletest#compilers for more detail.")
+ set(LIBGAV1_ENABLE_TESTS FALSE CACHE BOOL "Enables tests." FORCE)
+ return()
+endif()
+
+list(APPEND libgav1_tests_block_utils_sources
+ "${libgav1_root}/tests/block_utils.h"
+ "${libgav1_root}/tests/block_utils.cc")
+
+list(APPEND libgav1_tests_utils_sources
+ "${libgav1_root}/tests/third_party/libvpx/acm_random.h"
+ "${libgav1_root}/tests/third_party/libvpx/md5_helper.h"
+ "${libgav1_root}/tests/third_party/libvpx/md5_utils.cc"
+ "${libgav1_root}/tests/third_party/libvpx/md5_utils.h"
+ "${libgav1_root}/tests/utils.h" "${libgav1_root}/tests/utils.cc")
+
+list(APPEND libgav1_tests_utils_test_sources
+ "${libgav1_root}/tests/utils_test.cc")
+
+list(APPEND libgav1_array_2d_test_sources
+ "${libgav1_source}/utils/array_2d_test.cc")
+list(APPEND libgav1_average_blend_test_sources
+ "${libgav1_source}/dsp/average_blend_test.cc")
+list(APPEND libgav1_block_parameters_holder_test_sources
+ "${libgav1_source}/utils/block_parameters_holder_test.cc")
+list(APPEND libgav1_blocking_counter_test_sources
+ "${libgav1_source}/utils/blocking_counter_test.cc")
+list(APPEND libgav1_buffer_pool_test_sources
+ "${libgav1_source}/buffer_pool_test.cc")
+list(APPEND libgav1_cdef_test_sources "${libgav1_source}/dsp/cdef_test.cc")
+list(
+ APPEND libgav1_common_test_sources "${libgav1_source}/utils/common_test.cc")
+list(APPEND libgav1_common_avx2_test_sources
+ "${libgav1_source}/dsp/x86/common_avx2.h"
+ "${libgav1_source}/dsp/x86/common_avx2.inc"
+ "${libgav1_source}/dsp/x86/common_avx2_test.cc"
+ "${libgav1_source}/dsp/x86/common_avx2_test.h"
+ "${libgav1_source}/dsp/x86/common_sse4.inc")
+list(APPEND libgav1_common_dsp_test_sources
+ "${libgav1_source}/dsp/common_dsp_test.cc")
+list(APPEND libgav1_common_neon_test_sources
+ "${libgav1_source}/dsp/arm/common_neon_test.cc")
+list(APPEND libgav1_common_sse4_test_sources
+ "${libgav1_source}/dsp/x86/common_sse4.h"
+ "${libgav1_source}/dsp/x86/common_sse4.inc"
+ "${libgav1_source}/dsp/x86/common_sse4_test.cc"
+ "${libgav1_source}/dsp/x86/common_sse4_test.h")
+list(APPEND libgav1_convolve_test_sources
+ "${libgav1_source}/dsp/convolve_test.cc")
+list(APPEND libgav1_cpu_test_sources "${libgav1_source}/utils/cpu_test.cc")
+list(APPEND libgav1_c_decoder_test_sources
+ "${libgav1_source}/c_decoder_test.c"
+ "${libgav1_source}/decoder_test_data.h")
+list(APPEND libgav1_c_version_test_sources "${libgav1_source}/c_version_test.c")
+list(APPEND libgav1_decoder_test_sources
+ "${libgav1_source}/decoder_test.cc"
+ "${libgav1_source}/decoder_test_data.h")
+list(APPEND libgav1_decoder_buffer_test_sources
+ "${libgav1_source}/decoder_buffer_test.cc")
+list(APPEND libgav1_distance_weighted_blend_test_sources
+ "${libgav1_source}/dsp/distance_weighted_blend_test.cc")
+list(APPEND libgav1_dsp_test_sources "${libgav1_source}/dsp/dsp_test.cc")
+list(APPEND libgav1_entropy_decoder_test_sources
+ "${libgav1_source}/utils/entropy_decoder_test.cc"
+ "${libgav1_source}/utils/entropy_decoder_test_data.inc")
+list(APPEND libgav1_file_reader_test_sources
+ "${libgav1_examples}/file_reader_test.cc"
+ "${libgav1_examples}/file_reader_test_common.cc"
+ "${libgav1_examples}/file_reader_test_common.h")
+list(APPEND libgav1_film_grain_test_sources
+ "${libgav1_source}/film_grain_test.cc")
+list(APPEND libgav1_file_reader_factory_test_sources
+ "${libgav1_examples}/file_reader_factory_test.cc")
+list(APPEND libgav1_file_writer_test_sources
+ "${libgav1_examples}/file_writer_test.cc")
+list(APPEND libgav1_internal_frame_buffer_list_test_sources
+ "${libgav1_source}/internal_frame_buffer_list_test.cc")
+list(APPEND libgav1_intra_edge_test_sources
+ "${libgav1_source}/dsp/intra_edge_test.cc")
+list(APPEND libgav1_intrapred_cfl_test_sources
+ "${libgav1_source}/dsp/intrapred_cfl_test.cc")
+list(APPEND libgav1_intrapred_directional_test_sources
+ "${libgav1_source}/dsp/intrapred_directional_test.cc")
+list(APPEND libgav1_intrapred_filter_test_sources
+ "${libgav1_source}/dsp/intrapred_filter_test.cc")
+list(APPEND libgav1_intrapred_test_sources
+ "${libgav1_source}/dsp/intrapred_test.cc")
+list(APPEND libgav1_inverse_transform_test_sources
+ "${libgav1_source}/dsp/inverse_transform_test.cc")
+list(APPEND libgav1_loop_filter_test_sources
+ "${libgav1_source}/dsp/loop_filter_test.cc")
+list(APPEND libgav1_loop_restoration_test_sources
+ "${libgav1_source}/dsp/loop_restoration_test.cc")
+list(APPEND libgav1_mask_blend_test_sources
+ "${libgav1_source}/dsp/mask_blend_test.cc")
+list(APPEND libgav1_motion_field_projection_test_sources
+ "${libgav1_source}/dsp/motion_field_projection_test.cc")
+list(APPEND libgav1_motion_vector_search_test_sources
+ "${libgav1_source}/dsp/motion_vector_search_test.cc")
+list(APPEND libgav1_super_res_test_sources
+ "${libgav1_source}/dsp/super_res_test.cc")
+list(APPEND libgav1_weight_mask_test_sources
+ "${libgav1_source}/dsp/weight_mask_test.cc")
+list(
+ APPEND libgav1_memory_test_sources "${libgav1_source}/utils/memory_test.cc")
+list(APPEND libgav1_obmc_test_sources "${libgav1_source}/dsp/obmc_test.cc")
+list(APPEND libgav1_obu_parser_test_sources
+ "${libgav1_source}/obu_parser_test.cc")
+list(APPEND libgav1_post_filter_test_sources
+ "${libgav1_source}/post_filter_test.cc")
+list(APPEND libgav1_prediction_mask_test_sources
+ "${libgav1_source}/prediction_mask_test.cc")
+list(
+ APPEND libgav1_quantizer_test_sources "${libgav1_source}/quantizer_test.cc")
+list(APPEND libgav1_queue_test_sources "${libgav1_source}/utils/queue_test.cc")
+list(APPEND libgav1_raw_bit_reader_test_sources
+ "${libgav1_source}/utils/raw_bit_reader_test.cc")
+list(APPEND libgav1_reconstruction_test_sources
+ "${libgav1_source}/reconstruction_test.cc")
+list(APPEND libgav1_residual_buffer_pool_test_sources
+ "${libgav1_source}/residual_buffer_pool_test.cc")
+list(APPEND libgav1_scan_test_sources "${libgav1_source}/scan_test.cc")
+list(APPEND libgav1_segmentation_map_test_sources
+ "${libgav1_source}/utils/segmentation_map_test.cc")
+list(APPEND libgav1_segmentation_test_sources
+ "${libgav1_source}/utils/segmentation_test.cc")
+list(APPEND libgav1_stack_test_sources "${libgav1_source}/utils/stack_test.cc")
+list(APPEND libgav1_symbol_decoder_context_test_sources
+ "${libgav1_source}/symbol_decoder_context_test.cc")
+list(APPEND libgav1_threadpool_test_sources
+ "${libgav1_source}/utils/threadpool_test.cc")
+list(APPEND libgav1_threading_strategy_test_sources
+ "${libgav1_source}/threading_strategy_test.cc")
+list(APPEND libgav1_unbounded_queue_test_sources
+ "${libgav1_source}/utils/unbounded_queue_test.cc")
+list(
+ APPEND libgav1_vector_test_sources "${libgav1_source}/utils/vector_test.cc")
+list(APPEND libgav1_version_test_sources "${libgav1_source}/version_test.cc")
+list(APPEND libgav1_warp_test_sources "${libgav1_source}/dsp/warp_test.cc")
+list(APPEND libgav1_warp_prediction_test_sources
+ "${libgav1_source}/warp_prediction_test.cc")
+
+macro(libgav1_add_tests_targets)
+ if(NOT LIBGAV1_ENABLE_TESTS)
+ message(
+ FATAL_ERROR
+ "This version of libgav1_add_tests_targets() should only be used with"
+ " LIBGAV1_ENABLE_TESTS set to true.")
+ endif()
+ libgav1_add_library(TEST
+ NAME
+ libgav1_gtest
+ TYPE
+ STATIC
+ SOURCES
+ "${libgav1_googletest}/googletest/src/gtest-all.cc"
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_gtest_include_paths}
+ ${libgav1_include_paths})
+
+ libgav1_add_library(TEST
+ NAME
+ libgav1_gtest_main
+ TYPE
+ STATIC
+ SOURCES
+ "${libgav1_googletest}/googletest/src/gtest_main.cc"
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_gtest_include_paths}
+ ${libgav1_include_paths})
+
+ if(use_absl_threading)
+ list(APPEND libgav1_common_test_absl_deps absl::synchronization)
+ endif()
+
+ libgav1_add_executable(TEST
+ NAME
+ array_2d_test
+ SOURCES
+ ${libgav1_array_2d_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ block_parameters_holder_test
+ SOURCES
+ ${libgav1_block_parameters_holder_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ blocking_counter_test
+ SOURCES
+ ${libgav1_blocking_counter_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ if(libgav1_have_avx2)
+ list(APPEND libgav1_common_dsp_test_sources
+ ${libgav1_common_avx2_test_sources})
+ endif()
+ if(libgav1_have_sse4)
+ list(APPEND libgav1_common_dsp_test_sources
+ ${libgav1_common_sse4_test_sources})
+ endif()
+ if(libgav1_have_avx2 OR libgav1_have_sse4)
+ libgav1_add_executable(TEST
+ NAME
+ common_dsp_test
+ SOURCES
+ ${libgav1_common_dsp_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest_main
+ libgav1_gtest)
+ endif()
+
+ if(libgav1_have_neon)
+ libgav1_add_executable(TEST
+ NAME
+ common_neon_test
+ SOURCES
+ ${libgav1_common_neon_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_tests_block_utils
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+ endif()
+
+ libgav1_add_executable(TEST
+ NAME
+ common_test
+ SOURCES
+ ${libgav1_common_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ cpu_test
+ SOURCES
+ ${libgav1_cpu_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ entropy_decoder_test
+ SOURCES
+ ${libgav1_entropy_decoder_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ file_reader_test
+ SOURCES
+ ${libgav1_file_reader_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_dsp
+ libgav1_file_reader
+ libgav1_utils
+ libgav1_tests_utils
+ LIB_DEPS
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ file_reader_factory_test
+ SOURCES
+ ${libgav1_file_reader_factory_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_file_reader
+ libgav1_utils
+ LIB_DEPS
+ absl::memory
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ film_grain_test
+ SOURCES
+ ${libgav1_film_grain_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ memory_test
+ SOURCES
+ ${libgav1_memory_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ LIB_DEPS
+ absl::base
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ queue_test
+ SOURCES
+ ${libgav1_queue_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ segmentation_map_test
+ SOURCES
+ ${libgav1_segmentation_map_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ segmentation_test
+ SOURCES
+ ${libgav1_segmentation_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ stack_test
+ SOURCES
+ ${libgav1_stack_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ symbol_decoder_context_test
+ SOURCES
+ ${libgav1_symbol_decoder_context_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ threadpool_test
+ SOURCES
+ ${libgav1_threadpool_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ absl::synchronization
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ unbounded_queue_test
+ SOURCES
+ ${libgav1_unbounded_queue_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ tests_utils_test
+ SOURCES
+ ${libgav1_tests_utils_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ vector_test
+ SOURCES
+ ${libgav1_vector_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ version_test
+ SOURCES
+ ${libgav1_version_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ LIB_DEPS
+ ${libgav1_dependency}
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_library(TEST
+ NAME
+ libgav1_tests_block_utils
+ TYPE
+ OBJECT
+ SOURCES
+ ${libgav1_tests_block_utils_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths})
+
+ libgav1_add_library(TEST
+ NAME
+ libgav1_tests_utils
+ TYPE
+ OBJECT
+ SOURCES
+ ${libgav1_tests_utils_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths})
+
+ libgav1_add_executable(TEST
+ NAME
+ average_blend_test
+ SOURCES
+ ${libgav1_average_blend_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ buffer_pool_test
+ SOURCES
+ ${libgav1_buffer_pool_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ cdef_test
+ SOURCES
+ ${libgav1_cdef_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ convolve_test
+ SOURCES
+ ${libgav1_convolve_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ c_decoder_test
+ SOURCES
+ ${libgav1_c_decoder_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ LIB_DEPS
+ ${libgav1_dependency})
+
+ libgav1_add_executable(TEST
+ NAME
+ c_version_test
+ SOURCES
+ ${libgav1_c_version_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ LIB_DEPS
+ ${libgav1_dependency})
+
+ libgav1_add_executable(TEST
+ NAME
+ decoder_test
+ SOURCES
+ ${libgav1_decoder_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ LIB_DEPS
+ ${libgav1_dependency}
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ decoder_buffer_test
+ SOURCES
+ ${libgav1_decoder_buffer_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ LIB_DEPS
+ ${libgav1_dependency}
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ distance_weighted_blend_test
+ SOURCES
+ ${libgav1_distance_weighted_blend_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ dsp_test
+ SOURCES
+ ${libgav1_dsp_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ file_writer_test
+ SOURCES
+ ${libgav1_file_writer_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_file_writer
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::memory
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ intrapred_cfl_test
+ SOURCES
+ ${libgav1_intrapred_cfl_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ intrapred_directional_test
+ SOURCES
+ ${libgav1_intrapred_directional_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ intrapred_filter_test
+ SOURCES
+ ${libgav1_intrapred_filter_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ intrapred_test
+ SOURCES
+ ${libgav1_intrapred_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ intra_edge_test
+ SOURCES
+ ${libgav1_intra_edge_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_tests_utils
+ libgav1_dsp
+ libgav1_utils
+ LIB_DEPS
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ inverse_transform_test
+ SOURCES
+ ${libgav1_inverse_transform_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_dsp
+ libgav1_utils
+ LIB_DEPS
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ internal_frame_buffer_list_test
+ SOURCES
+ ${libgav1_internal_frame_buffer_list_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ loop_filter_test
+ SOURCES
+ ${libgav1_loop_filter_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ loop_restoration_test
+ SOURCES
+ ${libgav1_loop_restoration_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ mask_blend_test
+ SOURCES
+ ${libgav1_mask_blend_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ motion_field_projection_test
+ SOURCES
+ ${libgav1_motion_field_projection_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ motion_vector_search_test
+ SOURCES
+ ${libgav1_motion_vector_search_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ obmc_test
+ SOURCES
+ ${libgav1_obmc_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ obu_parser_test
+ SOURCES
+ ${libgav1_obu_parser_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ post_filter_test
+ SOURCES
+ ${libgav1_post_filter_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ prediction_mask_test
+ SOURCES
+ ${libgav1_prediction_mask_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ quantizer_test
+ SOURCES
+ ${libgav1_quantizer_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ raw_bit_reader_test
+ SOURCES
+ ${libgav1_raw_bit_reader_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ reconstruction_test
+ SOURCES
+ ${libgav1_reconstruction_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ ${libgav1_test_objlib_deps}
+ LIB_DEPS
+ absl::strings
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ residual_buffer_pool_test
+ SOURCES
+ ${libgav1_residual_buffer_pool_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_utils
+ ${libgav1_test_objlib_deps}
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ scan_test
+ SOURCES
+ ${libgav1_scan_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_utils
+ ${libgav1_test_objlib_deps}
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ super_res_test
+ SOURCES
+ ${libgav1_super_res_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ threading_strategy_test
+ SOURCES
+ ${libgav1_threading_strategy_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_utils
+ ${libgav1_test_objlib_deps}
+ LIB_DEPS
+ absl::str_format_internal
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ warp_test
+ SOURCES
+ ${libgav1_warp_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ warp_prediction_test
+ SOURCES
+ ${libgav1_warp_prediction_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_utils
+ LIB_DEPS
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ weight_mask_test
+ SOURCES
+ ${libgav1_weight_mask_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+endmacro()
--- /dev/null
+Copyright (c) 2010, The WebM Project authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ * Neither the name of Google, nor the WebM Project, nor the names
+ of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- /dev/null
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_
+#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_
+
+#include <cassert>
+#include <cstdint>
+#include <limits>
+
+#include "gtest/gtest.h"
+
+namespace libvpx_test {
+
+class ACMRandom {
+ public:
+ ACMRandom() : random_(DeterministicSeed()) {}
+
+ explicit ACMRandom(int seed) : random_(seed) {}
+
+ void Reset(int seed) { random_.Reseed(seed); }
+ uint16_t Rand16(void) {
+ const uint32_t value =
+ random_.Generate(testing::internal::Random::kMaxRange);
+ return (value >> 15) & 0xffff;
+ }
+
+ int32_t Rand20Signed(void) {
+ // Use 20 bits: values between 524287 and -524288.
+ const uint32_t value = random_.Generate(1048576);
+ return static_cast<int32_t>(value) - 524288;
+ }
+
+ int16_t Rand16Signed(void) {
+ // Use 16 bits: values between 32767 and -32768.
+ return static_cast<int16_t>(random_.Generate(65536));
+ }
+
+ int16_t Rand13Signed(void) {
+ // Use 13 bits: values between 4095 and -4096.
+ const uint32_t value = random_.Generate(8192);
+ return static_cast<int16_t>(value) - 4096;
+ }
+
+ int16_t Rand9Signed(void) {
+ // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
+ const uint32_t value = random_.Generate(512);
+ return static_cast<int16_t>(value) - 256;
+ }
+
+ uint8_t Rand8(void) {
+ const uint32_t value =
+ random_.Generate(testing::internal::Random::kMaxRange);
+ // There's a bit more entropy in the upper bits of this implementation.
+ return (value >> 23) & 0xff;
+ }
+
+ uint8_t Rand8Extremes(void) {
+ // Returns a random value near 0 or near 255, to better exercise
+ // saturation behavior.
+ const uint8_t r = Rand8();
+ return static_cast<uint8_t>((r < 128) ? r << 4 : r >> 4);
+ }
+
+ uint32_t RandRange(const uint32_t range) {
+ // testing::internal::Random::Generate returns values in [0, range); range
+ // must not exceed testing::internal::Random::kMaxRange.
+ assert(range <= testing::internal::Random::kMaxRange);
+ return random_.Generate(range);
+ }
+
+ int PseudoUniform(int range) { return random_.Generate(range); }
+
+ int operator()(int n) { return PseudoUniform(n); }
+
+ static constexpr int DeterministicSeed(void) { return 0xbaba; }
+
+ private:
+ testing::internal::Random random_;
+};
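+
+// Illustrative usage (a sketch, not part of the original header): tests
+// typically seed with DeterministicSeed() so that failures are reproducible,
+// then draw values of whatever width the code under test needs.
+//
+//   libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+//   const uint8_t pixel = rnd.Rand8();         // [0, 255]
+//   const int16_t coeff = rnd.Rand13Signed();  // [-4096, 4095]
+//   const uint32_t index = rnd.RandRange(64);  // [0, 64)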
+
+} // namespace libvpx_test
+
+#endif // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_
--- /dev/null
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_
+#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "tests/third_party/libvpx/md5_utils.h"
+
+namespace libvpx_test {
+class MD5 {
+ public:
+ MD5() { MD5Init(&md5_); }
+
+ void Add(const uint8_t *data, size_t size) {
+ MD5Update(&md5_, data, static_cast<uint32_t>(size));
+ }
+
+ const char *Get(void) {
+ static const char hex[16] = {
+ '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+ };
+ uint8_t tmp[16];
+ MD5Context ctx_tmp = md5_;
+
+ MD5Final(tmp, &ctx_tmp);
+ for (int i = 0; i < 16; i++) {
+ res_[i * 2 + 0] = hex[tmp[i] >> 4];
+ res_[i * 2 + 1] = hex[tmp[i] & 0xf];
+ }
+ res_[32] = 0;
+
+ return res_;
+ }
+
+ protected:
+ char res_[33];
+ MD5Context md5_;
+};
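+
+// Illustrative usage (a sketch; the buffer names are hypothetical). Get() may
+// be called while more data is still being added, since it finalizes a copy
+// of the internal context rather than the context itself:
+//
+//   libvpx_test::MD5 md5;
+//   md5.Add(frame_data, frame_size);
+//   printf("MD5: %s\n", md5.Get());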
+
+} // namespace libvpx_test
+
+#endif // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_
--- /dev/null
+/*
+ * This code implements the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h' header
+ * definitions
+ * - Ian Jackson <ian@chiark.greenend.org.uk>.
+ * Still in the public domain.
+ */
+
+#include "tests/third_party/libvpx/md5_utils.h"
+
+#include <cstring>
+
+static void byteSwap(UWORD32 *buf, unsigned words) {
+ md5byte *p;
+
+ /* Only swap bytes for big endian machines */
+ int i = 1;
+
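+ /* On a little-endian machine the low-order byte of i is stored first, so
+ this reads 1 and returns early: the words are already in the little-endian
+ order MD5 expects. */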
+ if (*(char *)&i == 1) return;
+
+ p = (md5byte *)buf;
+
+ do {
+ *buf++ = (UWORD32)((unsigned)p[3] << 8 | p[2]) << 16 |
+ ((unsigned)p[1] << 8 | p[0]);
+ p += 4;
+ } while (--words);
+}
+
+/*
+ * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
+ * initialization constants.
+ */
+void MD5Init(struct MD5Context *ctx) {
+ ctx->buf[0] = 0x67452301;
+ ctx->buf[1] = 0xefcdab89;
+ ctx->buf[2] = 0x98badcfe;
+ ctx->buf[3] = 0x10325476;
+
+ ctx->bytes[0] = 0;
+ ctx->bytes[1] = 0;
+}
+
+/*
+ * Update context to reflect the concatenation of another buffer full
+ * of bytes.
+ */
+void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) {
+ UWORD32 t;
+
+ /* Update byte count */
+
+ t = ctx->bytes[0];
+
+ if ((ctx->bytes[0] = t + len) < t)
+ ctx->bytes[1]++; /* Carry from low to high */
+
+ t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */
+
+ if (t > len) {
+ memcpy((md5byte *)ctx->in + 64 - t, buf, len);
+ return;
+ }
+
+ /* First chunk is an odd size */
+ memcpy((md5byte *)ctx->in + 64 - t, buf, t);
+ byteSwap(ctx->in, 16);
+ MD5Transform(ctx->buf, ctx->in);
+ buf += t;
+ len -= t;
+
+ /* Process data in 64-byte chunks */
+ while (len >= 64) {
+ memcpy(ctx->in, buf, 64);
+ byteSwap(ctx->in, 16);
+ MD5Transform(ctx->buf, ctx->in);
+ buf += 64;
+ len -= 64;
+ }
+
+ /* Handle any remaining bytes of data. */
+ memcpy(ctx->in, buf, len);
+}
+
+/*
+ * Final wrapup - pad to 64-byte boundary with the bit pattern
+ * 1 0* (64-bit count of bits processed, MSB-first)
+ */
+void MD5Final(md5byte digest[16], struct MD5Context *ctx) {
+ int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */
+ md5byte *p = (md5byte *)ctx->in + count;
+
+ /* Set the first char of padding to 0x80. There is always room. */
+ *p++ = 0x80;
+
+ /* Bytes of padding needed to make 56 bytes (-8..55) */
+ count = 56 - 1 - count;
+
+ if (count < 0) { /* Padding forces an extra block */
+ memset(p, 0, count + 8);
+ byteSwap(ctx->in, 16);
+ MD5Transform(ctx->buf, ctx->in);
+ p = (md5byte *)ctx->in;
+ count = 56;
+ }
+
+ memset(p, 0, count);
+ byteSwap(ctx->in, 14);
+
+ /* Append length in bits and transform */
+ ctx->in[14] = ctx->bytes[0] << 3;
+ ctx->in[15] = ctx->bytes[1] << 3 | ctx->bytes[0] >> 29;
+ MD5Transform(ctx->buf, ctx->in);
+
+ byteSwap(ctx->buf, 4);
+ memcpy(digest, ctx->buf, 16);
+ memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
+}
+
+#ifndef ASM_MD5
+
+/* The four core functions - F1 is optimized somewhat */
+
+/* #define F1(x, y, z) (x & y | ~x & z) */
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+/* This is the central step in the MD5 algorithm. */
+#define MD5STEP(f, w, x, y, z, in, s) \
+ (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x)
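+/* The sub-expression (w << s | w >> (32 - s)) is a 32-bit rotate-left by s. */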
+
+#if defined(__clang__) && defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \
+ __attribute__((no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
+
+#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#endif
+
+/*
+ * The core of the MD5 algorithm, this alters an existing MD5 hash to
+ * reflect the addition of 16 longwords of new data. MD5Update blocks
+ * the data and converts bytes into longwords for this routine.
+ */
+VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4],
+ UWORD32 const in[16]) {
+ UWORD32 a, b, c, d;
+
+ a = buf[0];
+ b = buf[1];
+ c = buf[2];
+ d = buf[3];
+
+ MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+ MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+ MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+ MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+ MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+ MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+ MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+ MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+ MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+ MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+ MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+ MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+ MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+ MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+ MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+ MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+ MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+ MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+ MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+ MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+ MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+ MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+ MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+ MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+ MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+ MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+ MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+ MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+ MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+ MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+ MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+ MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+ MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+ MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+ MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+ MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+ MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+ MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+ MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+ MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+ MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+ MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+ MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+ MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+ MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+ MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+ MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+ MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+ MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
+ MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+ MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+ MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+ MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+ MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+ MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+ MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+ MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+ MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+ MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+ MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+ MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+ MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+ MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+ MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+ buf[0] += a;
+ buf[1] += b;
+ buf[2] += c;
+ buf[3] += d;
+}
+
+#undef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+
+#endif
--- /dev/null
+/*
+ * This is the header file for the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h'
+ * header definitions
+ * - Ian Jackson <ian@chiark.greenend.org.uk>.
+ * Still in the public domain.
+ */
+
+#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_
+#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_
+
+#define md5byte unsigned char
+#define UWORD32 unsigned int
+
+typedef struct MD5Context MD5Context;
+struct MD5Context {
+ UWORD32 buf[4];
+ UWORD32 bytes[2];
+ UWORD32 in[16];
+};
+
+void MD5Init(struct MD5Context *context);
+void MD5Update(struct MD5Context *context, md5byte const *buf, unsigned len);
+void MD5Final(unsigned char digest[16], struct MD5Context *context);
+void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]);
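+
+// Illustrative usage, following the description above (buf and len are
+// hypothetical):
+//
+//   MD5Context ctx;
+//   md5byte digest[16];
+//   MD5Init(&ctx);
+//   MD5Update(&ctx, buf, len);  // repeat for each chunk of input
+//   MD5Final(digest, &ctx);     // fills digest; also clears ctx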
+
+#endif // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tests/utils.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <string>
+
+#include "absl/strings/string_view.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/utils/constants.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+
+namespace libgav1 {
+namespace test_utils {
+namespace {
+
+int CloseFile(FILE* stream) { return fclose(stream); }
+
+bool ReadFileToString(absl::string_view file_name, std::string* const string) {
+ using FilePtr = std::unique_ptr<FILE, decltype(&CloseFile)>;
+ FilePtr file(fopen(std::string(file_name).c_str(), "rb"), &CloseFile);
+ if (file == nullptr) return false;
+
+ do {
+ int c = fgetc(file.get());
+ if (ferror(file.get()) != 0) return false;
+
+ if (c != EOF) {
+ string->append(1, static_cast<char>(c));
+ } else {
+ break;
+ }
+ } while (true);
+
+ return true;
+}
+
+} // namespace
+
+void ResetDspTable(const int bitdepth) {
+ dsp::Dsp* const dsp = dsp_internal::GetWritableDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ memset(dsp, 0, sizeof(dsp::Dsp));
+}
+
+std::string GetMd5Sum(const void* bytes, size_t size) {
+ libvpx_test::MD5 md5;
+ md5.Add(static_cast<const uint8_t*>(bytes), size);
+ return md5.Get();
+}
+
+template <typename Pixel>
+std::string GetMd5Sum(const Pixel* block, int width, int height, int stride) {
+ libvpx_test::MD5 md5;
+ const Pixel* row = block;
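+ // |stride| is measured in Pixel units; only |width| pixels of each row are
+ // hashed, so any padding between rows does not affect the digest.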
+ for (int i = 0; i < height; ++i) {
+ md5.Add(reinterpret_cast<const uint8_t*>(row), width * sizeof(Pixel));
+ row += stride;
+ }
+ return md5.Get();
+}
+
+template std::string GetMd5Sum(const int8_t* block, int width, int height,
+ int stride);
+template std::string GetMd5Sum(const int16_t* block, int width, int height,
+ int stride);
+
+std::string GetMd5Sum(const DecoderBuffer& buffer) {
+ libvpx_test::MD5 md5;
+ const size_t pixel_size =
+ (buffer.bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
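+ // Hash only the displayed region of each plane, row by row. |width| is in
+ // bytes: high bitdepth (10/12-bit) buffers store two bytes per pixel.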
+ for (int plane = kPlaneY; plane < buffer.NumPlanes(); ++plane) {
+ const int height = buffer.displayed_height[plane];
+ const size_t width = buffer.displayed_width[plane] * pixel_size;
+ const int stride = buffer.stride[plane];
+ const uint8_t* plane_buffer = buffer.plane[plane];
+ for (int row = 0; row < height; ++row) {
+ md5.Add(plane_buffer, width);
+ plane_buffer += stride;
+ }
+ }
+ return md5.Get();
+}
+
+void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const void* data, size_t size,
+ absl::Duration elapsed_time) {
+ const std::string digest = test_utils::GetMd5Sum(data, size);
+ printf("Mode %s[%31s]: %5d us MD5: %s\n", name, function_name,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+ digest.c_str());
+ EXPECT_STREQ(expected_digest, digest.c_str());
+}
+
+template <typename Pixel>
+void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const Pixel* block, int width,
+ int height, int stride, absl::Duration elapsed_time) {
+ const std::string digest =
+ test_utils::GetMd5Sum(block, width, height, stride);
+ printf("Mode %s[%31s]: %5d us MD5: %s\n", name, function_name,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+ digest.c_str());
+ EXPECT_STREQ(expected_digest, digest.c_str());
+}
+
+template void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const int8_t* block,
+ int width, int height, int stride,
+ absl::Duration elapsed_time);
+template void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const int16_t* block,
+ int width, int height, int stride,
+ absl::Duration elapsed_time);
+
+void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const char actual_digest[],
+ absl::Duration elapsed_time) {
+ printf("Mode %s[%31s]: %5d us MD5: %s\n", name, function_name,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+ actual_digest);
+ EXPECT_STREQ(expected_digest, actual_digest);
+}
+
+namespace {
+
+std::string GetSourceDir() {
+#if defined(__ANDROID__)
+ // Test files must be manually supplied. This path is frequently
+ // available on development devices.
+ return std::string("/data/local/tmp/tests/data");
+#elif defined(LIBGAV1_FLAGS_SRCDIR)
+ return std::string(LIBGAV1_FLAGS_SRCDIR) + "/tests/data";
+#else
+ return std::string(".");
+#endif // defined(__ANDROID__)
+}
+
+std::string GetTempDir() {
+ const char* path = getenv("TMPDIR");
+ if (path == nullptr || path[0] == '\0') path = getenv("TEMP");
+ if (path != nullptr && path[0] != '\0') return std::string(path);
+
+#if defined(__ANDROID__)
+ return std::string("/data/local/tmp");
+#elif defined(LIBGAV1_FLAGS_TMPDIR)
+ return std::string(LIBGAV1_FLAGS_TMPDIR);
+#else
+ return std::string(".");
+#endif // defined(__ANDROID__)
+}
+
+} // namespace
+
+std::string GetTestInputFilePath(absl::string_view file_name) {
+ const char* const path = getenv("LIBGAV1_TEST_DATA_PATH");
+ if (path != nullptr && path[0] != '\0') {
+ return std::string(path) + "/" + std::string(file_name);
+ }
+ return GetSourceDir() + "/" + std::string(file_name);
+}
+
+std::string GetTestOutputFilePath(absl::string_view file_name) {
+ return GetTempDir() + "/" + std::string(file_name);
+}
+
+void GetTestData(absl::string_view file_name, const bool is_output_file,
+ std::string* const output) {
+ ASSERT_NE(output, nullptr);
+ const std::string absolute_file_path = is_output_file
+ ? GetTestOutputFilePath(file_name)
+ : GetTestInputFilePath(file_name);
+
+ ASSERT_TRUE(ReadFileToString(absolute_file_path, output));
+}
+
+} // namespace test_utils
+} // namespace libgav1
--- /dev/null
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_TESTS_UTILS_H_
+#define LIBGAV1_TESTS_UTILS_H_
+
+#include <cstddef>
+#include <new>
+#include <string>
+
+#include "absl/base/config.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/time.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+#include <exception>
+#endif
+
+namespace libgav1 {
+namespace test_utils {
+
+enum { kAlternateDeterministicSeed = 0x9571 };
+static_assert(kAlternateDeterministicSeed !=
+ libvpx_test::ACMRandom::DeterministicSeed(),
+ "");
+
+// Similar to libgav1::MaxAlignedAllocable, but retains the throwing versions
+// of new to support googletest allocations.
+// Note when building the source as C++17 or greater, gcc 11.2.0 may issue a
+// warning of the form:
+// warning: 'void operator delete [](void*, std::align_val_t)' called on
+// pointer returned from a mismatched allocation function
+// note: returned from 'static void*
+// libgav1::test_utils::MaxAlignedAllocable::operator new [](size_t)'
+// This is a false positive as this function calls
+// libgav1::MaxAlignedAllocable::operator new[](size, std::nothrow) which in
+// turn calls
+// void* operator new[](std::size_t, std::align_val_t, const std::nothrow_t&).
+// This is due to unbalanced inlining of the functions, so we force them to be
+// inlined.
+// See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103993
+struct MaxAlignedAllocable {
+ // Class-specific allocation functions.
+ static LIBGAV1_ALWAYS_INLINE void* operator new(size_t size) {
+ void* const p =
+ libgav1::MaxAlignedAllocable::operator new(size, std::nothrow);
+#ifdef ABSL_HAVE_EXCEPTIONS
+ if (p == nullptr) throw std::bad_alloc();
+#endif
+ return p;
+ }
+ static LIBGAV1_ALWAYS_INLINE void* operator new[](size_t size) {
+ void* const p =
+ libgav1::MaxAlignedAllocable::operator new[](size, std::nothrow);
+#ifdef ABSL_HAVE_EXCEPTIONS
+ if (p == nullptr) throw std::bad_alloc();
+#endif
+ return p;
+ }
+
+ // Class-specific non-throwing allocation functions
+ static LIBGAV1_ALWAYS_INLINE void* operator new(
+ size_t size, const std::nothrow_t& tag) noexcept {
+ return libgav1::MaxAlignedAllocable::operator new(size, tag);
+ }
+ static LIBGAV1_ALWAYS_INLINE void* operator new[](
+ size_t size, const std::nothrow_t& tag) noexcept {
+ return libgav1::MaxAlignedAllocable::operator new[](size, tag);
+ }
+
+ // Class-specific deallocation functions.
+ static LIBGAV1_ALWAYS_INLINE void operator delete(void* ptr) noexcept {
+ libgav1::MaxAlignedAllocable::operator delete(ptr);
+ }
+ static LIBGAV1_ALWAYS_INLINE void operator delete[](void* ptr) noexcept {
+ libgav1::MaxAlignedAllocable::operator delete[](ptr);
+ }
+
+ // Only called if new (std::nothrow) is used and the constructor throws an
+ // exception.
+ static LIBGAV1_ALWAYS_INLINE void operator delete(
+ void* ptr, const std::nothrow_t& tag) noexcept {
+ libgav1::MaxAlignedAllocable::operator delete(ptr, tag);
+ }
+ // Only called if new[] (std::nothrow) is used and the constructor throws an
+ // exception.
+ static LIBGAV1_ALWAYS_INLINE void operator delete[](
+ void* ptr, const std::nothrow_t& tag) noexcept {
+ libgav1::MaxAlignedAllocable::operator delete[](ptr, tag);
+ }
+};
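+
+// Illustrative usage (a sketch with a hypothetical type): deriving from
+// MaxAlignedAllocable makes operator new/new[] return kMaxAlignment-aligned
+// storage while still reporting failure the way googletest expects.
+//
+//   struct AlignedBlock : public MaxAlignedAllocable {
+//     alignas(kMaxAlignment) uint8_t data[64 * 64];
+//   };
+//   std::unique_ptr<AlignedBlock> block(new (std::nothrow) AlignedBlock);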
+
+// Clears dsp table entries for |bitdepth|. This function is not thread safe.
+void ResetDspTable(int bitdepth);
+
+//------------------------------------------------------------------------------
+// Returns the MD5 sum of the given data, block, or frame buffer as a
+// human-readable hexadecimal string.
+
+std::string GetMd5Sum(const void* bytes, size_t size);
+template <typename Pixel>
+std::string GetMd5Sum(const Pixel* block, int width, int height, int stride);
+std::string GetMd5Sum(const DecoderBuffer& buffer);
+
+//------------------------------------------------------------------------------
+// Compares the md5 digest of |size| bytes of |data| with |expected_digest|.
+// Prints a log message with |name|, |function_name|, md5 digest and
+// |elapsed_time|. |name| and |function_name| are merely tags used for logging
+// and can be any meaningful string depending on the caller's context.
+
+void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const void* data, size_t size,
+ absl::Duration elapsed_time);
+
+//------------------------------------------------------------------------------
+// Compares the md5 digest of |block| with |expected_digest|. The width, height,
+// and stride of |block| are |width|, |height|, and |stride|, respectively.
+// Prints a log message with |name|, |function_name|, md5 digest and
+// |elapsed_time|. |name| and |function_name| are merely tags used for logging
+// and can be any meaningful string depending on the caller's context.
+
+template <typename Pixel>
+void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const Pixel* block, int width,
+ int height, int stride, absl::Duration elapsed_time);
+
+//------------------------------------------------------------------------------
+// Compares |actual_digest| with |expected_digest|. Prints a log message with
+// |name|, |function_name|, md5 digest and |elapsed_time|. |name| and
+// |function_name| are merely tags used for logging and can be any meaningful
+// string depending on the caller's context.
+
+void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const char actual_digest[],
+ absl::Duration elapsed_time);
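+
+// Illustrative call (a sketch; the tag strings, digest, and block are
+// hypothetical):
+//
+//   const absl::Time start = absl::Now();
+//   FunctionUnderTest(block);
+//   CheckMd5Digest("IntraPred", "DcTop", kExpectedDigest, block,
+//                  /*width=*/32, /*height=*/32, /*stride=*/32,
+//                  absl::Now() - start);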
+
+//------------------------------------------------------------------------------
+// Reads the test data from |file_name| as a string into |output|. The
+// |is_output_file| argument controls the expansion of |file_name| to its full
+// path. When |is_output_file| is true GetTestData() reads from
+// utils.cc::GetTempDir(), and when it is false the file is read from
+// utils.cc::GetSourceDir().
+void GetTestData(absl::string_view file_name, bool is_output_file,
+ std::string* output);
+
+//------------------------------------------------------------------------------
+// Returns the full path to |file_name| from libgav1/tests/data.
+std::string GetTestInputFilePath(absl::string_view file_name);
+
+//------------------------------------------------------------------------------
+// Returns the full path to |file_name| in a location where the file can be
+// opened for writing.
+std::string GetTestOutputFilePath(absl::string_view file_name);
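+
+// Illustrative usage (file names are hypothetical):
+//
+//   std::string bitstream;
+//   GetTestData("input.ivf", /*is_output_file=*/false, &bitstream);
+//   const std::string out_path = GetTestOutputFilePath("decoded.yuv");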
+
+} // namespace test_utils
+} // namespace libgav1
+
+#endif // LIBGAV1_TESTS_UTILS_H_
--- /dev/null
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tests/utils.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "absl/base/config.h"
+#include "gtest/gtest.h"
+#include "src/utils/memory.h"
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+#include <exception>
+#endif
+
+namespace libgav1 {
+namespace test_utils {
+namespace {
+
+constexpr size_t kMaxAllocableSize = 0x40000000;
+
+// Has a trivial default constructor that performs no action.
+struct SmallMaxAligned : public MaxAlignedAllocable {
+ alignas(kMaxAlignment) uint8_t x;
+};
+
+// Has a nontrivial default constructor that initializes the data member.
+struct SmallMaxAlignedNontrivialConstructor : public MaxAlignedAllocable {
+ alignas(kMaxAlignment) uint8_t x = 0;
+};
+
+// Has a trivial default constructor that performs no action.
+struct HugeMaxAligned : public MaxAlignedAllocable {
+ alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1];
+};
+
+// Has a nontrivial default constructor that initializes the data member.
+struct HugeMaxAlignedNontrivialConstructor : public MaxAlignedAllocable {
+ alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1] = {};
+};
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+struct MaxAlignedThrowingConstructor : public MaxAlignedAllocable {
+ MaxAlignedThrowingConstructor() { throw std::exception(); }
+
+ uint8_t x;
+};
+#endif
+
+TEST(TestUtilsTest, TestMaxAlignedAllocable) {
+ {
+ // MaxAlignedAllocable::operator new (std::nothrow) is called.
+ std::unique_ptr<SmallMaxAligned> small(new (std::nothrow) SmallMaxAligned);
+ EXPECT_NE(small, nullptr);
+ // Note this check doesn't guarantee conformance as a suitably aligned
+ // address may be returned from any allocator.
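+ // Masking with kMaxAlignment - 1 keeps only the low address bits; the
+ // result is zero iff the address is a multiple of kMaxAlignment (a power
+ // of two).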
+ EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1),
+ 0);
+ // MaxAlignedAllocable::operator delete is called.
+ }
+
+ {
+ // MaxAlignedAllocable::operator new is called.
+ std::unique_ptr<SmallMaxAligned> small(new SmallMaxAligned);
+ EXPECT_NE(small, nullptr);
+ // Note this check doesn't guarantee conformance as a suitably aligned
+ // address may be returned from any allocator.
+ EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1),
+ 0);
+ // MaxAlignedAllocable::operator delete is called.
+ }
+
+ {
+ // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+ std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls(
+ new (std::nothrow) SmallMaxAligned[10]);
+ EXPECT_NE(small_array_of_smalls, nullptr);
+ EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) &
+ (kMaxAlignment - 1),
+ 0);
+ // MaxAlignedAllocable::operator delete[] is called.
+ }
+
+ {
+ // MaxAlignedAllocable::operator new[] is called.
+ std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls(
+ new SmallMaxAligned[10]);
+ EXPECT_NE(small_array_of_smalls, nullptr);
+ EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) &
+ (kMaxAlignment - 1),
+ 0);
+ // MaxAlignedAllocable::operator delete[] is called.
+ }
+
+ {
+ // MaxAlignedAllocable::operator new (std::nothrow) is called.
+ std::unique_ptr<HugeMaxAligned> huge(new (std::nothrow) HugeMaxAligned);
+ EXPECT_EQ(huge, nullptr);
+ }
+
+ {
+ // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+ std::unique_ptr<SmallMaxAligned[]> huge_array_of_smalls(
+ new (std::nothrow)
+ SmallMaxAligned[kMaxAllocableSize / sizeof(SmallMaxAligned) + 1]);
+ EXPECT_EQ(huge_array_of_smalls, nullptr);
+ }
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+ try {
+ // MaxAlignedAllocable::operator new (std::nothrow) is called.
+ // The constructor throws an exception.
+ // MaxAlignedAllocable::operator delete (std::nothrow) is called.
+ auto* always = new (std::nothrow) MaxAlignedThrowingConstructor;
+ static_cast<void>(always);
+ } catch (...) {
+ }
+
+ try {
+ // MaxAlignedAllocable::operator new is called.
+ // The constructor throws an exception.
+ // MaxAlignedAllocable::operator delete is called.
+ auto* always = new MaxAlignedThrowingConstructor;
+ static_cast<void>(always);
+ } catch (...) {
+ }
+
+ try {
+ // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+ // The constructor throws an exception.
+ // MaxAlignedAllocable::operator delete[] (std::nothrow) is called.
+ auto* always = new (std::nothrow) MaxAlignedThrowingConstructor[2];
+ static_cast<void>(always);
+ } catch (...) {
+ }
+
+ try {
+ // MaxAlignedAllocable::operator new[] is called.
+ // The constructor throws an exception.
+ // MaxAlignedAllocable::operator delete[] is called.
+ auto* always = new MaxAlignedThrowingConstructor[2];
+ static_cast<void>(always);
+ } catch (...) {
+ }
+
+ // Note these calls are only safe with exceptions enabled because, if the
+ // throwing operator new returns, the object is expected to be valid. Without
+ // exceptions a failed allocation returns nullptr, and an attempt to invoke
+ // the object's constructor on that nullptr may be made, which is undefined
+ // behavior.
+ try {
+ // MaxAlignedAllocable::operator new is called.
+ std::unique_ptr<HugeMaxAlignedNontrivialConstructor> huge(
+ new HugeMaxAlignedNontrivialConstructor);
+ ADD_FAILURE() << "huge allocation should fail.";
+ } catch (...) {
+ SUCCEED();
+ }
+
+ try {
+ // MaxAlignedAllocable::operator new[] is called.
+ std::unique_ptr<SmallMaxAlignedNontrivialConstructor[]>
+ huge_array_of_smalls(
+ new SmallMaxAlignedNontrivialConstructor
+ [kMaxAllocableSize /
+ sizeof(SmallMaxAlignedNontrivialConstructor) +
+ 1]);
+ ADD_FAILURE() << "huge_array_of_smalls allocation should fail.";
+ } catch (...) {
+ SUCCEED();
+ }
+#endif // ABSL_HAVE_EXCEPTIONS
+}
+
+} // namespace
+} // namespace test_utils
+} // namespace libgav1